In [1]:
import pandas as pd

df = pd.read_csv('use_this_data/llm.csv')

df.head()

Unnamed: 0,category,text
0,arts,rob delaney vir das galen hopper samson kayo g...
1,arts,andris nelsons conducts a joint concert of the...
2,arts,warner music group has brought on sherry tan t...
3,arts,adele will explore what she s been going throu...
4,arts,you are using an older browser version. please...


In [6]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
df['label_encoded'] = encoder.fit_transform(df['category'])
class_names = encoder.classes_


df.head()

Unnamed: 0,category,text,label_encoded
0,arts,rob delaney vir das galen hopper samson kayo g...,0
1,arts,andris nelsons conducts a joint concert of the...,0
2,arts,warner music group has brought on sherry tan t...,0
3,arts,adele will explore what she s been going throu...,0
4,arts,you are using an older browser version. please...,0


In [8]:
from transformers import AutoTokenizer
import tensorflow as tf

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

encoded = tokenizer(
    df["text"].tolist(),
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors="tf"
)

input_ids = encoded["input_ids"]
attention_mask = encoded["attention_mask"]
labels_tf = tf.constant(df["label_encoded"].tolist(), dtype=tf.int32)


In [9]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from transformers import TFAutoModelForSequenceClassification
import numpy as np

model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=len(class_names)  # number of distinct classes
)

optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=["accuracy"]
)

#######################
# 5) Train the model on GPU (if available)
#######################
print("GPUs available:", tf.config.list_physical_devices('GPU'))

model.fit(
    x={
        "input_ids": input_ids,
        "attention_mask": attention_mask
    },
    y=labels_tf,
    epochs=2,
    batch_size=2
)

#######################
# 6) Evaluate / get predictions
#######################
pred_logits = model.predict({
    "input_ids": input_ids,
    "attention_mask": attention_mask
}).logits

pred_label_ids = np.argmax(pred_logits, axis=1)  # integer predictions

# True labels
y_true = df["label_id"].to_numpy()

#######################
# 7) Calculate accuracy and per-class metrics
#######################
overall_accuracy = accuracy_score(y_true, pred_label_ids)
print(f"\n=== Overall Accuracy: {overall_accuracy:.4f} ===")

report = classification_report(
    y_true,
    pred_label_ids,
    target_names=class_names
)
print("\n=== Classification Report ===")
print(report)

# (Optional) Confusion Matrix
cm = confusion_matrix(y_true, pred_label_ids)
print("=== Confusion Matrix ===\n", cm)

# Convert confusion matrix to DataFrame for readability
cm_df = pd.DataFrame(cm, index=class_names, columns=class_names)
print("\nConfusion Matrix (with labels):")
print(cm_df)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPUs available: []
Epoch 1/2


KeyboardInterrupt: 

In [65]:
from sklearn.model_selection import KFold
import keras_nlp
import numpy as np
import tensorflow as tf

from keras_hub.src.models.bert import bert_backbone
import tensorflow as tf
import tensorflow_hub as hub
import keras_nlp

# Load the BERT preprocessing and encoder models from TensorFlow Hub
bert_preprocess = keras_nlp.models.BertPreprocessor.from_preset("bert_small_en_uncased",trainable=True)
bert_encoder = keras_nlp.models.BertBackbone.from_preset("bert_small_en_uncased", load_weights=True)


# Define the input layer
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

# Preprocess the text input
preprocessed_text = bert_preprocess(text_input)

# Feed the preprocessed text to the BERT encoder
outputs = bert_encoder(preprocessed_text)
pooled_output = outputs["pooled_output"]

# Dropout + multi-class Dense
x = tf.keras.layers.Dropout(0.3)(pooled_output)
# 18 classes => softmax
x = tf.keras.layers.Dense(18, activation="softmax", kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)


# Add a dense layer with sigmoid activation for binary classification
# l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Define the final model
model = tf.keras.Model(inputs=text_input, outputs=x)

# Print the model summary
model.summary()

# k = 5
# kf = KFold(n_splits=k, shuffle=True, random_state=42)
# 
# # Get the index of the "other" category in encoded labels
# other_class_index = df[df['category'] == "other"]['label_encoded'].unique()[0]
# 
# # Store results
# fold_accuracies = []
# 
# for fold, (train_idx, val_idx) in enumerate(kf.split(X, y)):
#     print(f"\nFold {fold+1}/{k}")
# 
#     # Split data
#     X_train, X_val = X[train_idx], X[val_idx]
#     y_train, y_val = y[train_idx], y[val_idx]
# 
#     # Reinitialize a new model for each fold (important to prevent carry-over effects)
#     bert_preprocess = keras_nlp.models.BertPreprocessor.from_preset("bert_small_en_uncased", trainable=True)
#     bert_encoder = keras_nlp.models.BertBackbone.from_preset("bert_small_en_uncased", load_weights=True)
# 
#     text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
#     preprocessed_text = bert_preprocess(text_input)
#     outputs = bert_encoder(preprocessed_text)
#     pooled_output = outputs["pooled_output"]
# 
#     x = tf.keras.layers.Dropout(0.3)(pooled_output)
#     x = tf.keras.layers.Dense(18, activation="softmax", kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)
# 
#     # Define the final model
#     model = tf.keras.Model(inputs=text_input, outputs=x)
# 
#     model.compile(
#         optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
#         loss='sparse_categorical_crossentropy',
#         metrics=['accuracy']
#     )
# 
#     # Train model on fold data
#     history = model.fit(
#         X_train, y_train,
#         validation_data=(X_val, y_val),
#         epochs=8, 
#         batch_size=16,
#         verbose=1
#     )
# 
#     # Evaluate model on validation set
#     val_loss, val_accuracy = model.evaluate(X_val, y_val)
#     print(f"Fold {fold+1} Accuracy: {val_accuracy:.4f}")
#     
#     # Store accuracy for averaging later
#     fold_accuracies.append(val_accuracy)
# 
# 
# # Compute average accuracy across folds
# mean_accuracy = np.mean(fold_accuracies)
# std_accuracy = np.std(fold_accuracies)
# print(f"\nMean Accuracy: {mean_accuracy:.4f} ± {std_accuracy:.4f}")
# 
# 
# def predict_with_threshold(model, texts, threshold=0.5):
#     """
#     Predicts labels for given texts. If the model's highest confidence score 
#     is below `threshold`, assigns the "other" category.
#     """
#     probs = model.predict(texts)
#     max_probs = np.max(probs, axis=1)  # Get highest probability per sample
#     predictions = np.argmax(probs, axis=1)  # Get class with highest probability
# 
#     # Assign "Other" class if max probability is below threshold
#     for i, prob in enumerate(max_probs):
#         if prob < threshold:
#             predictions[i] = other_class_index  # Use the correct label for "Other"
# 
#     return predictions

In [68]:
import numpy as np

# METRICS = [
#       tf.keras.metrics.BinaryAccuracy(name='accuracy'),
#       tf.keras.metrics.Precision(name='precision'),
#       tf.keras.metrics.Recall(name='recall')
# ]
# 
# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=METRICS)
# 
# model.fit(X_train, y_train, epochs=10)
# 
# model.evaluate(X_test, y_test)


# X_train = X_train.astype(str).to_numpy()  # shape = (num_samples,)
# y_train = y_train.astype(np.int32).to_numpy()  # shape = (num_samples,)
# 
# X_test = X_test.astype(str).to_numpy()
# y_test = y_test.astype(np.int32).to_numpy()
# 
# # Now train
# model.compile(
#     optimizer='adam',
#     loss='binary_crossentropy',
#     metrics=[
#         tf.keras.metrics.BinaryAccuracy(name='accuracy'),
#         tf.keras.metrics.Precision(name='precision'),
#         tf.keras.metrics.Recall(name='recall')
#     ]
# )
# 
# model.fit(X_train, y_train, epochs=10)
# model.evaluate(X_test, y_test)
# 
# model.fit(
#     X_train,
#     y_train,
#     validation_split=0.2,     # or use a separate X_val, y_val
#     epochs=10,
#     batch_size=32,
#     callbacks=[
#         tf.keras.callbacks.EarlyStopping(
#             monitor="val_loss", 
#             patience=3, 
#             restore_best_weights=True
#         )
#     ]
# )
# 
# # 5) Evaluate
# model.evaluate(X_test, y_test)
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',  # matches integer labels
    metrics=['accuracy']
)

model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=32
)

model.evaluate(X_test, y_test)

Epoch 1/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1953s[0m 18s/step - accuracy: 0.8459 - loss: 0.9116 - val_accuracy: 0.5313 - val_loss: 2.0107
Epoch 2/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m682s[0m 6s/step - accuracy: 0.8772 - loss: 0.8060 - val_accuracy: 0.5277 - val_loss: 2.0733
Epoch 3/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m774s[0m 7s/step - accuracy: 0.9008 - loss: 0.7272 - val_accuracy: 0.5277 - val_loss: 2.1262
Epoch 4/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m711s[0m 7s/step - accuracy: 0.9221 - loss: 0.6622 - val_accuracy: 0.5360 - val_loss: 2.1604
Epoch 5/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m745s[0m 7s/step - accuracy: 0.9387 - loss: 0.5862 - val_accuracy: 0.5372 - val_loss: 2.1539
Epoch 6/10
[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m727s[0m 7s/step - accuracy: 0.9744 - loss: 0.5040 - val_accuracy: 0.5407 - val_loss: 2.2595
Epoch 7/10
[1m106/1

[2.4405837059020996, 0.5533522367477417]