<a href="https://colab.research.google.com/github/joycerlz/bigfive-text-classification/blob/main/DTB_multiclass_ocean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## DistilBERT for Multiclass Text Classification
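Fine-tunes DistilBERT to classify text into the five OCEAN (Big Five) personality traits, using the `ocean_longer.csv` dataset.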

## Import libraries and dataset

In [None]:
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification

import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.regularizers import l2

from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw dataset; later cells read its 'text' and 'labels' columns.
df = pd.read_csv('ocean_longer.csv')
# Preview the first rows as the cell output.
df.head()

In [None]:
def drop_long_texts(df, max_words=512):
    """Return a copy of ``df`` without the rows whose text is too long.

    Length is measured in whitespace-separated words of the 'text' column.
    The input frame is left untouched: no helper column is added to it and
    the result is an independent copy, so it can be modified safely.

    Args:
        df: DataFrame with a string 'text' column.
        max_words: Largest word count (inclusive) a row may have to be kept.

    Returns:
        A new DataFrame with the same columns as ``df`` containing only the
        rows whose text has at most ``max_words`` words.
    """
    # Keep the counts in a local Series instead of a temporary column so the
    # caller's frame is never mutated.
    word_counts = df['text'].apply(lambda text: len(text.split()))
    return df.loc[word_counts <= max_words].copy()

def balance_dataset(df, max_per_label=1000, random_state=None):
    """Cap every class at ``max_per_label`` randomly chosen rows.

    Classes with fewer rows than the cap are kept in full, so the result is
    only balanced up to the size of the smallest class.

    Args:
        df: DataFrame with a 'labels' column identifying the class of a row.
        max_per_label: Maximum number of rows to keep for each label.
        random_state: Seed (or RandomState/Generator) forwarded to
            ``DataFrame.sample``; pass a value to make the selection
            reproducible. ``None`` keeps the selection random per call.

    Returns:
        A DataFrame with the same columns and original index labels as
        ``df``, ordered by label.
    """
    # Shuffle once, then take the first rows of each label: a random subset
    # per class without GroupBy.apply, whose handling of the grouping column
    # differs between pandas versions.
    shuffled = df.sample(frac=1, random_state=random_state)
    capped = shuffled.groupby('labels', sort=False).head(max_per_label)
    # Stable sort keeps the shuffled order inside each label group.
    return capped.sort_values('labels', kind='stable')

In [None]:
# Remove over-long texts first, then cap the number of rows per label.
df = balance_dataset(drop_long_texts(df))

In [None]:
# Rows per label after filtering and balancing.
df['labels'].value_counts()

In [None]:
# (rows, columns) of the frame after filtering and balancing.
df.shape

# Split into train, validation, and test sets; tokenize

In [None]:
# Pull the text and label columns out as plain Python lists.
data_texts, data_labels = (df[column].tolist() for column in ('text', 'labels'))

In [None]:
# First split: 80% train, 20% held out.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    data_texts, data_labels, test_size=0.2, random_state=42
)
# Second split: the held-out part is halved into validation and test.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts, holdout_labels, test_size=0.5, random_state=42
)

In [None]:
# Report how many examples ended up in each split.
split_sizes = {
    "Train set size:": len(train_texts),
    "Validation set size:": len(val_texts),
    "Test set size:": len(test_texts),
}
for caption, n_examples in split_sizes.items():
    print(caption, n_examples)

In [None]:
# Tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# Tokenizer settings shared by all three splits: truncate and pad, with a
# 512-token ceiling.
tokenize_kwargs = dict(truncation=True, padding=True, max_length=512)

train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)
# Only the test split is encoded directly to TensorFlow tensors.
test_encodings = tokenizer(test_texts, return_tensors="tf", **tokenize_kwargs)

# Pair each encoding dict with its labels; train/validation are served in
# batches of 32.
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels)).batch(32)
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels)).batch(32)
# The test dataset is left unbatched (one example per element).
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), test_labels))
In [None]:
# Show the element spec (shapes and dtypes) of the batched training dataset.
print(train_dataset)

<_BatchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int32, name=None)}, TensorSpec(shape=(None,), dtype=tf.int32, name=None))>


# Fine-tune the model

In [None]:
# Load pretrained DistilBERT configured for 5 labels.
# NOTE(review): the model assembled in the next cell only calls
# `base_model.distilbert`, so this class's own classification head looks unused — confirm.
base_model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

In [None]:
# Symbolic inputs, named after the keys of the tokenized dataset dicts.
input_layer = Input(shape=(None,), dtype=tf.int32, name='input_ids')
attention_mask_layer = Input(shape=(None,), dtype=tf.int32, name='attention_mask')

# Run the DistilBERT encoder and keep the vector at sequence position 0 of its
# first output (presumably the [CLS] token representation — confirm).
encoder_outputs = base_model.distilbert([input_layer, attention_mask_layer])
distilbert_output = encoder_outputs[0][:, 0, :]

# Custom head: dropout -> 128-unit ReLU layer with L2 penalty -> dropout ->
# 5-way softmax.
pooled = Dropout(0.1)(distilbert_output)
hidden = Dense(128, activation='relu', kernel_regularizer=l2(0.01))(pooled)
hidden = Dropout(0.1)(hidden)
output = Dense(5, activation='softmax')(hidden)

# Wrap encoder and custom head into a single Keras model.
com_model = Model(inputs=[input_layer, attention_mask_layer], outputs=output)
com_model.summary()

In [None]:
# Stop when validation loss has not improved for 4 epochs and roll back to the
# best weights seen.
early_stopping = EarlyStopping(monitor="val_loss", patience=4, mode="min", restore_best_weights=True)

# Configure the learning rate on the optimizer itself instead of compiling with
# the string 'Adam' and mutating its learning-rate variable afterwards.
com_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
    loss='sparse_categorical_crossentropy',  # integer labels, softmax outputs
    metrics=['accuracy'],
)

<tf.Variable 'UnreadVariable' shape=() dtype=float32, numpy=3e-05>

In [None]:
# Train for at most 50 epochs; the early-stopping callback may end the run sooner.
com_model_history = com_model.fit(train_dataset, validation_data=val_dataset, epochs=50, callbacks=[early_stopping])

## Evaluate

In [None]:
def plotAccuracy(model_history):
  """Draw training and validation accuracy per epoch on the current axes.

  Args:
    model_history: Object whose ``history`` mapping holds the 'accuracy' and
      'val_accuracy' series (e.g. the return value of ``Model.fit``).
  """
  metrics = model_history.history
  ax = plt.gca()
  for series_name in ('accuracy', 'val_accuracy'):
    ax.plot(metrics[series_name], label=series_name)
  # Values are fractions of 1, shown as percentages on the y axis.
  ax.yaxis.set_major_formatter(PercentFormatter(1))
  ax.set_xlabel('Epoch')
  ax.set_ylabel('Accuracy')
  ax.set_title('Training Accuracy')
  ax.legend()

# Accuracy curves for the fine-tuning run.
plotAccuracy(com_model_history)

In [None]:
def plotLoss(model_history):
  """Draw training and validation loss per epoch on the current axes.

  Args:
    model_history: Object whose ``history`` mapping holds the 'loss' and
      'val_loss' series (e.g. the return value of ``Model.fit``).
  """
  metrics = model_history.history
  ax = plt.gca()
  for series_name in ('loss', 'val_loss'):
    ax.plot(metrics[series_name], label=series_name)
  ax.set_xlabel('Epoch')
  ax.set_ylabel('Cross-Entropy Loss')
  ax.set_title('Training Loss')
  ax.legend()

# Loss curves for the fine-tuning run.
plotLoss(com_model_history)

In [None]:
# Score the whole test split with one batched predict() call instead of one
# call per example. The dataset yields (features, label) pairs; predict() only
# consumes the features.
test_probabilities = com_model.predict(test_dataset.batch(32))

# Most probable class per example, as a plain list of Python ints.
y_pred = np.argmax(test_probabilities, axis=-1).tolist()

y_true = np.array(test_labels)
print(classification_report(y_true, y_pred))

In [None]:
def plot_confusion_matrix(y_pred, y_true):
  """Show a row-normalized confusion matrix for the five classes.

  Args:
    y_pred: Predicted class indices.
    y_true: Ground-truth class indices.

  Note the argument order: predictions first, ground truth second.
  """
  # Display names by class index.
  # NOTE(review): this index-to-trait order is assumed to match the dataset's
  # label encoding — confirm.
  labels = ["agreeable","extraversion","openness","conscientiousness","neuroticism"]
  class_ids = range(len(labels))

  # normalize="true" divides each row by the number of true samples in it.
  cm = confusion_matrix(
      np.array(y_true),
      np.array(y_pred),
      labels=class_ids,
      normalize="true",
  )

  _, ax = plt.subplots(figsize=(5, 5))
  display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
  display.plot(cmap="Blues", values_format=".2f", ax=ax)
  plt.title("Confusion Matrix")
  # Tilt the x tick labels.
  ax.set_xticklabels(labels, rotation=25, ha="right")
  plt.show()

# Confusion matrix on the test split (predictions first, then true labels).
plot_confusion_matrix(y_pred, test_labels)

# Save the model

In [None]:
# Persist the fine-tuned Keras model under 'distilbert_ocean' ...
com_model.save('distilbert_ocean')
# ... and the tokenizer files in a 'tokenizer' subdirectory of the same folder.
tokenizer.save_pretrained('distilbert_ocean/tokenizer')

# Load the saved fine-tuned model

In [None]:
# Load the model and tokenizer
# Reload the model and tokenizer from the paths written by the save cell.
model_v = tf.keras.models.load_model('distilbert_ocean')
tokenizer_v = DistilBertTokenizer.from_pretrained('distilbert_ocean/tokenizer')

In [None]:
# Sample test: classify a single sentence with the reloaded model.
# Alternative input: "I prefer not going out to crowded social events."
example = "I find it easy to trust others and believe in the inherent goodness of people."

example_en = tokenizer_v(example, truncation=True, padding=True, return_tensors='tf')

input_ids = example_en['input_ids']
attention_mask = example_en['attention_mask']

# Perform inference; the result has one row per input sentence.
output = model_v.predict({'input_ids': input_ids, 'attention_mask': attention_mask})

# The model's last layer is Dense(5, activation='softmax'), so each output row
# already holds class probabilities. Applying softmax a second time would
# flatten them toward uniform, so the row is used as-is.
predicted_probabilities = output[0]
prob = predicted_probabilities.tolist()
print(prob)
print("Predicted Probabilities:", predicted_probabilities)

predicted_class = np.argmax(predicted_probabilities)
print("Predicted Class:", predicted_class)

## Test with example conversation

In [None]:
# Example utterances used as a small batch for the inference cell below.
conversation = ["I’m tired all the time, no matter how much sleep I get.",
                "Sometimes I just want to disappear and not exist anymore.",
                "It’s hard to concentrate or focus on anything.",
                "I feel like I’m a burden to everyone around me.",
                "Everything feels pointless and meaningless.",
                "It’s like there’s a constant weight on my chest that won’t go away."]

In [None]:
# Tokenize all utterances as one padded batch.
train_en = tokenizer_v(conversation, truncation=True, padding=True, return_tensors='tf')

# The saved model has a single output tensor with one row of class
# probabilities per sentence. Indexing it with [0] would keep only the first
# sentence, so the full tensor is used. Inputs are passed as an explicit dict
# keyed by the model's input names.
output = model_v({
    'input_ids': train_en['input_ids'],
    'attention_mask': train_en['attention_mask'],
})

# Most probable class for every sentence.
predictions = np.argmax(output, axis=-1)
print(predictions)

[2 0 2 0 2 0]


In [None]:
# Check the container type returned by np.argmax.
print(type(predictions))

<class 'numpy.ndarray'>
