In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from tensorflow import keras

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

import tf_keras
from tf_keras.utils import to_categorical
from tf_keras.layers import Dense, Dropout, Embedding, BatchNormalization, GlobalAveragePooling1D, GRU
from tf_keras.callbacks import ReduceLROnPlateau
from tf_keras.optimizers import Adam
from tf_keras.losses import SparseCategoricalCrossentropy

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Mount Google Drive so the dataset and the saved models stored under MyDrive are reachable.
from google.colab import drive
drive.mount('/content/drive')


# Directory containing the dataset split files; joined with the split file names when loading.
DATA_PATH = '/content/drive/MyDrive/epita/natural-language-processing/NLP_exam_emotions_dataset'
# Directory containing the saved model files; joined with the model file names when loading.
SAVE_PATH = '/content/drive/MyDrive/epita/natural-language-processing/models'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
def load_data(file_path):
    """Read a `text;label` file into two parallel lists.

    Each line is stripped of surrounding whitespace and split on its LAST
    semicolon, so a sentence that itself contains `;` is kept intact instead
    of being silently discarded. Lines without any semicolon (including blank
    lines) are skipped.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A `(texts, labels)` tuple of equally long lists of strings, in file order.
    """
    texts = []
    labels = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # maxsplit=1 from the right: everything before the final ';' is the text.
            parts = line.strip().rsplit(';', 1)
            if len(parts) == 2:
                text, label = parts
                texts.append(text)
                labels.append(label)
    return texts, labels

## Data Preprocessing

In [4]:
# File names of the three dataset splits located inside DATA_PATH.
TRAIN_FILE = 'train.txt'
TEST_FILE = 'test.txt'
VAL_FILE = 'validation.txt'

# Read every split into a pair of parallel lists: raw texts and their string labels.
train_texts, train_labels = load_data(f'{DATA_PATH}/{TRAIN_FILE}')
test_texts, test_labels = load_data(f'{DATA_PATH}/{TEST_FILE}')
val_texts, val_labels = load_data(f'{DATA_PATH}/{VAL_FILE}')

In [5]:
# Tokenize text
# Build the integer-sequence input consumed by the FCNN and GRU models.
# Length passed as `maxlen` when padding the validation sequences.
# NOTE(review): presumably has to match the value used when the models were trained - confirm.
EMBEDED_LEN = 256
# Vocabulary size cap handed to the Keras tokenizer as `num_words`.
WORD_NUM = 10000
# NOTE: the name `tokenizer` is rebound to the RoBERTa tokenizer in the RoBERTa preparation cell,
# so this Keras tokenizer is only reachable under this name until that cell runs.
tokenizer = Tokenizer(num_words=WORD_NUM, oov_token="<OOV>")
# The vocabulary is fitted on the training texts only; validation texts are merely transformed.
tokenizer.fit_on_texts(train_texts)
val_tok = tokenizer.texts_to_sequences(val_texts)
val_padded = pad_sequences(val_tok, maxlen=EMBEDED_LEN)

In [6]:
# Encoder mapping the string emotion labels to integer class ids.
label_encoder = LabelEncoder()

# Fit on the union of the train and test labels so every class present in either split is known.
all_labels = train_labels + test_labels
label_encoder.fit(all_labels)

# Number of distinct classes known to the fitted encoder.
num_classes = len(label_encoder.classes_)

# Only the validation labels are encoded here: this notebook evaluates on the validation split.
val_labels_encoded = label_encoder.transform(val_labels)

# One-hot version of the validation labels (used for the FCNN and GRU evaluation).
val_labels_categorical = to_categorical(val_labels_encoded, num_classes=num_classes)

### Preparation for RoBERTa

In [7]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Hugging Face checkpoint name; reused later when the RoBERTa classifier is instantiated.
model_name = "roberta-base"

# Load the RoBERTa tokenizer only (the model itself is loaded in the "Load Model" section).
# NOTE: this rebinds `tokenizer`, which previously referred to the Keras tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Encode the validation texts as TensorFlow tensors, truncating and padding them.
val_encodings = tokenizer(
    val_texts,
    truncation=True,
    padding=True,
    return_tensors="tf",
)

# Pair the encoded inputs with the integer labels and serve them in batches of 32.
val_dataset = tf.data.Dataset.from_tensor_slices(
    (dict(val_encodings), val_labels_encoded)
).batch(32)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


## Load Model

In [8]:
# File names of the saved models; each one is joined with SAVE_PATH when the models are loaded.
# Fully-connected network, loaded as a complete tf_keras model.
fcnn_model_path = 'sentiment_model_fcnn.keras'
# GRU network, loaded as a complete tf_keras model.
gru_model_path = 'sentiment_model_gru.keras'
# RoBERTa classifier: this file is passed to `load_weights`, not to `load_model`.
roberta_model_path = 'sentiment_model_roberta.keras'

In [9]:
# Load the three trained models that are evaluated below.
try:
    # The FCNN and GRU models were saved as complete tf_keras models.
    fcnn_model = tf_keras.models.load_model('/'.join([SAVE_PATH, fcnn_model_path]))
    gru_model = tf_keras.models.load_model('/'.join([SAVE_PATH, gru_model_path]))
    # RoBERTa: rebuild the architecture from the base checkpoint, then restore the saved weights.
    roberta_model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
    roberta_model.load_weights('/'.join([SAVE_PATH, roberta_model_path]))  # Load the saved weights

except Exception as e:
    print(f"Error loading models: {e}")
    # Re-raise instead of continuing: without the models the evaluation cells would only
    # fail later with a NameError that hides the real cause.
    raise

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifie

## Model Evaluation

In [10]:
def metrics_eval(model, is_tf_dataset, X_test, y_test=None):
    """Compute accuracy and macro-averaged precision, recall and F1 for a model.

    Args:
        model: Object exposing `predict(X_test)`. When `is_tf_dataset` is true the
            prediction result must expose a `.logits` attribute; otherwise it must be
            a 2-D array of per-class scores.
        is_tf_dataset: True when `X_test` is a batched dataset whose items are
            `(features, label)` pairs; the true labels are then read from the dataset.
        X_test: Model input (a dataset or an array, depending on `is_tf_dataset`).
        y_test: One-hot encoded true labels; required when `is_tf_dataset` is false
            and ignored otherwise.

    Returns:
        Tuple `(accuracy, precision, recall, f1)`.

    Raises:
        ValueError: If `is_tf_dataset` is false and `y_test` is not provided.
    """
    if is_tf_dataset:
        y_pred_output = model.predict(X_test)
        # Access the logits from the TFSequenceClassifierOutput object
        y_pred_logits = np.asarray(y_pred_output.logits)
        # Softmax is strictly increasing within a row, so the arg-max of the logits is the
        # arg-max of the probabilities. Skipping the explicit exp() avoids overflow to
        # inf/NaN on large logits, which would make argmax pick the wrong class.
        y_pred = np.argmax(y_pred_logits, axis=1)

        # True labels are the second element of every (features, label) item.
        y_true = np.array([label for _, label in X_test.unbatch().as_numpy_iterator()])

    else:
        if y_test is None:
            raise ValueError("y_test is required when is_tf_dataset is False")
        y_pred_probs = model.predict(X_test)
        y_pred = np.argmax(y_pred_probs, axis=1)
        # Convert one-hot encoded y_test to single-label format
        y_true = np.argmax(y_test, axis=1)

    # Calculate metrics; 'macro' is the unweighted mean of the per-class scores.
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    return accuracy, precision, recall, f1

In [11]:
# FCNN and GRU: padded integer sequences as input, one-hot labels as ground truth.
fcnn_acc, fcnn_prec, fcnn_rec, fcnn_f1 = metrics_eval(
    fcnn_model, is_tf_dataset=False, X_test=val_padded, y_test=val_labels_categorical
)
gru_acc, gru_prec, gru_rec, gru_f1 = metrics_eval(
    gru_model, is_tf_dataset=False, X_test=val_padded, y_test=val_labels_categorical
)
# RoBERTa: the batched dataset already carries the labels, so no y_test is passed.
roberta_acc, roberta_prec, roberta_rec, roberta_f1 = metrics_eval(
    roberta_model, is_tf_dataset=True, X_test=val_dataset
)



In [12]:
# Report the FCNN validation metrics, one per line.
print(
    f'FCNN Accuracy: {fcnn_acc}',
    f'FCNN Precision: {fcnn_prec}',
    f'FCNN Recall: {fcnn_rec}',
    f'FCNN F1 Score: {fcnn_f1}',
    sep='\n',
)

FCNN Accuracy: 0.735
FCNN Precision: 0.8018535795490958
FCNN Recall: 0.7343799931679248
FCNN F1 Score: 0.7288806869998021


In [13]:
# Report the GRU validation metrics, one per line.
print(
    f'GRU Accuracy: {gru_acc}',
    f'GRU Precision: {gru_prec}',
    f'GRU Recall: {gru_rec}',
    f'GRU F1 Score: {gru_f1}',
    sep='\n',
)

GRU Accuracy: 0.927
GRU Precision: 0.9008393076257072
GRU Recall: 0.8940518644762308
GRU F1 Score: 0.8971168995686805


In [14]:
# Report the RoBERTa validation metrics, one per line.
print(
    f'RoBERTa Accuracy: {roberta_acc}',
    f'RoBERTa Precision: {roberta_prec}',
    f'RoBERTa Recall: {roberta_rec}',
    f'RoBERTa F1 Score: {roberta_f1}',
    sep='\n',
)

RoBERTa Accuracy: 0.94
RoBERTa Precision: 0.9140437177777722
RoBERTa Recall: 0.9170112955079418
RoBERTa F1 Score: 0.9152320712888073
