In [11]:
# !pip3 install -r requirements.txt

Collecting tensorflow>=2.17.1 (from -r requirements.txt (line 14))
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow>=2.17.1->-r requirements.txt (line 14))
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting tensorboard<2.19,>=2.18 (from tensorflow-intel==2.18.0->tensorflow>=2.17.1->-r requirements.txt (line 14))
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl (7.5 kB)
Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl (390.3 MB)
   ---------------------------------------- 0.0/390.3 MB ? eta -:--:--
   ---------------------------------------- 4.2/390.3 MB 22.9 MB/s eta 0:00:17
   - -------------------------------------- 10.0/390.3 MB 24.8 MB/s eta 0:00:16
   - -------------------------------------- 15.5/390.3 MB 25.6 MB/s eta 0:00:15
   -- ------------------------------------- 21.0

  You can safely remove it manually.

[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import necessary libraries

# Data manipulation
import pandas as pd
import numpy as np
from datetime import datetime 

# Statistical functions
from scipy.stats import zscore

# Text Preprocessing and NLP
import nltk
# Stopwords (common words to ignore) from NLTK
from nltk.corpus import stopwords

# Tokenizing sentences/words
from nltk.corpus import wordnet

# Tokenizing sentences/words
from nltk.tokenize import word_tokenize
# Lemmatization (converting words to their base form)
from nltk.stem import WordNetLemmatizer


# For generating n-grams
from nltk.util import ngrams
from collections import Counter

## Data Preparation (Loading CSV)

Load the three CSV files into a pandas DataFrame `data`.

In [None]:
data = pd.read_csv('../final_df.csv')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11518 entries, 0 to 11517
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   year                   11518 non-null  int64 
 1   month                  11518 non-null  int64 
 2   sentiment              11518 non-null  object
 3   processed_full_review  11518 non-null  object
dtypes: int64(2), object(2)
memory usage: 360.1+ KB


In [5]:
data['sentiment'].value_counts()

sentiment
Positive    7913
Negative    2441
Neutral     1164
Name: count, dtype: int64

In [6]:
data['year'].value_counts()

year
2019    5129
2018    2596
2022    1184
2023    1111
2020     888
2024     514
2021      96
Name: count, dtype: int64

# Basic RNN + Tokenizer Self-Trained Embedding Layer

In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
import numpy as np
import random
import os

# Set to CPU only
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU

# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Parameters
vocab_size = 5000         # Limit vocabulary to 5000 words
embedding_dim = 128       # Embedding dimensions for each word
max_sequence_length = 300 # Max number of words in each sequence
l2_lambda = 0.01 

# Step 1: Tokenize and Pad the Text
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['processed_full_review'])
sequences = tokenizer.texts_to_sequences(data['processed_full_review'])
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
sentiment_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y = data['sentiment'].map(sentiment_dict).values

# Calculate class weights
class_weights_values = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights = {i: class_weights_values[i] for i in range(len(class_weights_values))}

# Define stratified 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

# Cross-validation loop
for fold, (train_index, test_index) in enumerate(skf.split(X_padded, y)):
    print(f"\nTraining fold {fold + 1}...\n")
    
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Define the model architecture
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length, trainable=True))
    model.add(SimpleRNN(64, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(l2_lambda)))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2(l2_lambda)))
    
    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    # Train the model with early stopping and class weights
    model.fit(
        X_train, y_train, 
        epochs=10, 
        batch_size=128,  
        validation_split=0.2, 
        verbose=1,
        callbacks=[early_stopping],
        class_weight=class_weights
    )
    
    # Predictions and evaluation for the current fold
    y_pred_prob = model.predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)
    
    # Calculate metrics for the current fold
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, output_dict=True)
    f1 = report['weighted avg']['f1-score']
    
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    
    print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
    print(f"Fold {fold + 1} F1 Score: {f1:.4f}")
    print(f"Fold {fold + 1} Classification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, digits=4))

# Print average metrics across all folds
print("\nAverage Metrics across folds:")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")


2024-11-09 23:38:24.371102: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-09 23:38:24.424690: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1731166704.446436  293354 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1731166704.452657  293354 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-09 23:38:24.507969: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr


Training fold 1...

Epoch 1/10


2024-11-09 23:38:26.428189: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 0.4357 - loss: 1.8302 - val_accuracy: 0.3956 - val_loss: 1.6660
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7336 - loss: 1.2637 - val_accuracy: 0.7292 - val_loss: 1.0412
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.8249 - loss: 0.9020 - val_accuracy: 0.7727 - val_loss: 0.8710
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8206 - loss: 0.8082 - val_accuracy: 0.7477 - val_loss: 0.8637
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9028 - loss: 0.6364 - val_accuracy: 0.6262 - val_loss: 0.9345
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.9344 - loss: 0.4802 - val_accuracy: 0.7157 - val_loss: 0.8837
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.5010 - loss: 1.7917 - val_accuracy: 0.4645 - val_loss: 1.5147
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.7643 - loss: 1.2497 - val_accuracy: 0.6180 - val_loss: 1.1050
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8617 - loss: 0.8723 - val_accuracy: 0.6348 - val_loss: 1.0092
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9111 - loss: 0.6058 - val_accuracy: 0.6782 - val_loss: 0.9567
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9131 - loss: 0.5440 - val_accuracy: 0.6663 - val_loss: 1.0103
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9268 - loss: 0.4909 - val_accuracy: 0.6647 - val_loss: 0.9894
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 36ms/step - accuracy: 0.5040 - loss: 1.7974 - val_accuracy: 0.5952 - val_loss: 1.3694
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8048 - loss: 1.1338 - val_accuracy: 0.6826 - val_loss: 1.0879
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8575 - loss: 0.8042 - val_accuracy: 0.6576 - val_loss: 1.0388
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9067 - loss: 0.6242 - val_accuracy: 0.6712 - val_loss: 1.0161
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.9331 - loss: 0.4828 - val_accuracy: 0.6614 - val_loss: 0.9654
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.9194 - loss: 0.4481 - val_accuracy: 0.6571 - val_loss: 1.1634
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 41ms/step - accuracy: 0.4309 - loss: 1.8439 - val_accuracy: 0.4520 - val_loss: 1.5386
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.7868 - loss: 1.2306 - val_accuracy: 0.5741 - val_loss: 1.1742
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8850 - loss: 0.7921 - val_accuracy: 0.5410 - val_loss: 1.2520
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.9015 - loss: 0.6435 - val_accuracy: 0.6750 - val_loss: 1.0003
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9182 - loss: 0.5311 - val_accuracy: 0.6858 - val_loss: 1.0287
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.9261 - loss: 0.4792 - val_accuracy: 0.6804 - val_loss: 0.9915
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.3388 - loss: 1.8777 - val_accuracy: 0.4216 - val_loss: 1.6282
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.6652 - loss: 1.3618 - val_accuracy: 0.4590 - val_loss: 1.4496
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8044 - loss: 1.0192 - val_accuracy: 0.5768 - val_loss: 1.2406
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.8914 - loss: 0.6887 - val_accuracy: 0.6462 - val_loss: 1.1402
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.9398 - loss: 0.4995 - val_accuracy: 0.6571 - val_loss: 1.1858
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.9609 - loss: 0.4129 - val_accuracy: 0.6658 - val_loss: 1.1771
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━

# RNN + Count Vectoriser

### Loss of Sequential Information
Poor performance because RNNs are not well-suited to the bag-of-words representation generated by `CountVectorizer`. Since `CountVectorizer` treats each document as a set of words without any order, words are represented only by their counts, not by their position in the text. Since RNNs are designed to work with ordered sequences, where the position and context of words matter, without preserving word order, the RNN cannot capture dependencies between words over time.

### Sparse, non-contextual input
`CountVectorizer` produces a sparse representation where each word is treated as an independent feature based on its frequency. There is no semantic or contextual relationship between words, and the word counts lack dense, meaningful relationships that an RNN could leverage, since RNNs perform best with dense, continuous data that represents meaningful relationships between words, typically achieved with word embeddings.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
import numpy as np
import random
import os

# Set to CPU only
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU

# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Parameters
max_features = 5000  # Limit vocabulary to 5000 words
l2_lambda = 0.01 

# Step 1: Vectorize Text Data using CountVectorizer
vectorizer = CountVectorizer(max_features=max_features)
X_count = vectorizer.fit_transform(data['processed_full_review']).toarray()

# Labels
sentiment_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y = data['sentiment'].map(sentiment_dict).values

# Calculate class weights
class_weights_values = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights = {i: class_weights_values[i] for i in range(len(class_weights_values))}

# Define stratified 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

# Cross-validation loop
for fold, (train_index, test_index) in enumerate(skf.split(X_count, y)):
    print(f"\nTraining fold {fold + 1}...\n")
    
    X_train, X_test = X_count[train_index], X_count[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Define the model architecture
    model = Sequential()
    model.add(SimpleRNN(64, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(l2_lambda)))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2(l2_lambda)))
    
    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    # Train the model with early stopping and class weights
    model.fit(
        X_train, y_train, 
        epochs=10, 
        batch_size=128,  
        validation_split=0.2, 
        verbose=1,
        callbacks=[early_stopping],
        class_weight=class_weights
    )
    
    # Predictions and evaluation for the current fold
    y_pred_prob = model.predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)
    
    # Calculate metrics for the current fold
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, output_dict=True)
    f1 = report['weighted avg']['f1-score']
    
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    
    print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
    print(f"Fold {fold + 1} F1 Score: {f1:.4f}")
    print(f"Fold {fold + 1} Classification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, digits=4))

# Print average metrics across all folds
print("\nAverage Metrics across folds:")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")

# RNN + Count Vectoriser + Conversion to pseudo-sequences with word indices

Performance is better than Basic RNN.

Over here, we transform the `CountVectorizer` output into integer sequences which is compatible with the embedding layer. 

Why `CountVectorizer` is better here is because sentiment analysis often hinges more on the presence of certain key words rather than on the strict order of words in a sequence. Unlike other NLP tasks where the exact sequence of words matters (e.g. translation or grammar correction), sentiment analysis can often succeed with just the occurrence or frequency of these key items. `CountVectorizer` captures this by creating a bag-of-words representation that prioritises word presence and frequency, which is often enough for sentiment detection.



In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout
from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
import numpy as np
import random

tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Parameters
max_features = 5000       # Limit vocabulary to 5000 words
embedding_dim = 128        # Embedding dimensions for each word
max_sequence_length = 300 # Max number of words in each sequence

# Step 1: Text Vectorization using CountVectorizer
vectorizer = CountVectorizer(max_features=max_features)
X_counts = vectorizer.fit_transform(data['processed_full_review'])
word_index = vectorizer.vocabulary_

# Inverse vocabulary mapping for sequences creation
index_to_word = {i: word for word, i in word_index.items()}

def counts_to_sequences(X_counts):
    sequences = []
    for i in range(X_counts.shape[0]):
        indices = X_counts[i].nonzero()[1]
        words = [index_to_word[idx] for idx in indices]
        seq = [word_index[word] + 1 for word in words]  # +1 because 0 is reserved for padding
        sequences.append(seq)
    return sequences

sequences = counts_to_sequences(X_counts)
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
sentiment_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y = data['sentiment'].map(sentiment_dict).values

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

model = Sequential()
model.add(Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, input_length=max_sequence_length, trainable=True))
model.add(SimpleRNN(64, activation='tanh', input_shape=(X_train_reshaped.shape[1], 1)))  # Input shape adjusted
model.add(Dropout(0.5))  # Dropout for regularization
model.add(Dense(3, activation='softmax'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

model.fit(X_train, y_train, epochs=10, batch_size=64,  validation_split=0.2, verbose=1, class_weight=class_weights_dict)

y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Calculate and print classification report
report = classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, digits=4)
print('Performance Metrics:\n', report)

Epoch 1/10


  super().__init__(**kwargs)


[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 43ms/step - accuracy: 0.4720 - loss: 1.0589 - val_accuracy: 0.7911 - val_loss: 0.5905
Epoch 2/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.8687 - loss: 0.5639 - val_accuracy: 0.7629 - val_loss: 0.5588
Epoch 3/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 43ms/step - accuracy: 0.9391 - loss: 0.2436 - val_accuracy: 0.7466 - val_loss: 0.6518
Epoch 4/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 46ms/step - accuracy: 0.9750 - loss: 0.0925 - val_accuracy: 0.7982 - val_loss: 0.6056
Epoch 5/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 53ms/step - accuracy: 0.9902 - loss: 0.0385 - val_accuracy: 0.8041 - val_loss: 0.6644
Epoch 6/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 52ms/step - accuracy: 0.9983 - loss: 0.0145 - val_accuracy: 0.8041 - val_loss: 0.7157
Epoch 7/10
[1m116/116[0m [32m━

# RNN + Within model trained Word2Vec

`Word2Vec` performs worse than `CountVectorizer`.

Because our dataset is only 10k rows, Word2Vec embeddings might lack the depth needed for nuanced sentiment patterns, particularly without pre-training on a larger corpus. If Word2Vec embeddings do not generalise well or have insufficient context, the RNN might not capture subtle sentiment signals in the text, which can degrade model performance. In contrast, CountVectorizer builds a fixed vocab of words based on frequency, and does not need to learn semantic relationships among words, making it robust in cases where the model vocab size is small. 


In [9]:
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np
import random
import os
import nltk
from nltk.tokenize import word_tokenize

# Ensure NLTK's punkt tokenizer is downloaded
# nltk.download('punkt')

# Set to CPU only
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU

# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Parameters
embedding_dim = 128       # Dimension of Word2Vec embeddings
max_sequence_length = 300 # Max number of words in each sequence
l2_lambda = 0.01 

# Step 1: Tokenize the text data
tokenized_reviews = [word_tokenize(review.lower()) for review in data['processed_full_review']]

# Step 2: Train Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_reviews, vector_size=embedding_dim, window=5, min_count=1, sg=1, seed=42)

# Step 3: Prepare embedding matrix
vocab_size = len(word2vec_model.wv.key_to_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Map Word2Vec vectors to the embedding matrix
word_index = {word: idx + 1 for idx, word in enumerate(word2vec_model.wv.key_to_index)}
for word, idx in word_index.items():
    embedding_matrix[idx] = word2vec_model.wv[word]

# Step 4: Convert reviews to sequences of word indices
sequences = [[word_index.get(word, 0) for word in review] for review in tokenized_reviews]
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
sentiment_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y = data['sentiment'].map(sentiment_dict).values

# Calculate class weights
class_weights_values = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights = {i: class_weights_values[i] for i in range(len(class_weights_values))}

# Define stratified 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

# Cross-validation loop
for fold, (train_index, test_index) in enumerate(skf.split(X_padded, y)):
    print(f"\nTraining fold {fold + 1}...\n")
    
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Define the model architecture
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        weights=[embedding_matrix], input_length=max_sequence_length, trainable=True))
    model.add(SimpleRNN(64, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(l2_lambda)))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2(l2_lambda)))
    
    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    # Train the model with early stopping and class weights
    model.fit(
        X_train, y_train, 
        epochs=10, 
        batch_size=128,  
        validation_split=0.2, 
        verbose=1,
        callbacks=[early_stopping],
        class_weight=class_weights
    )
    
    # Predictions and evaluation for the current fold
    y_pred_prob = model.predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)
    
    # Calculate metrics for the current fold
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, output_dict=True)
    f1 = report['weighted avg']['f1-score']
    
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    
    print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
    print(f"Fold {fold + 1} F1 Score: {f1:.4f}")
    print(f"Fold {fold + 1} Classification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, digits=4))

# Print average metrics across all folds
print("\nAverage Metrics across folds:")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")



Training fold 1...

Epoch 1/10




[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.4849 - loss: 1.9380 - val_accuracy: 0.6837 - val_loss: 1.5286
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.7381 - loss: 1.3876 - val_accuracy: 0.6609 - val_loss: 1.3610
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.7954 - loss: 1.1780 - val_accuracy: 0.7645 - val_loss: 1.1434
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8274 - loss: 1.0645 - val_accuracy: 0.7558 - val_loss: 1.1008
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.8706 - loss: 0.8937 - val_accuracy: 0.7537 - val_loss: 1.0697
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 40ms/step - accuracy: 0.8855 - loss: 0.7672 - val_accuracy: 0.6777 - val_loss: 1.2103
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.4887 - loss: 1.8783 - val_accuracy: 0.6739 - val_loss: 1.5087
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7657 - loss: 1.3937 - val_accuracy: 0.6848 - val_loss: 1.3813
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7806 - loss: 1.2170 - val_accuracy: 0.6837 - val_loss: 1.2913
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.8237 - loss: 1.0858 - val_accuracy: 0.7065 - val_loss: 1.1848
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8050 - loss: 1.0093 - val_accuracy: 0.6869 - val_loss: 1.2237
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7932 - loss: 1.0186 - val_accuracy: 0.6538 - val_loss: 1.2516
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.4021 - loss: 1.9702 - val_accuracy: 0.6663 - val_loss: 1.5417
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.6488 - loss: 1.4852 - val_accuracy: 0.6685 - val_loss: 1.3594
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7712 - loss: 1.2328 - val_accuracy: 0.6647 - val_loss: 1.3095
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7933 - loss: 1.0721 - val_accuracy: 0.7162 - val_loss: 1.1402
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.8230 - loss: 0.9758 - val_accuracy: 0.6283 - val_loss: 1.2691
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8585 - loss: 0.7959 - val_accuracy: 0.6359 - val_loss: 1.2068
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.4166 - loss: 1.9690 - val_accuracy: 0.5789 - val_loss: 1.6101
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7192 - loss: 1.4167 - val_accuracy: 0.5751 - val_loss: 1.4711
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7904 - loss: 1.1667 - val_accuracy: 0.6978 - val_loss: 1.2147
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.8288 - loss: 0.9883 - val_accuracy: 0.7173 - val_loss: 1.1212
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8347 - loss: 0.8659 - val_accuracy: 0.6256 - val_loss: 1.2570
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8763 - loss: 0.7570 - val_accuracy: 0.6603 - val_loss: 1.1869
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.3788 - loss: 2.0308 - val_accuracy: 0.5453 - val_loss: 1.7255
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.6809 - loss: 1.4950 - val_accuracy: 0.6142 - val_loss: 1.4401
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 39ms/step - accuracy: 0.7348 - loss: 1.3049 - val_accuracy: 0.7103 - val_loss: 1.2388
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.8072 - loss: 1.1130 - val_accuracy: 0.6451 - val_loss: 1.2761
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.8377 - loss: 0.9486 - val_accuracy: 0.6989 - val_loss: 1.2035
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 38ms/step - accuracy: 0.8420 - loss: 0.8765 - val_accuracy: 0.7249 - val_loss: 1.1054
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━

# RNN + FastText

In [10]:
from gensim.models import FastText
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np
import random
import os
import nltk
from nltk.tokenize import word_tokenize

# Ensure NLTK's punkt tokenizer is downloaded
# nltk.download('punkt')

# Set to CPU only
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU

# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Parameters
embedding_dim = 128       # Dimension of FastText embeddings
max_sequence_length = 300 # Max number of words in each sequence
l2_lambda = 0.01 

# Step 1: Tokenize the text data
tokenized_reviews = [word_tokenize(review.lower()) for review in data['processed_full_review']]

# Step 2: Train FastText model
fasttext_model = FastText(sentences=tokenized_reviews, vector_size=embedding_dim, window=5, min_count=1, sg=1, seed=42)

# Step 3: Prepare embedding matrix
vocab_size = len(fasttext_model.wv.key_to_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))

# Map FastText vectors to the embedding matrix
word_index = {word: idx + 1 for idx, word in enumerate(fasttext_model.wv.key_to_index)}
for word, idx in word_index.items():
    embedding_matrix[idx] = fasttext_model.wv[word]

# Step 4: Convert reviews to sequences of word indices
sequences = [[word_index.get(word, 0) for word in review] for review in tokenized_reviews]
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
sentiment_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y = data['sentiment'].map(sentiment_dict).values

# Calculate class weights
class_weights_values = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights = {i: class_weights_values[i] for i in range(len(class_weights_values))}

# Define stratified 5-fold cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

# Cross-validation loop
for fold, (train_index, test_index) in enumerate(skf.split(X_padded, y)):
    print(f"\nTraining fold {fold + 1}...\n")
    
    X_train, X_test = X_padded[train_index], X_padded[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    # Define the model architecture
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, 
                        weights=[embedding_matrix], input_length=max_sequence_length, trainable=True))
    model.add(SimpleRNN(64, activation='tanh', kernel_regularizer=tf.keras.regularizers.l2(l2_lambda)))
    model.add(Dropout(0.5))
    model.add(Dense(3, activation='softmax', kernel_regularizer=tf.keras.regularizers.l2(l2_lambda)))
    
    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    # Early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
    
    # Train the model with early stopping and class weights
    model.fit(
        X_train, y_train, 
        epochs=10, 
        batch_size=128,  
        validation_split=0.2, 
        verbose=1,
        callbacks=[early_stopping],
        class_weight=class_weights
    )
    
    # Predictions and evaluation for the current fold
    y_pred_prob = model.predict(X_test)
    y_pred = np.argmax(y_pred_prob, axis=1)
    
    # Calculate metrics for the current fold
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, output_dict=True)
    f1 = report['weighted avg']['f1-score']
    
    accuracy_scores.append(accuracy)
    f1_scores.append(f1)
    
    print(f"Fold {fold + 1} Accuracy: {accuracy:.4f}")
    print(f"Fold {fold + 1} F1 Score: {f1:.4f}")
    print(f"Fold {fold + 1} Classification Report:\n", classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, digits=4))

# Print average metrics across all folds
print("\nAverage Metrics across folds:")
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")



Training fold 1...

Epoch 1/10




[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - accuracy: 0.5104 - loss: 1.8729 - val_accuracy: 0.6218 - val_loss: 1.6363
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7206 - loss: 1.4704 - val_accuracy: 0.6511 - val_loss: 1.4793
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.7615 - loss: 1.2732 - val_accuracy: 0.6853 - val_loss: 1.3166
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8114 - loss: 1.0917 - val_accuracy: 0.7493 - val_loss: 1.1498
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8090 - loss: 1.0071 - val_accuracy: 0.7417 - val_loss: 1.1711
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7830 - loss: 1.0443 - val_accuracy: 0.7287 - val_loss: 1.1582
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 38ms/step - accuracy: 0.4508 - loss: 1.9211 - val_accuracy: 0.5784 - val_loss: 1.6398
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.7369 - loss: 1.4191 - val_accuracy: 0.5654 - val_loss: 1.6890
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.7457 - loss: 1.2806 - val_accuracy: 0.7043 - val_loss: 1.2339
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.8241 - loss: 1.0367 - val_accuracy: 0.7103 - val_loss: 1.1498
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8276 - loss: 0.9481 - val_accuracy: 0.6951 - val_loss: 1.1527
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8844 - loss: 0.7649 - val_accuracy: 0.6896 - val_loss: 1.1666
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 40ms/step - accuracy: 0.4053 - loss: 1.9778 - val_accuracy: 0.5920 - val_loss: 1.6161
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7103 - loss: 1.4139 - val_accuracy: 0.5860 - val_loss: 1.4802
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.7608 - loss: 1.2011 - val_accuracy: 0.6652 - val_loss: 1.2982
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8194 - loss: 1.0575 - val_accuracy: 0.7027 - val_loss: 1.1849
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 35ms/step - accuracy: 0.8290 - loss: 0.9661 - val_accuracy: 0.7081 - val_loss: 1.1689
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.8699 - loss: 0.8546 - val_accuracy: 0.6858 - val_loss: 1.1345
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 39ms/step - accuracy: 0.4473 - loss: 1.9335 - val_accuracy: 0.6712 - val_loss: 1.5310
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 36ms/step - accuracy: 0.7140 - loss: 1.4322 - val_accuracy: 0.6886 - val_loss: 1.3407
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8039 - loss: 1.1686 - val_accuracy: 0.6093 - val_loss: 1.3714
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.6829 - loss: 1.1582 - val_accuracy: 0.6853 - val_loss: 1.2525
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.8546 - loss: 0.9164 - val_accuracy: 0.6810 - val_loss: 1.2097
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8769 - loss: 0.8068 - val_accuracy: 0.7781 - val_loss: 1.0003
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━



[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.3995 - loss: 2.0047 - val_accuracy: 0.5328 - val_loss: 1.9617
Epoch 2/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 37ms/step - accuracy: 0.5888 - loss: 1.7293 - val_accuracy: 0.5898 - val_loss: 1.5581
Epoch 3/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.7108 - loss: 1.3025 - val_accuracy: 0.7379 - val_loss: 1.1770
Epoch 4/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.7765 - loss: 1.1194 - val_accuracy: 0.6191 - val_loss: 1.3764
Epoch 5/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - accuracy: 0.8018 - loss: 1.0019 - val_accuracy: 0.7021 - val_loss: 1.1315
Epoch 6/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.8157 - loss: 0.9090 - val_accuracy: 0.6782 - val_loss: 1.1528
Epoch 7/10
[1m58/58[0m [32m━━━━━━━━━━━━━━━

# RNN + Pre-trained Word2Vec

Pre-trained Word2Vec performs worse than within model trained Word2Vec.

Google's Word2Vec embeddings were trained on very general Google News dataset, which may not align well with the context or vocabulary of our specific dataset, while custom embeddings trained directly on our dataset are tailored to the specific language and sentiment patterns within it.

Since our dataset cotntains a lot of domain-specific terms and sentiment-heavy words that are less common in general news (like "amazing", "terrible", "refund"), pre-trained embeddings may not capture these terms accurately. Within-model embeddings can adapt specifically to the words and nuances in our dataset.

In [44]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from gensim.models import KeyedVectors
import tensorflow as tf
import numpy as np
import random

tf.random.set_seed(42)
np.random.seed(42)
random.seed(42)

# Parameters
vocab_size = 5000         # Limit vocabulary to 5000 words
embedding_dim = 300        # Embedding dimensions for each word
max_sequence_length = 300 # Max number of words in each sequence

# Step 1: Tokenize and Pad the Text
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(data['processed_full_review'])
sequences = tokenizer.texts_to_sequences(data['processed_full_review'])
X_padded = pad_sequences(sequences, maxlen=max_sequence_length)

# Labels
sentiment_dict = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
y = data['sentiment'].map(sentiment_dict).values

word2vec_model = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)

# Create Embedding Matrix with Pre-trained Word2Vec
embedding_matrix = np.zeros((vocab_size, embedding_dim))
word_index = tokenizer.word_index

for word, i in word_index.items():
    if i < vocab_size:
        # Retrieve the embedding vector for the word
        if word in word2vec_model:
            embedding_matrix[i] = word2vec_model[word]

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Step 2: Define a Simple RNN Model
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(SimpleRNN(64, activation='tanh'))
model.add(Dropout(0.5))  # Add dropout for regularization
model.add(Dense(3, activation='softmax'))   # Output layer for 3 classes

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

# Step 3: Train the Model
model.fit(X_train, y_train, epochs=10, batch_size=64,  validation_split=0.2, verbose=1, class_weight=class_weights_dict)

y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob, axis=1)

# Calculate and print classification report
report = classification_report(y_test, y_pred, target_names=['Negative', 'Neutral', 'Positive'], zero_division=0, digits=4)
print('Performance Metrics:\n', report)

Epoch 1/10




[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 41ms/step - accuracy: 0.4276 - loss: 1.1304 - val_accuracy: 0.7184 - val_loss: 0.6579
Epoch 2/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.6504 - loss: 0.8589 - val_accuracy: 0.6994 - val_loss: 0.7364
Epoch 3/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 40ms/step - accuracy: 0.6428 - loss: 0.8659 - val_accuracy: 0.6804 - val_loss: 0.9888
Epoch 4/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.5904 - loss: 1.0298 - val_accuracy: 0.7303 - val_loss: 0.6904
Epoch 5/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.7149 - loss: 0.7646 - val_accuracy: 0.7656 - val_loss: 0.5968
Epoch 6/10
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 39ms/step - accuracy: 0.7062 - loss: 0.7707 - val_accuracy: 0.7699 - val_loss: 0.5865
Epoch 7/10
[1m116/116[0m [32m━