<a href="https://colab.research.google.com/github/jeromekithinji/COMP6321-SentimentAnalysis/blob/main/CNN_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.initializers import Constant
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# SETUP & GLOVE DOWNLOAD
if not os.path.exists('glove.6B.100d.txt'):
    print("Downloading GloVe embeddings...")
    !wget http://nlp.stanford.edu/data/glove.6B.zip
    !unzip -q glove.6B.zip
    print("Download complete.")
else:
    print("GloVe embeddings already available.")

print(f"TensorFlow Version: {tf.__version__}")
print(f"GPU Available: {len(tf.config.list_physical_devices('GPU')) > 0}")


In [None]:
# CONFIGURATION
# Upper bound on the vocabulary: passed to the Tokenizer as `num_words` and
# used as the cap on the number of rows in the embedding matrix.
MAX_VOCAB_SIZE = 20000
# Every tokenized review is padded/truncated to exactly this many tokens.
MAX_SEQUENCE_LENGTH = 100
# Width of each word vector (number of columns in the embedding matrix).
EMBEDDING_DIM = 100
# Location of the cleaned reviews CSV on the Colab filesystem.
CSV_FILE_PATH = '/content/cleaned_amazon_reviews.csv'


In [None]:
# LOAD DATA
# Single source of truth for the file being read: the same value is used for
# the read and for the error message, so the message can never name a
# different file from the one that was actually missing.
csv_path = '/content/cleaned_amazon_reviews.csv'
text_column = 'cleaned_text'
label_column = 'sentiment'

print("Loading dataset...")
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"ERROR: Could not find '{csv_path}'. Please upload it to the Colab files area.")
    raise

# Drop rows missing either the text or the label: a missing label would
# otherwise flow into the stratified split and into training as NaN.
df = df.dropna(subset=[text_column, label_column])
X = df[text_column].astype(str).values
y = df[label_column].values

print(f"Data loaded: {len(df)} rows.")

# Hold out 20% as the test split, keeping the class proportions of `y`.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Manual split for oversampling
# Carve a stratified 10% validation slice out of the training split first, so
# that only the remaining training portion is oversampled below.
X_train_pure, X_val_pure, y_train_pure, y_val_pure = train_test_split(
    X_train_text,
    y_train,
    stratify=y_train,
    test_size=0.1,
    random_state=42,
)

print(f"Pure Training Shape: {X_train_pure.shape}")
print(f"Pure Validation Shape: {X_val_pure.shape}")

# fit_resample is handed the texts as a single-column 2-D array ...
X_train_reshaped = X_train_pure[:, np.newaxis]
ros = RandomOverSampler(random_state=42)
resampled_texts_2d, y_train_resampled = ros.fit_resample(X_train_reshaped, y_train_pure)

# ... and the result is collapsed back to a 1-D array of texts.
X_train_resampled = resampled_texts_2d.flatten()

print(f"Resampled Training Shape: {X_train_resampled.shape}")
class_distribution = np.unique(y_train_resampled, return_counts=True)
print(f"New Class Distribution: {class_distribution}")

In [None]:
# TOKENIZATION & PADDING
print("Tokenizing text...")
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")

# The vocabulary is fitted on the resampled training texts only; the
# validation and test texts are encoded with that same fitted tokenizer.
tokenizer.fit_on_texts(X_train_resampled)
sequences_train, sequences_val, sequences_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (X_train_resampled, X_val_pure, X_test_text)
)

# Identical padding settings for all three splits: pad and truncate at the
# end of each sequence to a fixed length.
pad_options = dict(maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
X_train_padded, X_val_padded, X_test_padded = (
    pad_sequences(seqs, **pad_options)
    for seqs in (sequences_train, sequences_val, sequences_test)
)

# From here on `y_train` refers to the oversampled training labels.
y_train = y_train_resampled

print(f"Vocab size: {len(tokenizer.word_index)}")
print(f"Training Data Shape: {X_train_padded.shape}")
print(f"Validation Data Shape: {X_val_padded.shape}")


In [None]:
# PREPARE EMBEDDING MATRIX
print("Creating embedding matrix...")

# Pick the GloVe 6B file whose vector width matches EMBEDDING_DIM, so the
# vectors read here always fit the matrix rows allocated below
# (EMBEDDING_DIM = 100 resolves to 'glove.6B.100d.txt').
glove_path = f'glove.6B.{EMBEDDING_DIM}d.txt'

# Each line of the file is "<word> <v1> <v2> ...": map word -> float32 vector.
embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
# Row i holds the vector for the word with tokenizer index i; the number of
# rows is capped at MAX_VOCAB_SIZE.
num_words = min(MAX_VOCAB_SIZE, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

for word, i in word_index.items():
    # Skip indices that have no row in the matrix.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words absent from GloVe keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [None]:
from tensorflow.keras.layers import Input, Bidirectional, LSTM, MaxPooling1D
# STRATEGY 3: CNN-LSTM Hybrid + Unfrozen Embeddings
# Layer stack, in order: fixed-length token ids -> embedding initialised from
# `embedding_matrix` and left trainable -> 1-D convolution -> max pooling ->
# LSTM -> dropout -> 3-way softmax.
model = Sequential([
    Input(shape=(MAX_SEQUENCE_LENGTH,)),
    Embedding(
        num_words,
        EMBEDDING_DIM,
        embeddings_initializer=Constant(embedding_matrix),
        trainable=True,
    ),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    LSTM(64),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])

# Sparse categorical cross-entropy: labels are passed as class indices
# rather than one-hot vectors.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

# OLD implementation using a Bidirectional LSTM and frozen embeddings
# model = Sequential()
# model.add(Input(shape=(MAX_SEQUENCE_LENGTH,)))
# model.add(Embedding(
#     num_words,
#     EMBEDDING_DIM,
#     embeddings_initializer=Constant(embedding_matrix),
#     trainable=False
# ))

# model.add(Bidirectional(LSTM(64, return_sequences=False)))

# model.add(Dropout(0.5))
# model.add(Dense(3, activation='softmax'))

# model.compile(optimizer='adam',
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])

# model.summary()

In [None]:
# Early stopping watches the validation loss with a patience of 3 epochs and
# restores the best weights seen when training stops.
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

# Train on the oversampled, padded training data; validate on the slice that
# was split off before oversampling.
fit_options = dict(
    epochs=15,
    batch_size=32,
    validation_data=(X_val_padded, y_val_pure),
    callbacks=[early_stopping],
)
history = model.fit(X_train_padded, y_train, **fit_options)

# OLD training before oversampling Strategy

# from sklearn.utils import class_weight
# class_weights = class_weight.compute_class_weight(
#     class_weight='balanced',
#     classes=np.unique(y_train),
#     y=y_train
# )
# class_weights_dict = dict(enumerate(class_weights))
# print(f"Class Weights: {class_weights_dict}")
# early_stopping = tf.keras.callbacks.EarlyStopping(
#     monitor='val_loss',
#     patience=3,
#     restore_best_weights=True
# )

# history = model.fit(
#     X_train_padded, y_train,
#     epochs=20,
#     batch_size=32,
#     validation_split=0.1,
#     class_weight=class_weights_dict,
#     callbacks=[early_stopping]
# )

In [None]:
# OLD Evaluation strategy before oversampling where we were forcing the model to be more focused on Neutral class by penalizing it more

# from sklearn.metrics import classification_report
# import numpy as np

# loss, accuracy = model.evaluate(X_test_padded, y_test, verbose=0)
# print(f"Standard Accuracy: {accuracy*100:.2f}%")

# y_pred_probs = model.predict(X_test_padded)

# final_predictions = []

# for prob in y_pred_probs:
#     # prob[1] is the probability of Neutral
#     if prob[1] > 0.35:
#         final_predictions.append(1)
#     else:
#         # If not Neutral, pick the winner between Negative (0) and Positive (2)
#         if prob[0] > prob[2]:
#             final_predictions.append(0)
#         else:
#             final_predictions.append(2)

# class_names = ['Negative (0)', 'Neutral (1)', 'Positive (2)']
# print(classification_report(y_test, final_predictions, target_names=class_names))

# loss, accuracy = model.evaluate(X_test_padded, y_test)
# print(f"Test Accuracy: {accuracy*100:.2f}%")


# New evaluation for the oversampling strategy (used for the 2nd and 3rd strategies)

from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Score the test split: per-class probabilities from the model, reduced to
# the index of the highest-scoring class for each sample.
print("Generating predictions...")
y_pred_probs = model.predict(X_test_padded)
y_pred_classes = y_pred_probs.argmax(axis=1)

class_names = ['Negative (0)', 'Neutral (1)', 'Positive (2)']
print("Classification Report (Oversampled Model)")
report_text = classification_report(y_test, y_pred_classes, target_names=class_names)
print(report_text)

# Raw-count confusion matrix (rows: actual class, columns: predicted class).
cm = confusion_matrix(y_test, y_pred_classes)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix (Oversampled)')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Recompute the test-set predictions and class labels so this cell can be run
# on its own.
print("Generating predictions...")
y_pred_probs = model.predict(X_test_padded)
# Predicted label = position of the largest probability in each row.
y_pred_classes = y_pred_probs.argmax(axis=1)

class_names = ['Negative (0)', 'Neutral (1)', 'Positive (2)']

In [None]:
# PLOT TRAINING HISTORY
def plot_history(history):
    """Draw training-vs-validation accuracy and loss curves side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss', each
            holding one value per epoch.

    Raises:
        KeyError: If any of the four keys is missing. All keys are read
            before a figure is created, so nothing is drawn in that case.
    """
    record = history.history
    # Read every series up front, in a fixed order, before touching pyplot.
    curves = {
        key: record[key]
        for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
    }
    # Epochs are numbered from 1 on the x axis.
    epochs = range(1, len(curves['accuracy']) + 1)

    # One entry per panel:
    # (subplot position, training key, training label,
    #  validation key, validation label, title, y-axis label)
    panels = (
        (1, 'accuracy', 'Training Acc', 'val_accuracy', 'Validation Acc',
         'Training and Validation Accuracy', 'Accuracy'),
        (2, 'loss', 'Training Loss', 'val_loss', 'Validation Loss',
         'Training and Validation Loss', 'Loss'),
    )

    plt.figure(figsize=(14, 5))
    for position, train_key, train_label, val_key, val_label, title, y_label in panels:
        plt.subplot(1, 2, position)
        plt.plot(epochs, curves[train_key], 'b-', label=train_label)
        plt.plot(epochs, curves[val_key], 'r-', label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)

    plt.tight_layout()
    plt.show()
# Plot the accuracy/loss curves for the run stored in `history`
# (the value returned by `model.fit` in the training cell).
plot_history(history)

In [None]:
# NORMALIZED CONFUSION MATRIX
# Divide every row of the count matrix by that row's total, so each cell
# becomes the fraction of that actual class assigned to each predicted class.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype('float') / row_totals

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_normalized,
    annot=True,
    fmt='.2f',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_ylabel('Actual Label')
ax.set_xlabel('Predicted Label')
ax.set_title('Confusion Matrix (Normalized by Class)')
plt.show()