In [None]:
import os
# This must run before `tensorflow` is imported below so the setting is already in
# the environment when the library loads. 'TF_CPP_MIN_LOG_LEVEL' controls TensorFlow's
# native (C++) logging; '3' is the quietest level (INFO, WARNING and ERROR filtered).
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pandas as pd
import tensorflow as tf

# Load the labelled IMDB reviews into a two-column frame: `review` (text) and
# `sentiment` (the label; presumably 0/1 -- confirm against the data file).
# The file has no header row (header=None), so column names are supplied here.
# The separator '\t+' is a regular expression (one or more tabs), which is why the
# 'python' parser engine is requested explicitly: pandas' C engine does not
# support regex separators.
df = pd.read_csv('./Task2-Sentiment_Analysis_Using_Neural_Networks/data/imdb_labelled.txt', sep='\t+', header=None, names=['review', 'sentiment'], engine='python')

# Create a function to split data into train, validation, and test sets
def train_val_test_split(X, y, val_size=0.10, test_size=0.10, rand_seed=42):
    """
    Shuffles the data and splits it into train, validation, and test sets.

    X and y are paired row-by-row through their pandas index (they are placed
    side by side in one DataFrame), shuffled together with a fixed seed, and then
    cut into three consecutive slices: train first, then validation, then test.
    The validation and test sizes are ``int(size * n_rows)`` (rounded down); the
    train set receives every remaining row.

    Args:
        X (pd.Series): Series of features
        y (pd.Series): Series of labels
        val_size (float): Proportion of data to use for validation set
        test_size (float): Proportion of data to use for test set
        rand_seed (int): Random seed for reproducibility

    Returns:
        Six pd.Series objects: X_train, X_val, X_test, y_train, y_val, y_test corresponding to
        the train, validation, and test sets for the features and labels respectively.
        Each returned Series keeps the name and index labels of the Series it came from.

    Raises:
        ValueError: If val_size or test_size is negative, or if together they exceed 1
            (which would otherwise yield a negative train size and overlapping slices).
    """
    if val_size < 0 or test_size < 0 or val_size + test_size > 1:
        raise ValueError(
            "val_size and test_size must be non-negative and sum to at most 1; "
            f"got val_size={val_size}, test_size={test_size}"
        )

    # Use fixed internal column names instead of X.name / y.name: keying the frame
    # by the Series names collapses both inputs into ONE column whenever the names
    # are equal (e.g. two unnamed Series, both named None), silently returning the
    # labels in place of the features.
    feature_col, label_col = "_features", "_labels"
    paired = pd.DataFrame({feature_col: X, label_col: y})

    n_rows = len(paired)
    n_val = int(val_size * n_rows)
    n_test = int(test_size * n_rows)

    # Slice boundaries within the shuffled frame: [0, train_end) is train,
    # [train_end, val_end) is validation, [val_end, n_rows) is test.
    train_end = n_rows - n_val - n_test
    val_end = n_rows - n_test

    shuffled = paired.sample(frac=1, random_state=rand_seed)  # Shuffle rows

    # Restore the callers' original Series names on the way out.
    features = shuffled[feature_col].rename(X.name)
    labels = shuffled[label_col].rename(y.name)

    X_train = features.iloc[:train_end]
    X_val = features.iloc[train_end:val_end]
    X_test = features.iloc[val_end:]
    y_train = labels.iloc[:train_end]
    y_val = labels.iloc[train_end:val_end]
    y_test = labels.iloc[val_end:]

    return X_train, X_val, X_test, y_train, y_val, y_test

# Partition the reviews and their labels using the helper's default proportions
# (10% validation, 10% test, remainder train) and default seed.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = train_val_test_split(df["review"], df["sentiment"])

# Input layer that turns raw text into fixed-length integer sequences: the vocabulary
# is capped at 5000 entries and every output is 250 ids long. The vocabulary is
# learned from the training reviews only.
vectorizer = tf.keras.layers.TextVectorization(
    output_sequence_length=250,
    max_tokens=5000,
)
vectorizer.adapt(X_train.values)


In [None]:
# Gather everything to display first: the learned vocabulary, one sample sentence,
# its vectorized form, and the vocabulary entry behind each integer id of that sample.
vocab = vectorizer.get_vocabulary()
sample_text = ["This movie was fantastic!"]
sample_text_vectorized = vectorizer(sample_text)
decoded_tokens = [vocab[token_id] for token_id in sample_text_vectorized[0]]

print(f'Vocab size : {vectorizer.vocabulary_size()}')
print(f"Top 20 vocab items: {vocab[:20]}")
print(f"Sample Text: {sample_text}")
print(f"Vectorized Text: {sample_text_vectorized}")
print(f"Tokens: {decoded_tokens}")

In [None]:

# Baseline network: the vectorizer's integer sequences feed directly into a stack of
# fully connected layers that ends in a single sigmoid unit.
model = tf.keras.models.Sequential([
    vectorizer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=['accuracy'],
)
model.summary()

# Fit for a fixed 50 epochs, scoring the validation split after each one.
model.fit(
    X_train,
    y_train,
    epochs=50,
    validation_data=(X_val, y_val),
)


In [None]:
# Try a different model architecture including an Embedding layer
# as well as using an LSTM layer

# model2 = tf.keras.Sequential([
#     vectorizer,
#     tf.keras.layers.Embedding(input_dim=len(vectorizer.get_vocabulary()), output_dim=64, mask_zero=True),
#     tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
#     tf.keras.layers.Dense(128, activation='relu'),
#     tf.keras.layers.Dense(1, activation='sigmoid')
# ])


# Recurrent variant: token ids -> 64-dim embeddings -> two stacked bidirectional LSTMs
# -> small dense head with a sigmoid output.
model2 = tf.keras.Sequential([
    vectorizer,
    # One embedding row per vocabulary entry; mask_zero=True marks id 0 as padding
    # to be masked for the layers that follow.
    tf.keras.layers.Embedding(
        input_dim=len(vectorizer.get_vocabulary()),
        output_dim=64,
        mask_zero=True,
    ),
    # return_sequences=True hands the whole sequence to the next recurrent layer
    # rather than only the final state.
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.2)
    ),
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(32, dropout=0.2)
    ),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model2.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'],
)
model2.summary()

In [None]:
# Train for at most 30 epochs, stopping once val_loss has failed to improve for
# 5 consecutive epochs. restore_best_weights=True matters here: without it, an
# early-stopped model2 keeps the weights from the last (already degraded) epoch,
# and those are the weights the later evaluate/predict cells would use. With it,
# early stopping rolls model2 back to the weights of its best val_loss epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)
model2.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=30, callbacks=[early_stopping])

In [None]:
# Plot the learning curves for loss and accuracy side by side in a single figure
import matplotlib.pyplot as plt

def plot_learning_curves(history):
    """
    Draws training-vs-validation curves for loss and accuracy in one figure.

    The figure has two side-by-side panels: loss on the left, accuracy on the
    right. Each panel plots the training series first and the validation series
    second, against the epoch index.

    Args:
        history: Object whose ``history`` attribute maps the keys "loss",
            "val_loss", "accuracy", and "val_accuracy" to per-epoch sequences.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # (training key, validation key, panel title, y-axis label), left to right.
    panels = (
        ("loss", "val_loss", "Loss - Training vs. Validation", "Loss"),
        ("accuracy", "val_accuracy", "Accuracy - Training vs. Validation", "Accuracy"),
    )

    plt.figure(figsize=(14,6))
    for slot, (train_key, val_key, heading, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, slot)
        plt.plot(history.history[train_key])
        plt.plot(history.history[val_key])
        plt.title(heading)
        plt.ylabel(y_label)
        plt.xlabel("Epoch")
        plt.legend(["Training", "Validation"])
    plt.show()
    
# Plot model2's training run. `model2.history` is read here as the record of the
# fit() call above -- presumably the History object Keras attaches to the model
# during fit(); confirm for the installed Keras version.
plot_learning_curves(model2.history)

In [None]:
# Score model2 on the test split, which was not passed to fit() as either training
# or validation data. Reports the compiled loss (binary cross-entropy) and accuracy.
model2.evaluate(X_test, y_test)

In [None]:
# Create a confusion matrix using numpy and pandas
# Plot confusion matrix with Seaborn heatmap
import numpy as np
import pandas as pd
import seaborn as sns

# model2 ends in a single sigmoid unit, so predict() yields one score per review.
# Scores strictly above 0.5 become class 1, everything else class 0, and the
# result is flattened to a 1-D array of labels.
pred_scores = model2.predict(X_test.values)
y_pred = np.where(pred_scores > 0.5, 1, 0).ravel()

def confusion_matrix(y_true, y_pred):
    """
    Cross-tabulates actual labels against predicted labels.

    Args:
        y_true: Actual labels (pd.Series or array-like).
        y_pred: Predicted labels, one per entry of y_true.

    Returns:
        pd.DataFrame of counts whose rows (axis name 'Actual') are the true labels
        and whose columns (axis name 'Predicted') are the predicted labels. Every
        label that occurs in either input appears on BOTH axes, so the matrix is
        always square; combinations that never occur hold 0.
    """
    pairs = pd.DataFrame({'y_Actual': y_true, 'y_Predicted': y_pred})
    counts = pd.crosstab(pairs['y_Actual'], pairs['y_Predicted'], rownames=['Actual'], colnames=['Predicted'])

    # crosstab only emits the labels it actually observed on each axis. If the model
    # predicts a single class (or a class is absent from y_true), the table would be
    # non-square and the missing class would vanish from the plot. Re-index both
    # axes over the union of labels, filling the unseen combinations with 0.
    labels = counts.index.union(counts.columns)
    return counts.reindex(
        index=labels.rename('Actual'),
        columns=labels.rename('Predicted'),
        fill_value=0,
    )


def plot_confusion_matrix(y_true, y_pred):
    """
    Displays the confusion matrix of the given labels as an annotated heatmap.

    Args:
        y_true: Actual labels, passed through to ``confusion_matrix``.
        y_pred: Predicted labels, passed through to ``confusion_matrix``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    cmatrix = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(8,6))
    # fmt='d' prints each cell as a plain integer count. seaborn's default
    # annotation format ('.2g') keeps only two significant digits, so counts of
    # 100 or more would be shown in scientific notation (e.g. 1.2e+02).
    sns.heatmap(cmatrix, annot=True, fmt='d')
    plt.title('Model Predictions Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
    
# Compare the test-split labels with model2's thresholded predictions computed above.
plot_confusion_matrix(y_test, y_pred)