In [2]:
import os
import re
import string

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import BernoulliNB

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding, LSTM, SpatialDropout1D

tf.config.run_functions_eagerly(True)

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
import keras

In [3]:
# Load data
books_data = pd.read_csv(r"C:\Users\wu02x\Downloads\SC4021\new_pre_processed_data.csv")
books_data.head()

Unnamed: 0,comment_text,sentiment
0,commenttext,2
1,ded get call cthulhu weird tale vintage classi...,3
2,love lovecraft agree call cthulhu not best wor...,3
3,call cthulhu never best work simply popular we...,2
4,shadow over innsmouth second popular work righ...,2


In [4]:
books_data.sentiment.value_counts()

sentiment
1    13906
2     9258
3     6448
0     4354
Name: count, dtype: int64

In [5]:
books_data = books_data[(books_data['sentiment'] == 0) | (books_data['sentiment'] == 1)]

In [6]:
# downsample the data due to impbalance in positive and negative data
positive_class_samples = books_data[books_data['sentiment'] == 1].sample(n=len(books_data[books_data['sentiment'] == 0]), random_state=42)
negative_class_samples = books_data[books_data['sentiment'] == 0]
# Concatenate minority and majority class samples
books_data = pd.concat([positive_class_samples, negative_class_samples])

# Shuffle the dataset
books_data = books_data.sample(frac=1, random_state=42)
books_data.sentiment.value_counts()

sentiment
1    4354
0    4354
Name: count, dtype: int64

In [7]:
# Separating the 80% data for training data and 20% for testing data and maintain equal ratio of classes in the train and test sample
X_train, X_test, y_train, y_test = train_test_split(books_data['comment_text'], books_data['sentiment'], test_size=0.2, stratify=books_data['sentiment'], random_state=42)

In [8]:
s = 0.0
for i in books_data ['comment_text']:
    word_list = i.split()
    s = s + len(word_list)
print("Average length of each review : ",s/books_data .shape[0])

Average length of each review :  42.25861276986679


In [9]:
# Find vocab size
# Concatenate all the text in the 'comment_text' column into a single string
all_text = ' '.join(books_data ['comment_text'])

# Tokenize the string into individual words
words = all_text.split()

# Create a set from the tokenized words to remove duplicates
vocab = set(words)

# Find the length of the set, which will give you the vocabulary size
vocab_size = len(vocab)
#
print("Vocabulary size:", vocab_size)

Vocabulary size: 26685


In [10]:
# # Hyperparameters of the model
vocab_size = 28000
oov_tok = ''
embedding_dim = 50
max_length = 80
# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=max_length)

In [11]:
# Define the number of epochs
num_epochs = 100

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# Define parameter
n_lstm = 128
drop_lstm = 0.2
# Define LSTM Model 
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_shape=(max_length,)))
model.add(SpatialDropout1D(drop_lstm))
model.add(LSTM(n_lstm, return_sequences=False))
model.add(Dropout(drop_lstm))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

# Fit model with early stopping
history = model.fit(train_padded, y_train,
                    epochs=num_epochs, verbose=1,
                    validation_data=(test_padded, y_test),
                    callbacks=[early_stopping]
                    )


  super().__init__(**kwargs)


Epoch 1/100




[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 781ms/step - accuracy: 0.5684 - loss: 0.6758 - val_accuracy: 0.7135 - val_loss: 0.5729
Epoch 2/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 800ms/step - accuracy: 0.8024 - loss: 0.4467 - val_accuracy: 0.7400 - val_loss: 0.5260
Epoch 3/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 781ms/step - accuracy: 0.8902 - loss: 0.2805 - val_accuracy: 0.7440 - val_loss: 0.5952
Epoch 4/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m163s[0m 747ms/step - accuracy: 0.9370 - loss: 0.1745 - val_accuracy: 0.7382 - val_loss: 0.7457
Epoch 5/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 787ms/step - accuracy: 0.9605 - loss: 0.1128 - val_accuracy: 0.7273 - val_loss: 0.8752


In [12]:
from sklearn.metrics import f1_score, average_precision_score, precision_score, recall_score, accuracy_score

# Make predictions on the test set
result = model.predict(test_padded)

# For example, you might round probabilities to the nearest integer
y_pred_binary = [1 if p > 0.5 else 0 for p in result]

# Compute F1 score
F1_score = f1_score(y_test, y_pred_binary)

# Calculate average precision
average_precision = average_precision_score(y_test, y_pred_binary)

# Calculate precision
precision = precision_score(y_test, y_pred_binary)

# Calculate recall
recall_score = recall_score(y_test, y_pred_binary)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)

print('F1 score: {0:0.3f}'.format(F1_score))
print('Precision score: {0:0.3f}'.format(precision))
print('Recall score: {0:0.3f}'.format(recall_score))
# print('Average precision-recall score: {0:0.3f}'.format(average_precision))
print('Accuracy score: {0:0.3f}'.format(accuracy))

[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 307ms/step
F1 score: 0.732
Precision score: 0.720
Recall score: 0.745
Accuracy score: 0.727


## Evaluation

In [13]:
# Load evaluation dataset
eval_filepath = r"C:\Users\wu02x\Downloads\SC4021\evaluation_preprocessed_data.csv"
eval_data = pd.read_csv(eval_filepath)

In [14]:
# Convert the column from float to int
eval_data['manual_label'] = eval_data['manual_label'].astype(int)

In [15]:
X_eval = eval_data.comment_text
y_eval = eval_data.manual_label

In [16]:
# convert Test dataset to sequence and pad sequences
eval_sequences = tokenizer.texts_to_sequences(X_eval)
eval_padded = pad_sequences(eval_sequences, maxlen=max_length)

In [18]:
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.metrics import classification_report


start_time = time.time()

# Make predictions on the evaluation set
result = model.predict(eval_padded)

y_pred_binary = [1 if p > 0.5 else 0 for p in result]

end_time = time.time()
classification_time = end_time - start_time

print("Classification Time for 1000 records:", classification_time, "seconds")

print(classification_report(y_eval, y_pred_binary, digits=4))

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 306ms/step
Classification Time for 1000 records: 7.780988454818726 seconds
              precision    recall  f1-score   support

           0     0.4610    0.8500    0.5978       160
           1     0.9503    0.7427    0.8338       618

    accuracy                         0.7648       778
   macro avg     0.7057    0.7964    0.7158       778
weighted avg     0.8497    0.7648    0.7853       778



In [19]:
from sklearn.metrics import f1_score, average_precision_score, precision_score, recall_score, accuracy_score

# Make predictions on the test set
result = model.predict(eval_padded)

# For example, you might round probabilities to the nearest integer
y_pred_binary = [1 if p > 0.5 else 0 for p in result]

# Compute F1 score
F1_score = f1_score(y_eval, y_pred_binary)

# Calculate average precision
average_precision = average_precision_score(y_eval, y_pred_binary)

# Calculate precision
precision = precision_score(y_eval, y_pred_binary)

# Calculate recall
recall_score = recall_score(y_eval, y_pred_binary)

# Calculate accuracy
accuracy = accuracy_score(y_eval, y_pred_binary)

print('F1 score: {0:0.3f}'.format(F1_score))
print('Precision score: {0:0.3f}'.format(precision))
print('Recall score: {0:0.3f}'.format(recall_score))
# print('Average precision-recall score: {0:0.3f}'.format(average_precision))
print('Accuracy score: {0:0.3f}'.format(accuracy))



[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 314ms/step
F1 score: 0.834
Precision score: 0.950
Recall score: 0.743
Accuracy score: 0.765
