In [None]:
import os
import re
import string

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import BernoulliNB

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding, LSTM, SpatialDropout1D

# NOTE(review): this makes tf.function-wrapped code run eagerly (op by op) instead of as a
# traced graph. The TensorFlow docs describe it as a debugging aid; it presumably accounts
# for much of the slow per-step training time — confirm it is still needed before keeping it.
tf.config.run_functions_eagerly(True)

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding
from tensorflow.keras.models import Sequential
import keras

In [None]:
# Load data
# The CSV location can be overridden with the BOOKS_DATA_CSV environment variable so the
# notebook is not tied to a single machine; the original local path is kept as the default.
DATA_PATH = os.environ.get(
    "BOOKS_DATA_CSV",
    r"C:\Users\wu02x\Downloads\SC4021\new_pre_processed_data.csv",
)
books_data = pd.read_csv(DATA_PATH)

# Preview the first rows (columns used downstream: 'comment_text' and 'sentiment')
books_data.head()

Unnamed: 0,comment_text,sentiment
0,commenttext,2
1,ded get call cthulhu weird tale vintage classi...,3
2,love lovecraft agree call cthulhu not best wor...,3
3,call cthulhu never best work simply popular we...,2
4,shadow over innsmouth second popular work righ...,2


In [None]:
# Class distribution of the raw labels (number of rows per sentiment value)
books_data['sentiment'].value_counts()

sentiment
1    13906
2     9258
3     6448
0     4354
Name: count, dtype: int64

In [None]:
# Reduce the task to binary classification: keep only rows labelled 0 or 1
books_data = books_data[books_data['sentiment'].isin([0, 1])]

In [None]:
# Balance the two classes: class 1 has more rows than class 0, so draw a random subset
# of class-1 rows equal in size to the class-0 subset.
is_negative = books_data['sentiment'] == 0
is_positive = books_data['sentiment'] == 1

negative_class_samples = books_data[is_negative]
positive_class_samples = books_data[is_positive].sample(
    n=len(negative_class_samples),
    random_state=42,
)

# Stack both subsets (positives first, then negatives) and shuffle the rows reproducibly
books_data = pd.concat([positive_class_samples, negative_class_samples]).sample(
    frac=1,
    random_state=42,
)

# Confirm the classes are now balanced
books_data.sentiment.value_counts()

sentiment
1    4354
0    4354
Name: count, dtype: int64

In [None]:
# 80/20 train/test split, stratified on the label so both subsets keep the same class ratio
review_texts = books_data['comment_text']
review_labels = books_data['sentiment']

X_train, X_test, y_train, y_test = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    stratify=review_labels,
    random_state=42,
)

In [None]:
# Mean number of whitespace-separated tokens per review (used to choose a padding length)
total_tokens = sum(len(review.split()) for review in books_data['comment_text'])
print("Average length of each review : ", total_tokens / books_data.shape[0])

Average length of each review :  42.25861276986679


In [None]:
# Estimate the vocabulary size of the corpus.
# All reviews are glued into one space-separated string and split on whitespace;
# the number of distinct tokens is the vocabulary size.
all_text = ' '.join(books_data['comment_text'])
words = all_text.split()

# A set keeps exactly one copy of each token
vocab = set(words)
vocab_size = len(vocab)

print("Vocabulary size:", vocab_size)

Vocabulary size: 26685


In [None]:
# Hyperparameters of the text pipeline and model
vocab_size = 28000   # tokenizer keeps at most this many word ids (measured vocabulary above: 26685)
# Explicit placeholder for out-of-vocabulary words. The previous value was an empty string,
# which still reserved index 1 but left an unreadable '' key in word_index.
oov_tok = '<OOV>'
embedding_dim = 50   # size of each learned word vector
max_length = 80      # every review is padded/truncated to this many tokens (average is ~42)

# Tokenize sentences: the vocabulary is learned from the training texts only,
# so words that occur only in the test set are mapped to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

# Convert the train dataset to integer sequences and pad them to a fixed length
# (pad_sequences is called with its default padding/truncation settings)
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length)

# Convert the test dataset with the same fitted tokenizer and the same fixed length
test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=max_length)

In [None]:
# Define the number of epochs (upper bound; early stopping normally ends training earlier)
num_epochs = 100

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation, dropout and
# batch shuffling are repeatable between runs.
tf.keras.utils.set_random_seed(42)

# Hold out a stratified 10% of the TRAINING data for validation. The test set must not
# drive early stopping, otherwise the test metrics reported afterwards are optimistic.
fit_padded, val_padded, y_fit, y_val = train_test_split(
    train_padded,
    y_train,
    test_size=0.1,
    stratify=y_train,
    random_state=42,
)

# Define early stopping callback: stop after 3 epochs without val_loss improvement and
# roll the model back to the best epoch instead of keeping the last (overfitted) weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Define parameters
n_lstm = 128      # number of LSTM units
drop_lstm = 0.2   # dropout rate used both before and after the LSTM layer

# Define LSTM model: embedding -> spatial dropout -> LSTM -> dropout -> sigmoid output
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_shape=(max_length,)))
model.add(SpatialDropout1D(drop_lstm))
model.add(LSTM(n_lstm, return_sequences=False))
model.add(Dropout(drop_lstm))
model.add(Dense(1, activation='sigmoid'))   # single probability for class 1

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Fit model with early stopping, validating on the held-out slice of the training data
history = model.fit(fit_padded, y_fit,
                    epochs=num_epochs, verbose=1,
                    validation_data=(val_padded, y_val),
                    callbacks=[early_stopping]
                    )


Epoch 1/100




[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 793ms/step - accuracy: 0.5727 - loss: 0.6738 - val_accuracy: 0.7107 - val_loss: 0.5667
Epoch 2/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 798ms/step - accuracy: 0.8112 - loss: 0.4415 - val_accuracy: 0.7532 - val_loss: 0.5117
Epoch 3/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 782ms/step - accuracy: 0.9021 - loss: 0.2506 - val_accuracy: 0.7480 - val_loss: 0.6141
Epoch 4/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 786ms/step - accuracy: 0.9418 - loss: 0.1644 - val_accuracy: 0.7440 - val_loss: 0.7056
Epoch 5/100
[1m218/218[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 839ms/step - accuracy: 0.9644 - loss: 0.1044 - val_accuracy: 0.7480 - val_loss: 0.8838


In [None]:
from sklearn.metrics import f1_score, average_precision_score, precision_score, recall_score, accuracy_score

# Make predictions on the test set: one sigmoid output per review, i.e. the predicted
# probability of class 1. Flatten the (n, 1) model output to a 1-D vector.
result = model.predict(test_padded)
y_prob = np.ravel(result)

# Hard class labels: predict 1 when the probability exceeds 0.5, otherwise 0
y_pred_binary = (y_prob > 0.5).astype(int)

# Compute F1 score
F1_score = f1_score(y_test, y_pred_binary)

# Calculate average precision. This metric summarises the precision-recall curve, so it
# needs the continuous scores rather than the thresholded 0/1 labels.
average_precision = average_precision_score(y_test, y_prob)

# Calculate precision
precision = precision_score(y_test, y_pred_binary)

# Calculate recall (stored as `recall` so the imported recall_score function is not overwritten)
recall = recall_score(y_test, y_pred_binary)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)

print('F1 score: {0:0.3f}'.format(F1_score))
print('Precision score: {0:0.3f}'.format(precision))
print('Recall score: {0:0.3f}'.format(recall))
print('Average precision-recall score: {0:0.3f}'.format(average_precision))
print('Accuracy score: {0:0.3f}'.format(accuracy))

[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 260ms/step
F1 score: 0.748
Precision score: 0.747
Recall score: 0.750
Accuracy score: 0.748
