In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import re
import os
import string

import nltk

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Dropout
from tensorflow.keras.callbacks import EarlyStopping

In [5]:
# Load data
books_data = pd.read_csv(r"C:\Users\wu02x\Downloads\SC4021\pre_processed_data.csv")
books_data.head()

Unnamed: 0,comment_text,sentiment
0,started reading catch22 but year read consider...,1
1,way dune written might favorite canticle leibo...,1
2,talked school year ago told class creation hid...,2
3,here thought reading first twothis series defi...,1
4,liked exactly dislikedthats would probably pre...,2


In [6]:
books_data.sentiment.value_counts()

sentiment
1    3077
2    3077
0    3077
Name: count, dtype: int64

In [7]:
books_data = books_data[(books_data['sentiment'] == 0) | (books_data['sentiment'] == 1)]

In [8]:
# Separating the 80% data for training data and 20% for testing data and maintain equal ratio of classes in the train and test sample
X_train, X_test, y_train, y_test = train_test_split(books_data['comment_text'], books_data['sentiment'], test_size=0.2, stratify=books_data['sentiment'], random_state=42)

In [9]:
# vectorization
vect = CountVectorizer()
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [10]:
# frequency, inverse document frequency
tfidf = TfidfTransformer()
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)
X_train = X_train.toarray()
X_test = X_test.toarray()

In [13]:
model = Sequential()
model.add(Dense(units=1000, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(units=250, activation="relu"))
model.add(Dropout(0.3))
model.add(Dense(units=100, activation="relu"))
model.add(Dropout(0.3))
# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=3)

In [14]:
history = model.fit(
    x=X_train,
    y=y_train,
    epochs=100,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=early_stop,
)

Epoch 1/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 166ms/step - accuracy: 0.6348 - loss: 0.6286 - val_accuracy: 0.7571 - val_loss: 0.4963
Epoch 2/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 143ms/step - accuracy: 0.9193 - loss: 0.2187 - val_accuracy: 0.7425 - val_loss: 0.5817
Epoch 3/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 163ms/step - accuracy: 0.9808 - loss: 0.0649 - val_accuracy: 0.7344 - val_loss: 0.9781
Epoch 4/100
[1m154/154[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 164ms/step - accuracy: 0.9960 - loss: 0.0155 - val_accuracy: 0.7335 - val_loss: 1.3376
Epoch 4: early stopping


In [20]:
from sklearn.metrics import f1_score,average_precision_score, precision_score, recall_score, accuracy_score

# Make predictions on the test set
result = model.predict(X_test)

# For example, you might round probabilities to the nearest integer
y_pred_binary = [1 if p > 0.5 else 0 for p in result]

# Compute F1 score
F1_score = f1_score(y_test, y_pred_binary)

# Calculate average precision
average_precision = average_precision_score(y_test, y_pred_binary)

# Calculate precision
precision = precision_score(y_test, y_pred_binary)

# Calculate recall
recall_score = recall_score(y_test, y_pred_binary)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_binary)

print('F1 score: {0:0.3f}'.format(F1_score))
print('Precision score: {0:0.3f}'.format(precision))
print('Recall score: {0:0.3f}'.format(recall_score))
# print('Average precision-recall score: {0:0.3f}'.format(average_precision))
print('Accuracy score: {0:0.3f}'.format(accuracy))

[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step
F1 score: 0.750
Precision score: 0.706
Recall score: 0.798
Accuracy score: 0.734


## Evaluation

In [15]:
# Load evaluation dataset
eval_filepath = r"C:\Users\wu02x\Downloads\SC4021\evaluation_preprocessed_data.csv"
eval_data = pd.read_csv(eval_filepath)

In [16]:
# Convert the column from float to int
eval_data['manual_label'] = eval_data['manual_label'].astype(int)

In [17]:
X_eval = eval_data.comment_text
y_eval = eval_data.manual_label

In [18]:
X_eval = vect.transform(X_eval)
X_eval= tfidf.transform(X_eval)
X_eval = X_eval.toarray()

In [33]:
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from sklearn.metrics import classification_report


start_time = time.time()

# Make predictions on the evaluation set
result = model.predict(X_eval)

y_pred_binary = [1 if p > 0.5 else 0 for p in result]

end_time = time.time()
classification_time = end_time - start_time

print("Classification Time for 1000 records:", classification_time, "seconds")

print(classification_report(y_eval, y_pred_binary, digits=4))

[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step
Classification Time for 1000 records: 0.7544341087341309 seconds
              precision    recall  f1-score   support

           0     0.5209    0.8562    0.6478       160
           1     0.9553    0.7961    0.8685       618

    accuracy                         0.8085       778
   macro avg     0.7381    0.8262    0.7581       778
weighted avg     0.8660    0.8085    0.8231       778



In [34]:
from sklearn import metrics

F1_score = metrics.f1_score(y_eval, y_pred_binary)
average_precision = metrics.average_precision_score(y_eval, y_pred_binary)
precision_score= metrics.precision_score(y_eval, y_pred_binary)
recall_score = metrics.recall_score(y_eval, y_pred_binary)
accuracy = metrics.accuracy_score(y_eval, y_pred_binary)

print('F1 score: {0:0.3f}'.format(F1_score))
print('Precision score: {0:0.3f}'.format(precision_score))
print('Recall score: {0:0.3f}'.format(recall_score))
print('Average precision-recall score: {0:0.3f}'.format(average_precision))
print('Accuracy score: {0:0.3f}'.format(accuracy))

F1 score: 0.868
Precision score: 0.955
Recall score: 0.796
Average precision-recall score: 0.923
Accuracy score: 0.808
