## Import libraries

In [None]:
# Make Google Drive reachable from the Colab runtime
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# import required libraries
import os
import re
import string
import time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

## Load dataset

In [None]:
# Read the pre-processed comments CSV from Drive and preview the first rows
books_csv_path = '/content/drive/MyDrive/SC4021/new_pre_processed_data.csv'
books_data = pd.read_csv(books_csv_path)
books_data.head(n=5)

Unnamed: 0,comment_text,sentiment
0,commenttext,2
1,ded get call cthulhu weird tale vintage classi...,3
2,love lovecraft agree call cthulhu not best wor...,3
3,call cthulhu never best work simply popular we...,2
4,shadow over innsmouth second popular work righ...,2


In [None]:
# Number of comments per sentiment label
books_data['sentiment'].value_counts()

sentiment
1    13906
2     9258
3     6448
0     4354
Name: count, dtype: int64

In [None]:
# Keep only the rows whose sentiment label is 0 or 1; every other label is dropped
mask = books_data['sentiment'].isin([0, 1])
books_data = books_data.loc[mask]

In [None]:
# Input text and target labels taken from the filtered frame
X, y = books_data['comment_text'], books_data['sentiment']

In [None]:
# Stratified 80/20 train/test split: stratifying on the label keeps the class ratio
# the same in both samples, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    books_data['comment_text'],
    books_data['sentiment'],
    test_size=0.2,
    stratify=books_data['sentiment'],
    random_state=42,
)

## TF-IDF Vectorizer

In [None]:
# Learn a unigram + bigram TF-IDF vocabulary (at most 500,000 terms) from the training text only
vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features=500000).fit(X_train)
vocabulary_size = len(vectoriser.get_feature_names_out())
print('No. of feature_words: ', vocabulary_size)

No. of feature_words:  384461


In [None]:
# Project both splits onto the vocabulary fitted on the training text
X_train_vectorized, X_test_vectorized = (
    vectoriser.transform(split) for split in (X_train, X_test)
)

## Models

#### 1. Bernoulli Naive Bayes Classifier

In [None]:
# Bag-of-words -> Bernoulli Naive Bayes pipeline.
#
# BernoulliNB models term presence/absence: with its default ``binarize=0.0`` every
# non-zero feature value is mapped to 1 before fitting. Raw counts and any TF-IDF
# re-weighting of them (all strictly positive for terms that occur) therefore produce
# exactly the same binary matrix, so a TF-IDF step - and a grid over its ``use_idf`` /
# ``norm`` options - cannot change the model. It only multiplied the search cost by
# four, so the counts are fed straight into the classifier.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('clf', BernoulliNB())])

# Hyper-parameter grid for the search: 3 n-gram ranges x 3 smoothing strengths = 9 candidates.
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'clf__alpha': [1, 1e-1, 1e-2]
}

In [None]:
# Exhaustive 10-fold cross-validated search over `tuned_parameters`. With the default
# refit=True, GridSearchCV retrains the best candidate on the whole training split, so
# `clf` can be used directly for prediction afterwards.
# n_jobs=-1 runs the independent fits on all available CPU cores: same result, less wall time.
# NOTE(review): the default scoring here is accuracy, which favours the majority class on
# imbalanced labels - consider scoring='f1_macro' if minority-class recall matters.
start_time = time.perf_counter()  # monotonic clock, not affected by system clock changes
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, n_jobs=-1)
clf.fit(X_train, y_train)
end_time = time.perf_counter()
training_time = end_time - start_time
print("Training Time:", training_time, "seconds")

# Per-class precision / recall / F1 on the held-out test split
print(classification_report(y_test, clf.predict(X_test), digits=4))

Training Time: 980.3628849983215 seconds
              precision    recall  f1-score   support

           0     0.7653    0.2434    0.3693       871
           1     0.8047    0.9766    0.8824      2781

    accuracy                         0.8018      3652
   macro avg     0.7850    0.6100    0.6259      3652
weighted avg     0.7953    0.8018    0.7600      3652



In [None]:
# Quick interactive check of the container type that train_test_split returned for the test text
type(X_test)

## Evaluation


In [None]:
# Load the pre-processed evaluation dataset from Drive; it is scored with the trained
# classifier further down in this section.
eval_filepath = r"/content/drive/MyDrive/SC4021/evaluation_preprocessed_data.csv"
eval_data = pd.read_csv(eval_filepath)

In [None]:
# Preview the first five evaluation rows
eval_data.head(n=5)

Unnamed: 0,comment_text,manual_label
0,like dracula need read dracula tape b dracula ...,1
1,dracula thread 2022 no one mentioned fantastic...,1
2,dracula first published 26th may last thursday...,0
3,love dracula im not remotely interested vampir...,1
4,vampire general foolish not sure dracula moder...,0


In [None]:
# Cast the manual labels to integers so they are compared as ints with the model's predictions
eval_data = eval_data.astype({'manual_label': int})

In [None]:
# Evaluation text and its manually assigned labels
X_eval, y_eval = eval_data['comment_text'], eval_data['manual_label']

In [None]:
# Time inference of the fitted grid-search model on the evaluation set.
# NOTE(review): the evaluation comments look like they come from the same labelled exports
# as the training data - confirm they were excluded from training, otherwise these scores
# are optimistic.
start_time = time.perf_counter()  # monotonic clock, suited to measuring short durations

predictions = clf.predict(X_eval)

end_time = time.perf_counter()
classification_time = end_time - start_time

# Report the number of rows actually scored rather than a hard-coded count.
print(f"Classification Time for {len(X_eval)} records:", classification_time, "seconds")

print(classification_report(y_eval, predictions, digits=4))

Classification Time for 1000 records: 0.2512030601501465 seconds
              precision    recall  f1-score   support

           0     0.8099    0.6125    0.6975       160
           1     0.9056    0.9628    0.9333       618

    accuracy                         0.8907       778
   macro avg     0.8578    0.7876    0.8154       778
weighted avg     0.8859    0.8907    0.8848       778



In [None]:
# Summary metrics on the evaluation set. For binary targets scikit-learn treats label 1
# as the positive class by default.
F1_score = metrics.f1_score(y_eval, predictions)
precision_score = metrics.precision_score(y_eval, predictions)
recall_score = metrics.recall_score(y_eval, predictions)
accuracy = metrics.accuracy_score(y_eval, predictions)

# Average precision summarises the whole precision-recall curve, so it needs a continuous
# ranking score per sample; hard 0/1 predictions collapse the curve to a single operating
# point. predict_proba columns follow clf.classes_ (sorted labels), so column 1 is the
# probability of label 1.
positive_scores = clf.predict_proba(X_eval)[:, 1]
average_precision = metrics.average_precision_score(y_eval, positive_scores)

print(f'F1 score: {F1_score:0.3f}')
print(f'Precision score: {precision_score:0.3f}')
print(f'Recall score: {recall_score:0.3f}')
print(f'Average precision-recall score: {average_precision:0.3f}')
print(f'Accuracy score: {accuracy:0.3f}')

F1 score: 0.933
Precision score: 0.906
Recall score: 0.963
Average precision-recall score: 0.901
Accuracy score: 0.891


In [None]:
# Function to read Excel files from a directory and concatenate dataframes
def concat_excel_data(directory):
    """Read every ``.xlsx`` workbook one folder level below ``directory`` into one DataFrame.

    Only the immediate sub-folders of ``directory`` are scanned; files that sit directly
    in ``directory`` and anything nested deeper are ignored. Folders and files are visited
    in sorted name order so the row order of the result is reproducible (``os.listdir``
    returns entries in arbitrary order). Each workbook is read with the ``pd.read_excel``
    defaults.

    Parameters
    ----------
    directory : str or os.PathLike
        Root folder whose sub-folders contain the Excel files.

    Returns
    -------
    pandas.DataFrame
        The rows of all workbooks stacked on top of each other with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no ``.xlsx`` file is found in any sub-folder.
    """
    all_dataframes = []

    for folder_name in sorted(os.listdir(directory)):
        folder_path = os.path.join(directory, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            # "~$name.xlsx" files are lock files Excel creates while a workbook is open;
            # they are not readable workbooks and would make read_excel fail.
            if file_name.startswith('~$') or not file_name.endswith('.xlsx'):
                continue
            all_dataframes.append(pd.read_excel(os.path.join(folder_path, file_name)))

    if not all_dataframes:
        # pd.concat would also raise ValueError here, but without saying which folder was empty.
        raise ValueError(f"No .xlsx files found in the sub-folders of {directory!r}")

    return pd.concat(all_dataframes, ignore_index=True)

# Root folder on Drive whose sub-folders hold the labelled Excel files
labelled_data_directory = '/content/drive/MyDrive/SC4021/labelled_data'

# Stack all labelled workbooks into a single frame
pre_eval_df = concat_excel_data(directory=labelled_data_directory)

# Preview the first five rows of the combined frame
pre_eval_df.head(n=5)

Unnamed: 0,post_id,comment_text,author,created_utc,sentiment,manual_label,Unnamed: 6
0,post_id,comment_text,author,created_utc,2,,
1,928sye,I decided to get *The Call of Cthulhu and Othe...,1945BestYear,2018-07-27 12:32:19,3,,
2,dwqt6e,"I love Lovecraft. I agree with you, the call o...",Fitz_Fool,2019-11-15 17:42:49,3,,
3,dwqt6e,"The call of Cthulhu was never his best work, s...",Sir__Alucard,2019-11-15 22:04:14,2,,
4,dwqt6e,The shadow over innsmouth is his second most p...,Sir__Alucard,2019-11-15 22:06:29,2,,


In [None]:
# Columns that must be populated for a record to be kept
columns_to_check = ['manual_label']

# Keep only the records that have a value in every checked column (here: a manual label)
cleaned_df = pre_eval_df.dropna(subset=columns_to_check)

# Show the resulting shape so the cell's displayed output is reproduced on re-run
cleaned_df.shape

(1353, 7)

In [None]:
# Preview the first five rows that carry a manual label
cleaned_df.head(n=5)

Unnamed: 0,post_id,comment_text,author,created_utc,sentiment,manual_label,Unnamed: 6
1302,v2g0b6,If you like Dracula you need to read The Dracu...,stumpdawg,2022-06-01 12:38:11,1,1.0,Same
1303,v2g0b6,A Dracula thread in 2022 and no one has mentio...,Pelirrojita,2022-06-01 14:12:01,1,1.0,Same
1304,v2g0b6,*Dracula* was first published 26th May. Last T...,WufflyTime,2022-06-01 14:50:08,3,0.0,Not same
1305,v2g0b6,"I love *Dracula,* and I'm not remotely interes...",Webbie-Vanderquack,2022-06-01 14:42:49,1,1.0,Same
1306,v2g0b6,"Vampires in general are foolish, not sure why ...",nyet-marionetka,2022-06-01 13:49:12,2,0.0,Not same


In [None]:
# Keep just the comment text and its manual label
clean_df = cleaned_df.loc[:, ['comment_text', 'manual_label']]

In [None]:
# Retain only the rows whose manual label is 0 or 1; rows with any other label are discarded
mask = clean_df['manual_label'].eq(1) | clean_df['manual_label'].eq(0)
clean_df = clean_df.loc[mask]
clean_df.shape

(778, 2)

In [None]:
# Save the filtered evaluation rows to Drive, leaving the DataFrame index out of the file
evaluation_csv_path = '/content/drive/MyDrive/SC4021/evaluation_data.csv'
clean_df.to_csv(evaluation_csv_path, index=False)