## Import libraries

In [None]:
# Mount Google Drive inside the Colab runtime so the dataset under
# /content/drive can be read by the cells below (Colab-only dependency).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# import required libraries
import os
import re
import string

import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

## Load dataset

In [None]:
# Read the pre-processed comments from the mounted Drive folder and preview
# the first rows to confirm the expected columns are present.
csv_path = '/content/drive/MyDrive/SC4021/pre_processed_data.csv'
books_data = pd.read_csv(csv_path)
books_data.head()

Unnamed: 0,comment_text,sentiment
0,started reading catch22 but year read consider...,1
1,way dune written might favorite canticle leibo...,1
2,talked school year ago told class creation hid...,2
3,here thought reading first twothis series defi...,1
4,liked exactly dislikedthats would probably pre...,2


In [None]:
# Feature text (X) and sentiment label (y) as separate Series.
X, y = books_data['comment_text'], books_data['sentiment']

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the sentiment label
# keeps the class proportions the same in the train and test samples, and
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    books_data['comment_text'],
    books_data['sentiment'],
    test_size=0.2,
    stratify=books_data['sentiment'],
    random_state=42,
)

## TF-IDF Vectorizer

In [None]:
# Learn a unigram + bigram TF-IDF vocabulary from the training text only,
# capped at 500,000 features, then report how many features were kept.
vectoriser = TfidfVectorizer(ngram_range=(1, 2), max_features=500000).fit(X_train)
print('No. of feature_words: ', len(vectoriser.get_feature_names_out()))

No. of feature_words:  220543


In [None]:
# Project both splits onto the vocabulary fitted on X_train (train first,
# then test); the test text is only transformed, never fitted on.
X_train_vectorized, X_test_vectorized = map(vectoriser.transform, (X_train, X_test))

## Models

#### 1. Multinomial Naive Bayes Classifier

In [None]:
# Text-classification pipeline: raw comment text -> token counts ('vect')
# -> TF-IDF weighting ('tfidf') -> Multinomial Naive Bayes ('clf').
# Pipeline, CountVectorizer, TfidfTransformer and MultinomialNB come from the
# import cell at the top of the notebook.
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

# Hyper-parameter grid for the pipeline above. Keys follow scikit-learn's
# '<step name>__<parameter>' convention; 3 * 2 * 2 * 3 = 36 combinations.
tuned_parameters = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],  # unigrams, uni+bigrams, bigrams only
    'tfidf__use_idf': (True, False),                # with / without IDF re-weighting
    'tfidf__norm': ('l1', 'l2'),                    # row normalisation
    'clf__alpha': [1, 1e-1, 1e-2]                   # Naive Bayes smoothing strength
}

In [None]:
# Exhaustive grid search over tuned_parameters with 10-fold cross-validation
# on the training split only. n_jobs=-1 spreads the candidate fits across all
# available cores; it changes wall-clock time, not the selected model.
clf = GridSearchCV(text_clf, tuned_parameters, cv=10, n_jobs=-1)
clf.fit(X_train, y_train)

# With the default refit=True, clf.predict uses the best configuration
# refitted on all of X_train; evaluate it once on the held-out test split.
print(classification_report(y_test, clf.predict(X_test), digits=4))

              precision    recall  f1-score   support

           0     0.6021    0.6520    0.6261       615
           1     0.6027    0.6623    0.6311       616
           2     0.5198    0.4253    0.4679       616

    accuracy                         0.5799      1847
   macro avg     0.5749    0.5799    0.5750      1847
weighted avg     0.5749    0.5799    0.5750      1847

