# Required Packages

In [0]:
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier


# Loading DataSet

In [2]:
def loadDataset(data_dir):
    
    data = {}
    for partition in ["train", "test"]:
        data[partition] = []
        for sentiment in ["neg", "pos"]:
            lable = 1 if sentiment == "pos" else -1

            path = os.path.join(data_dir, partition, sentiment)
            files = os.listdir(path)
            for f_name in files:
                with open(os.path.join(path, f_name), "r") as f:
                    review = f.read()
                    data[partition].append([review, lable])

    np.random.shuffle(data["train"])
    np.random.shuffle(data["test"])
    
    data["train"] = pd.DataFrame(data["train"],
                                 columns=['text', 'sentiment'])
    data["test"] = pd.DataFrame(data["test"],
                                columns=['text', 'sentiment'])

    return data["train"], data["test"]

In [3]:
data_dir = "../DataSet/aclImdb/"
train_data, test_data = loadDataset(data_dir)

# Cleaning Dataset
Removing HTML tags and punctuation as well as Lowering text.

In [4]:
def cleanText(text):
    
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text) 
    text = text.strip().lower()
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((i, " ") for i in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

# Vectorization

# Bag Of Words (BOW)

In [5]:
# BOW
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=cleanText)

training_features = vectorizer.fit_transform(train_data["text"])    
test_features = vectorizer.transform(test_data["text"])

# BERT Embedding

# Support vector machine (SVM) classifier

# SVM - Linear Kernel

In [6]:
# Training 
model = SVC(kernel = 'linear', C = 1)
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)

In [7]:
# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)

In [8]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))

Accuracy: 83.31
[[10560  1940]
 [ 2233 10267]]
              precision    recall  f1-score   support

          -1       0.83      0.84      0.84     12500
           1       0.84      0.82      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



# SVM - Gaussian Kernel

In [9]:
# Training
model = SVC(kernel='rbf', gamma='scale')
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)

In [10]:
# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)

In [11]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))

Accuracy: 86.69
[[10600  1900]
 [ 1427 11073]]
              precision    recall  f1-score   support

          -1       0.88      0.85      0.86     12500
           1       0.85      0.89      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



# SVM - Sigmoid Kernel

In [12]:
# Training
model = SVC(kernel='sigmoid')
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)



In [13]:
# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)

In [14]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))

Accuracy: 56.88
[[ 2123 10377]
 [  402 12098]]
              precision    recall  f1-score   support

          -1       0.84      0.17      0.28     12500
           1       0.54      0.97      0.69     12500

    accuracy                           0.57     25000
   macro avg       0.69      0.57      0.49     25000
weighted avg       0.69      0.57      0.49     25000



# Naive Bayes classifier

In [0]:
# Training
model = GaussianNB()
model.fit(training_features, train_data["sentiment"]) 

In [0]:
# Evaluation
y_pred = model.predict(test_features) 
acc = accuracy_score(test_data["sentiment"], y_pred) 

In [0]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))

# Decision tree classifier

In [0]:
# Training
model = DecisionTreeClassifier()
model.fit(training_features, train_data["sentiment"])

In [0]:
# Evaluation
y_pred = model.predict(test_features)
acc = accuracy_score(test_data["sentiment"], y_pred)

In [0]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))