# Downloading and Installing 

In [0]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [0]:
!tar -zxvf aclImdb_v1.tar.gz

# Required Packages

In [0]:
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier


# Loading DataSet

In [0]:
def loadDataset(data_dir):
    
    data = {}
    for partition in ["train", "test"]:
        data[partition] = []
        for sentiment in ["neg", "pos"]:
            lable = 1 if sentiment == "pos" else -1

            path = os.path.join(data_dir, partition, sentiment)
            files = os.listdir(path)
            for f_name in files:
                with open(os.path.join(path, f_name), "r") as f:
                    review = f.read()
                    data[partition].append([review, lable])

    np.random.shuffle(data["train"])
    np.random.shuffle(data["test"])
    
    data["train"] = pd.DataFrame(data["train"],
                                 columns=['text', 'sentiment'])
    data["test"] = pd.DataFrame(data["test"],
                                columns=['text', 'sentiment'])

    return data["train"], data["test"]

In [0]:
data_dir = "aclImdb/"
train_data, test_data = loadDataset(data_dir)

# Cleaning Dataset
Removing HTML tags and punctuation as well as Lowering text.

In [0]:
def cleanText(text):
    
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text) 
    text = text.strip().lower()
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((i, " ") for i in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

# Vectorization

In [0]:
# Bag Of Words (BOW)
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=cleanText)

training_features = vectorizer.fit_transform(train_data["text"])    
test_features = vectorizer.transform(test_data["text"])

In [0]:
# BERT Embedding

# Classifiers

# Model

In [0]:
# SVM - Linear Kernel
#model = SVC(kernel = 'linear', C = 1)

In [0]:
# SVM - Gaussian Kernel
#model = SVC(kernel='rbf', gamma='scale')

In [0]:
# SVM - Sigmoid Kernel
#model = SVC(kernel='sigmoid', gamma='scale')

In [0]:
# Naive Bayes classifier
model = GaussianNB()

In [0]:
# Decision tree classifier
#model = DecisionTreeClassifier()

In [19]:
a = training_features[0].toarray()

(1, 79999)

# Training & Evaluation

In [0]:
# Training 
model.fit(training_features.toarray(), train_data["sentiment"])
y_pred = model.predict(test_features.toarray())

In [0]:
# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)

# Result

In [0]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))