# Required Packages

In [1]:
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix

[nltk_data] Downloading package punkt to /home/hosein/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Loading DataSet

In [2]:
def loadDataset(data_dir):
    
    data = {}
    for partition in ["train", "test"]:
        data[partition] = []
        for sentiment in ["neg", "pos"]:
            lable = 1 if sentiment == "pos" else -1

            path = os.path.join(data_dir, partition, sentiment)
            files = os.listdir(path)
            for f_name in files:
                with open(os.path.join(path, f_name), "r") as f:
                    review = f.read()
                    data[partition].append([review, lable])

    np.random.shuffle(data["train"])
    np.random.shuffle(data["test"])
    
    data["train"] = pd.DataFrame(data["train"],
                                 columns=['text', 'sentiment'])
    data["test"] = pd.DataFrame(data["test"],
                                columns=['text', 'sentiment'])

    return data["train"], data["test"]

In [3]:
data_dir = "../DataSet/aclImdb/"
train_data, test_data = loadDataset(data_dir)

# Cleaning Dataset
Removing HTML tags and punctuation as well as Lowering text.

In [4]:
def cleanText(text):
    
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text) 
    text = text.strip().lower()
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((i, " ") for i in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

# Vectorization

Bag Of Words (BOW)

In [9]:
# BOW
vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=cleanText)

training_features = vectorizer.fit_transform(train_data["text"])    
test_features = vectorizer.transform(test_data["text"])

# SVM - Linear Kernel

In [14]:
# Training 
model = SVC(kernel = 'linear', C = 1)
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)

In [29]:
# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)

In [30]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))

Accuracy: 83.304
[[10559  1941]
 [ 2233 10267]]
              precision    recall  f1-score   support

          -1       0.83      0.84      0.83     12500
           1       0.84      0.82      0.83     12500

    accuracy                           0.83     25000
   macro avg       0.83      0.83      0.83     25000
weighted avg       0.83      0.83      0.83     25000



# SVM - Gaussian Kernel

In [33]:
# Training
model = SVC(kernel='rbf', gamma='scale')
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)



In [34]:
# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)

In [35]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))

Accuracy: 63.664
[[ 4291  8209]
 [  875 11625]]
              precision    recall  f1-score   support

          -1       0.83      0.34      0.49     12500
           1       0.59      0.93      0.72     12500

    accuracy                           0.64     25000
   macro avg       0.71      0.64      0.60     25000
weighted avg       0.71      0.64      0.60     25000



# SVM - Sigmoid Kernel

In [None]:
# Training
model = SVC(kernel='sigmoid')
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)

In [None]:
# Evaluation
acc = accuracy_score(test_data["sentiment"], y_pred)

In [None]:
# Result
print("Accuracy: {:.2f}".format(acc*100))
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
print(classification_report(test_data["sentiment"],y_pred))