Hosein Mohebbi
hosein.mohebbi75@gmail.com

# Downloading and Installing 

In [0]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [0]:
!tar -zxvf aclImdb_v1.tar.gz

In [0]:
# !pip3 install bert-embedding

In [0]:
# !pip3 install mxnet-cu100

In [0]:
# List the working directory to check what the download/extract cells produced.
!ls

# Required Packages

In [0]:
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.tag import pos_tag
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
# import mxnet as mx
# from bert_embedding import BertEmbedding


# Loading DataSet

In [0]:
def loadDataset(data_dir, seed=None):
    """Load the train/test review partitions from a sentiment dataset folder.

    The directory is expected to contain ``<partition>/<sentiment>/`` folders
    for partition in ("train", "test") and sentiment in ("neg", "pos"), with
    one review per file.

    Args:
        data_dir: Root directory of the extracted dataset.
        seed: Optional integer. When given, rows are shuffled with a dedicated
            ``np.random.RandomState(seed)`` so the row order is reproducible.
            When None (the default), the global NumPy random state is used.

    Returns:
        A ``(train, test)`` tuple of DataFrames with columns ``'text'`` (raw
        review) and ``'sentiment'`` (1 for "pos", -1 for "neg"), each in
        shuffled row order.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    frames = {}
    for partition in ["train", "test"]:
        rows = []
        for sentiment in ["neg", "pos"]:
            label = 1 if sentiment == "pos" else -1

            path = os.path.join(data_dir, partition, sentiment)
            # os.listdir returns entries in arbitrary order; sorting makes the
            # pre-shuffle order (and thus a seeded shuffle) deterministic.
            for f_name in sorted(os.listdir(path)):
                # Explicit encoding: the platform default may not be UTF-8,
                # which breaks on reviews containing non-ASCII characters.
                with open(os.path.join(path, f_name), "r",
                          encoding="utf-8") as f:
                    rows.append([f.read(), label])

        # Files are read class by class, so shuffle to mix the labels.
        rng.shuffle(rows)
        frames[partition] = pd.DataFrame(rows, columns=['text', 'sentiment'])

    return frames["train"], frames["test"]

In [0]:
# Dataset root, relative to the notebook's working directory
# (presumably the folder produced by extracting aclImdb_v1.tar.gz -- confirm).
data_dir = "aclImdb/"
# Both frames have the columns 'text' and 'sentiment' and are already shuffled.
train_data, test_data = loadDataset(data_dir)

# Cleaning Dataset
Remove HTML tags, digits, stop words, and punctuation; lowercase the text; then lemmatize and stem each remaining token.

In [0]:
# POS Tagging
def NormalizeWithPOS(word_list):
    """Lemmatize each token according to its POS tag, then stem the lemma.

    Args:
        word_list: Sequence of token strings (e.g. from ``word_tokenize``).

    Yields:
        One normalized string per input token, in the input order.
    """
    # First letter of the tag from nltk.pos_tag -> WordNet POS code
    # (adjective, verb, noun, adverb). Other tags are not lemmatized.
    wordnet_pos = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    for word, tag in pos_tag(word_list):
        pos = wordnet_pos.get(tag[:1])
        lemma = lemmatizer.lemmatize(word, pos=pos) if pos else word
        # Stem the lemma, not the raw word; stemming the raw word would
        # throw away the lemmatization result computed above.
        yield stemmer.stem(lemma)

In [0]:
# Cache for the stop-word set; filled on the first cleanText call so the NLTK
# corpus is read once instead of once per document.
_STOP_WORDS = None


def _getStopWords():
    """Return the stop-word set used by cleanText, building it on first use."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        default_stop_words = set(stopwords.words('english'))
        # Keep these words in the text (they are removed from the stop list).
        default_stop_words.difference_update({'no', 'not', 'nor', 'too', 'any'})
        # Extra tokens to drop: contraction pieces emitted by word_tokenize
        # and a few auxiliaries.
        _STOP_WORDS = frozenset(default_stop_words.union(
            {"'m", "n't", "'d", "'re", "'s",
             'would', 'must', "'ve", "'ll",
             'may'}))
    return _STOP_WORDS


def cleanText(text):
    """Normalize one raw review into a space-separated string of stemmed tokens.

    Steps: strip HTML tags and digits, lowercase, expand negative
    contractions, drop stop words, remove apostrophes and punctuation, drop
    one-character tokens, then lemmatize and stem via ``NormalizeWithPOS``.

    Args:
        text: Raw review string.

    Returns:
        The cleaned text as a single string with tokens joined by one space.
    """
    # Strip HTML tags and runs of digits.
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r"[0-9]+", ' ', text)

    # Lowercase BEFORE expanding contractions so that capitalised forms
    # ("Won't", "DON'T") are expanded too; otherwise their "n't" piece is
    # later dropped as a stop word and the negation is lost.
    text = text.strip().lower()

    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"n\'t", " not", text)

    # Remove Stop words
    stop_words = _getStopWords()
    filtered_list = [w for w in word_tokenize(text) if w not in stop_words]
    text = ' '.join(filtered_list)

    # Remove other contractions
    text = re.sub(r"\'", '', text)

    # Replace punctuations with space
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans(dict.fromkeys(filters, " ")))

    # Drop one-character leftovers; split()/join also collapses any run of
    # whitespace into a single space.
    text = ' '.join([w for w in text.split() if len(w) > 1])

    # Lemmatization & Stemming
    text = ' '.join(NormalizeWithPOS(word_tokenize(text)))

    return text

In [88]:
# Debugging: show one raw training review followed by its cleaned version.
# Row 61 is an arbitrary pick; which review it is depends on the shuffle.
txt = train_data['text'].iloc[61]
a = cleanText(txt)
print(txt + "\n")
print(a)

While this outing certainly doesn't live up to its predecessor, it does have more than its share of memorable moments. My personal favorite, just after laying waste to a city block with his "Videodisc Cannon," we see a close up of Nimoy's face. As a single tear sheds from his left eye, we know at that point that Nimoy is more than just a killing machine. The viewer can't help but be pulled into his emotional turmoil and we understand that his previously flat affect was only a facade. Absolute brilliance!!! The sex scenes display a nice balance, carnal, but not pornographic. Afterwards, I felt I had a pretty good understanding of how to work the Magnavision Videodisc Player. Too bad they haven't produced them in over 25 years.

outing certainli not live predecessor share memor moment person favorit lay wast citi block videodisc cannon see close nimoy face singl tear shed left eye know point nimoy kill machin viewer not help pull emot turmoil understand previous flat affect facad absolut

In [0]:
    # Exploration: which contraction pieces / auxiliaries are NOT already in
    # NLTK's English stop-word list? Whatever is printed at the end has to be
    # added to the stop-word set by hand.
    default_stop_words = set(stopwords.words('english'))
    default_stop_words.difference_update({'no', 'not', 'nor', 'too', 'any'})

    # Candidate tokens. 'our' and 'ought' are separate entries: without the
    # comma between them Python concatenates the adjacent literals into
    # 'ourought', so 'ought' would never be checked.
    temp = {
        'were', 'he', 'our', 'ought', 'may', 'they', 'where', 'was', 'must',
        'would', 'why', 'has', 'we', 'when', 'should', 'does', 'is', 'it',
        'you', 'did', 'that', 'will', 'there', 'who', 'she', 'i',
        "'re", "n't", "'ve", "'ll", "'d", "'m", "'s",
    }

    # Keep only the candidates that the default list does not cover.
    temp.difference_update(default_stop_words)
    # Sorted so the printed result is the same on every run.
    print(sorted(temp))

# Vectorization

In [0]:
# Bag Of Words (BOW)
# cleanText is supplied as the preprocessor, so each review is cleaned by it
# before CountVectorizer tokenizes and counts terms. max_features=20000 limits
# the vocabulary size.
vectorizer = CountVectorizer(preprocessor=cleanText, max_features=20000)

# The vocabulary is learned from the training text only ...
training_features = vectorizer.fit_transform(train_data["text"])    
# ... and reused unchanged to encode the test text.
test_features = vectorizer.transform(test_data["text"])

In [91]:
training_features.shape

(25000, 50324)

In [0]:
# BERT Embedding

# Classifiers

# Model & Training & Evaluation

In [0]:
# SVM
# Hyper-parameter grid: 20 RBF candidates (5 gamma x 4 C) + 4 linear candidates.
param_grid = [
    {'kernel': ['rbf'],
     'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
     'C': [1, 10, 100, 1000]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
]
# 24 candidates x 5 folds = 120 independent SVC fits (plus the final refit).
# n_jobs=-1 runs them on all available cores; the selected parameters and
# predictions are unaffected, only the wall-clock time changes.
grid = GridSearchCV(SVC(), param_grid, refit=True, cv=5, n_jobs=-1)

# Training
grid.fit(training_features, train_data["sentiment"])

# Evaluation (uses the best estimator, refit on the full training set)
y_pred = grid.predict(test_features)

print(grid.best_params_)

In [0]:
# Naive Bayes classifier
# model = GaussianNB()
# model.fit(training_features.toarray(), train_data["sentiment"])
# y_pred = model.predict(test_features.toarray())

In [0]:
# Decision tree classifier
# model = DecisionTreeClassifier()
# model.fit(training_features, train_data["sentiment"])
# y_pred = model.predict(test_features)

# Result

In [0]:
# Compare the predictions in y_pred with the true test labels.
y_true = test_data["sentiment"]
acc = accuracy_score(y_true, y_pred)
cm = confusion_matrix(y_true, y_pred)

# Report: accuracy as a percentage, then the confusion matrix and the
# per-class precision / recall / F1 table.
print("Accuracy: {:.2f}".format(acc*100))
print(cm)
print(classification_report(y_true, y_pred))