# Text Classification
> # Sentiment Analysis of IMDB Movie Reviews



Pattern Recognition Course Project #1 @IUST
<br><br>
Hosein Mohebbi<br>
hosein_mohebbi@comp.iust.ac.ir


# Project Overview

Text classification is used in many interesting applications such as spam email detection, fake news detection, and sentiment analysis. The main aim of this project is to examine different base classifiers on the sentiment analysis task, treated as a text classification problem. Sentiment analysis aims to estimate the sentiment polarity of a body of text based solely on its content. To tackle this problem, every combination of four word-embedding approaches (BOW, BERT, TF-IDF, Word2Vec) and four base classifiers (Naive Bayes, SVM, Decision Tree, Random Forest) was tried, together with some text pre-processing techniques, on the IMDB movie reviews dataset, which contains 50K reviews, half of which are positive and the other half negative. This dataset was compiled by <a href="http://ai.stanford.edu/~amaas/">Andrew Maas</a> and can be found here: <a href="http://ai.stanford.edu/~amaas/data/sentiment/">Large Movie Review Dataset</a>

# Downloading & Installing Prerequisites

In [0]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [0]:
!tar -zxvf aclImdb_v1.tar.gz > /dev/null

In [0]:
!pip3 install bert-embedding

In [0]:
!pip3 install mxnet-cu100

In [0]:
!ls

# Required Packages

In [0]:
import ipywidgets as widgets
import os
import numpy as np
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.tokenize import sent_tokenize, word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet
from nltk.tag import pos_tag
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import itertools
import mxnet as mx
from bert_embedding import BertEmbedding


# Loading DataSet

In [0]:
def loadDataset(data_dir, seed=None):
    """Load the review files under ``data_dir`` into two shuffled DataFrames.

    Every file in ``<data_dir>/{train,test}/{neg,pos}`` is read as one
    review. Reviews from ``pos`` are labelled 1 and reviews from ``neg``
    are labelled -1. Each partition is shuffled so the classes are mixed.

    Args:
        data_dir: Root directory containing the ``train`` and ``test`` folders.
        seed: Optional integer. When given, shuffling uses a private
            ``np.random.RandomState(seed)`` so the row order is reproducible.
            When ``None`` (the default) the global NumPy random state is
            used, as before.

    Returns:
        A ``(train, test)`` tuple of DataFrames with the columns ``text``
        and ``sentiment``.
    """
    # The ``np.random`` module and a RandomState both expose ``shuffle``.
    rng = np.random if seed is None else np.random.RandomState(seed)

    frames = []
    for partition in ("train", "test"):
        rows = []
        for sentiment in ("neg", "pos"):
            label = 1 if sentiment == "pos" else -1
            path = os.path.join(data_dir, partition, sentiment)
            # os.listdir returns entries in arbitrary order; sort them so a
            # fixed seed really yields the same row order on every machine.
            for f_name in sorted(os.listdir(path)):
                # Explicit encoding: do not depend on the platform default.
                with open(os.path.join(path, f_name), "r",
                          encoding="utf-8") as f:
                    rows.append([f.read(), label])

        rng.shuffle(rows)
        frames.append(pd.DataFrame(rows, columns=['text', 'sentiment']))

    return frames[0], frames[1]

In [0]:
# Root folder of the reviews; loadDataset expects train/ and test/ inside it,
# each with neg/ and pos/ sub-folders.
data_dir = "aclImdb/"
# Shuffled DataFrames with the columns 'text' and 'sentiment' (1 = pos, -1 = neg).
train_data, test_data = loadDataset(data_dir)

# Cleaning Dataset
Removing HTML tags and punctuation, and lowercasing the text.

In [0]:
# POS Tagging
def NormalizeWithPOS(word_list):
    """Yield each token of ``word_list`` lemmatized by its POS tag, then stemmed.

    Tokens tagged as adjective (J*), verb (V*), noun (N*) or adverb (R*) are
    lemmatized with the matching WordNet part of speech. Any other token
    skips lemmatization. Every token is finally passed through the Porter
    stemmer.
    """
    # First letter of the tagger's tag -> WordNet POS code.
    wordnet_pos_by_prefix = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}

    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    for token, tag in pos_tag(word_list):
        wordnet_pos = wordnet_pos_by_prefix.get(tag[:1])
        if wordnet_pos is None:
            base = token
        else:
            base = lemmatizer.lemmatize(token, pos=wordnet_pos)
        yield stemmer.stem(base)

In [0]:
def cleanText(text):
    """Normalize a raw review for the embedding named by the global ``embedding``.

    Steps, in order: strip HTML tags, digits and hyphens; lowercase; expand
    "won't" / "can't" / "n't"; for ``'BOW'`` remove stop words; drop
    apostrophes and punctuation; drop one-character tokens; for ``'BOW'``
    lemmatize and stem.

    The module-level variable ``embedding`` must be set before calling:
      * ``'BOW'``  - stop-word removal plus lemmatization/stemming are applied.
      * ``'BERT'`` - the characters ``! ? . , / ( ) : ;`` are kept so that
        sentence boundaries survive.
      * any other value - all punctuation is removed, with no stop-word
        removal or stemming.

    Args:
        text: The raw review string.

    Returns:
        The cleaned, lowercased, single-space-separated string.
    """
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r"[0-9]+", ' ', text) # TODO: save 0-10 for IMDB rating
    text = re.sub(r"-", ' ', text)

    # Lowercase BEFORE expanding contractions; otherwise "Won't" / "Can't"
    # miss the specific rules and end up as "wo not" / "ca not".
    text = text.strip().lower()

    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"n't", " not", text)

    # Compare strings with ==, not `is`: identity of equal strings is an
    # interpreter detail and must not drive the branch selection.
    if embedding == 'BOW':
        # Remove stop words, but keep the negations and intensifiers that
        # carry sentiment.
        default_stop_words = set(stopwords.words('english'))
        default_stop_words.difference_update({'no', 'not', 'nor', 'too', 'any'})
        stop_words = default_stop_words.union({"'m", "n't", "'d", "'re", "'s",
                                               'would','must',"'ve","'ll",'may'})

        word_list = word_tokenize(text)
        text = ' '.join(w for w in word_list if w not in stop_words)

    # Remove other contractions
    text = re.sub(r"'", ' ', text)

    # Replace punctuations with space
    if embedding == 'BERT': # save ! ? . for end of sentences [,/():;]
        filters = '"\'#$%&*+-<=>@[\\]^_`{|}~\t\n'
    else:
        filters = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans(dict.fromkeys(filters, " ")))

    # Drop one-character tokens; split()/join also collapses whitespace runs
    # to single spaces, so no further space normalization is needed.
    text = ' '.join(w for w in text.split() if len(w) > 1)

    # Lemmatization & Stemming
    if embedding == 'BOW':
        text = ' '.join(NormalizeWithPOS(word_tokenize(text)))

    return text

In [0]:
# Debugging: show one raw review next to its cleaned version.
# NOTE(review): cleanText reads the global `embedding`, which is only assigned
# in later cells. Set it (e.g. embedding = 'BERT') before running this cell,
# otherwise the call raises NameError.
txt = train_data.iloc[10]['text']
a = cleanText(txt)
print(txt, end="\n\n")
print(a)

# Vectorization

In [0]:
def mean(z):
    """Return the arithmetic mean of the sized collection ``z``."""
    total = sum(z)
    count = len(z)
    return total / count

In [0]:
def embeddToBERT(text):
    """Embed a cleaned review as the element-wise mean of its BERT token vectors.

    The text is split into sentences on ``!``, ``?`` and ``.``. The
    non-empty pieces are passed to the global ``bert`` model, and all token
    vectors of all sentences are averaged dimension by dimension with the
    global ``mean`` helper.

    Args:
        text: A review already cleaned with the BERT settings, i.e. with the
            sentence-ending punctuation still present.

    Returns:
        A list with one averaged value per embedding dimension. The list is
        empty when the model returns no token vectors, for example for an
        empty string.
    """
    # Raw-string character class: the earlier non-raw '\?' and '\.' were
    # invalid string escapes that newer Python versions warn about.
    sentences = [s for s in re.split(r'[!?.]', text) if s]
    # NOTE(review): 'avg' is presumably the out-of-vocabulary handling mode of
    # bert_embedding - confirm against the library documentation.
    result = bert(sentences, 'avg')

    # Element 1 of each result entry holds that sentence's token vectors;
    # flatten them into one list covering the whole review.
    token_vectors = [vector for entry in result for vector in entry[1]]

    # zip(*...) regroups the vectors by dimension so each is averaged alone.
    return [mean(dimension) for dimension in zip(*token_vectors)]

In [0]:
# Use the first GPU when one is visible. Otherwise fall back to the CPU so the
# notebook still runs (much more slowly) on machines without CUDA.
ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()
bert = BertEmbedding(ctx=ctx)

In [0]:
# Cleaning before BERT
# cleanText reads this global to pick its BERT branch, which keeps ! ? . in
# the text so embeddToBERT can split it into sentences.
embedding = 'BERT'

# Store the cleaned reviews in a new 'clean_text' column of each DataFrame.
train_data['clean_text'] = train_data['text'].apply(cleanText)
test_data['clean_text'] = test_data['text'].apply(cleanText)


In [0]:
# BERT Embedding
# Each result is a pandas Series holding one feature list per review, as
# returned by embeddToBERT; a later cell converts them to 2-D arrays.
training_features = train_data['clean_text'].apply(embeddToBERT)
test_features = test_data['clean_text'].apply(embeddToBERT)

In [0]:
# Stack the per-review feature lists into 2-D arrays (one row per review).
# No .transpose() here: it is a no-op on a Series, but it would flip the
# matrix if this cell were re-run on the already converted ndarray. With
# list(...) the cell can safely be executed more than once.
feature = list(training_features)
training_features = np.asarray(feature)

# `feature` is left holding the test rows because later scratch cells read it.
feature = list(test_features)
test_features = np.asarray(feature)

In [0]:
# Debugging: step-by-step copy of embeddToBERT for a single review, so the
# intermediate values (`result`, `sentence`, `feature`) can be inspected in
# the cells below.
text = train_data.iloc[10]['clean_text']

# Raw string: the earlier non-raw '\?' and '\.' were invalid string escapes.
sentences = re.split(r'!|\?|\.', text)
sentences = list(filter(None, sentences))
result = bert(sentences, 'avg')

bert_vocabs_of_sentence = []
# Index loops are kept on purpose: later cells read the leaked `sentence`
# index (e.g. result[sentence][1]).
for sentence in range(len(result)):
    for word in range(len(result[sentence][1])):
        bert_vocabs_of_sentence.append(result[sentence][1][word])

# Average all token vectors dimension by dimension.
feature = [mean(x) for x in zip(*bert_vocabs_of_sentence)]

In [0]:
# Scratch: run the BERT model on three short sentences to inspect the shape
# of its output. Note that this rebinds `txt` from the earlier debug cell.
txt = ["these were sent by me","they were happy", "he left"]
r = bert(txt, 'avg')

In [0]:
# Element 1 of the first sentence's result entry.
# Presumably the list of its token vectors - confirm with bert_embedding docs.
r1 = r[0][1]

In [0]:
# Item at index 1 inside element 1 of the second sentence's result entry.
# Presumably the vector of its second token - confirm with bert_embedding docs.
r2 = r[1][1][1]

In [0]:
len(r1[0])

768

In [0]:
# Toy check of the column-wise averaging used to build the BERT features:
# three 4-dimensional "vectors" are averaged position by position.
l1 = [1.2, 2, 3, 4]
l2 = [10, 20, 30, 40]
l3 = [-2, -1, 0, 1]

l = [l1, l2, l3]

feature = [mean(column) for column in zip(*l)]
feature

[3.0666666666666664, 7.0, 11.0, 15.0]

In [0]:
#result[sentence][1][word]
result[0]


In [0]:
np.asarray(feature).shape

(25000, 768)

In [0]:
len(result[sentence][1])

3

In [0]:
training_features.shape

(25000, 768)

In [0]:
train_data.head()

In [0]:
# Bag Of Words (BOW)
# cleanText reads this global; 'BOW' switches on its stop-word removal and
# lemmatization/stemming branch.
embedding = 'BOW'
# Optional: cap the vocabulary size by adding max_features=30000.
# , max_features=30000
# cleanText is plugged in as the preprocessor, so the raw 'text' column is
# passed below and cleaned inside the vectorizer.
vectorizer = CountVectorizer(preprocessor=cleanText)

# NOTE(review): the results are stored in *_bow variables, but the classifier
# cells below read `training_features` / `test_features`. Rebind those names
# to the BOW matrices before training on BOW features.
training_features_bow = vectorizer.fit_transform(train_data["text"])    
test_features_bow = vectorizer.transform(test_data["text"])

In [0]:
training_features.shape

(25000, 768)

In [0]:
type(training_features)

scipy.sparse.csr.csr_matrix

In [0]:
# BERT Embedding

# Classifiers

# Model & Training & Evaluation

In [0]:
# SVM 
# Grid of 20 RBF settings (5 gamma values x 4 C values) plus 4 linear
# settings. With cv=5 this means 120 fits, followed by one refit of the best
# candidate on the whole training set (refit=True).
param_grid = [{'kernel': ['rbf'], 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]}]
grid = GridSearchCV(SVC(), param_grid, refit = True, cv=5)

# Training 
grid.fit(training_features, train_data["sentiment"])

# Evaluation
# predict() uses the refitted best estimator; y_pred is read by the Result cell.
y_pred = grid.predict(test_features)

# Hyper-parameters of the best candidate found by the search.
print(grid.best_params_) 

In [0]:
# Naive Bayes classifier
# For the BOW embedding the features are a sparse matrix: convert them with
# .toarray() before fitting/predicting here.
# use toarray() for BOW embedding
model = GaussianNB()
model.fit(training_features, train_data["sentiment"])
# y_pred is read by the Result cell; it overwrites any earlier prediction.
y_pred = model.predict(test_features)

In [0]:
# Decision tree classifier
# Uses the DecisionTreeClassifier defaults (no depth limit or other tuning).
# NOTE(review): the recorded output of this cell is a ValueError, so y_pred
# may still hold the previous model's predictions - check before reporting.
model = DecisionTreeClassifier()
model.fit(training_features, train_data["sentiment"])
y_pred = model.predict(test_features)

ValueError: ignored

# Result

In [0]:
# Score the most recently computed y_pred (whichever classifier cell ran
# last) against the true test labels.
acc = accuracy_score(test_data["sentiment"], y_pred)
# Result
# Accuracy is printed as a percentage with two decimals.
print("Accuracy: {:.2f}".format(acc*100))
# Confusion matrix: rows are true labels, columns are predictions (order -1, 1).
cm = confusion_matrix(test_data["sentiment"],y_pred)
print(cm)
# Per-class precision, recall and F1.
print(classification_report(test_data["sentiment"],y_pred))

Accuracy: 75.64
[[10032  2468]
 [ 3623  8877]]
              precision    recall  f1-score   support

          -1       0.73      0.80      0.77     12500
           1       0.78      0.71      0.74     12500

    accuracy                           0.76     25000
   macro avg       0.76      0.76      0.76     25000
weighted avg       0.76      0.76      0.76     25000

