# STAT542 Project 4 - Sentiment Analysis for IMDB Movies 

This notebook is to perform sentiment analysis on IMDB moives using natural language processing techniques and ML classification models with sklearn.

## Import packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier as XgbC
from nltk.stem import PorterStemmer
import time

## Data cleaning

In [2]:
# for data processing part, in order to increase the speed and have better scalabity, we define a funciton here
def Review2Words(original_review):
    ''' This function take an original movie review and do some cleaning work to turn them into space delimited string
        for later building of bag of words and machine learning models
    '''
    # step 1: if HTML exists, remove them
    review_temp1 = BeautifulSoup(original_review,"lxml").get_text()
    # step 2: remove non alphabatical letters
    review_temp2 = re.sub("[^a-zA-Z]",'',review_temp1)
    # step 3: to lower case and split into signle words
    review_words = review_temp2.lower().split()
    # step 4: make the stop words
    stopWords = set(stopwords.words("english"))
    # step 5: remove stop words
    left_words = [word for word in review_words if word not in stopWords]
    # step 6: form the final space separated string
    #port = PorterStemmer()
    final_string = " ".join(left_words)
    return final_string

In [3]:
# read the training and test files  and check the dimension
train_raw = pd.read_csv('../data/labeledTrainData.tsv',delimiter='\t',header=0,quoting=3)
assert train_raw.shape == (25000, 3)
test_raw  = pd.read_csv('../data/testData.tsv',delimiter='\t',header=0,quoting=3)
assert test_raw.shape == (25000, 2)

# use map to speep up the cleaning process
train_reviews = list(map(Review2Words,train_raw['review']))
test_reviews  = list(map(Review2Words,test_raw['review']))

In [4]:
train_raw['sentiment'][:10]

0    1
1    1
2    0
3    0
4    1
5    1
6    0
7    0
8    0
9    1
Name: sentiment, dtype: int64

## Creating Bag of Words with sklearn

In [5]:
# first to initialize the CounterVectorizer object
features_keep = 5000
my_vectorizer = CountVectorizer(analyzer    = 'word', \
                                tokenizer    = None,   \
                                preprocessor = None,   \
                                stop_words   = None,   \
                                max_features = features_keep )

# fit the vectorizer and transform the training data into vector
train_features = my_vectorizer.fit_transform(train_reviews).toarray()
assert train_features.shape == (25000, features_keep)
test_features  = my_vectorizer.transform(test_reviews).toarray()

## Building Machine Learning models

### Random Forest Model

In [6]:
# first we try the Random forest as demonstrated on Kaggle website

# count the time used
start_t = time.time()
# fit the model
my_rf_model = RandomForestClassifier(n_estimators=100,random_state = 2017,n_jobs = -1) \
            .fit(train_features,train_raw['sentiment'])

# carry out prediction
my_output   = my_rf_model.predict(test_features)

end_t = time.time()
print("Model fitting used {0}".format(end_t - start_t))
result = pd.DataFrame(data={'id':test_raw['id'],'sentiment':my_output})

# write result to files
result.to_csv('../report/bagOfWords_RF2.csv',index=False, quoting=3)

Model fitting used 1245.8103439807892


In [7]:
sum(result['sentiment'])

24976

## XGBoost model

In [8]:
# count the time used
start_t = time.time()
xgb_model = XgbC(seed = 2017,nthread=4)
xgb_model.fit(train_features,train_raw['sentiment'])
# carry out prediction
my_output   = xgb_model.predict(test_features)

end_t = time.time()
print("Model fitting used {0}".format(end_t - start_t))
result = pd.DataFrame(data={'id':test_raw['id'],'sentiment':my_output})

# write result to files
result.to_csv('../report/bagOfWords_Xgb.csv',index=False, quoting=3)

Model fitting used 127.24032092094421


In [9]:
sum(train_raw['sentiment']==1)

12500

In [10]:
sum(result['sentiment']==0)

25000

## Logistic Regression

In [11]:
# count the time used
start_t = time.time()
lr_model = LogisticRegression(random_state = 2017,n_jobs=4)
lr_model.fit(train_features,train_raw['sentiment'])
# carry out prediction
my_output   = lr_model.predict(test_features)

end_t = time.time()
print("Model fitting used {0}".format(end_t - start_t))
result = pd.DataFrame(data={'id':test_raw['id'],'sentiment':my_output})

# write result to files
result.to_csv('../report/bagOfWords_lr.csv',index=False, quoting=3)

Model fitting used 4.542827844619751


In [12]:
sum(result['sentiment']==0)

24