# iMDb Movie Review Sentiment Analysis

## Importing Libraries

In [1]:
import urllib
import os
import tarfile
import re
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt

## Loading Data

In [2]:
DOWNLOAD_PATH = os.path.abspath("C:\\Storage\\JupyterLab\\Projects\\Machine Learning\\imdb_review_classificaton\\data")
DOWNLOAD_URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

In [3]:
"""
Download the data from the given source URL
and extract it.
"""

def download_data(download_path = DOWNLOAD_PATH, download_url = DOWNLOAD_URL):
    
    # Creating the directory if it does not exist
    os.makedirs(download_path, exist_ok=True)
    
    # Creating the path to download the data
    imdb_path = os.path.join(download_path, "imdb_data.tar.gz")
    
    # Downloading data
    urllib.request.urlretrieve(download_url, imdb_path)
    
    # Opening the tar file
    imdb_tar = tarfile.open(imdb_path)
    
    # Extracting the tar file
    imdb_tar.extractall(path=download_path)
    
    # Closing the connection to the file
    imdb_tar.close()

In [4]:
# Downloading data and extracting.
download_data()

In [5]:
# Training set path
train_neg_path = os.path.abspath('../data/aclImdb/train/neg/')
train_pos_path = os.path.abspath('../data/aclImdb/train/pos/')

# Test set path
test_neg_path = os.path.abspath('../data/aclImdb/test/neg/')
test_pos_path = os.path.abspath('../data/aclImdb/test/pos/')

In [6]:
"""
Read all the files from the downloaded data
and label them accordingly.
"""

def read_data_from_files(train_neg_path = train_neg_path, train_pos_path = train_pos_path,
    test_neg_path = test_neg_path, test_pos_path = test_pos_path):
    
    # Training and testing set
    test_data = []
    train_data = []
    
    # Building train set
    for path, label in ((train_neg_path, 0), (train_pos_path, 1)):
        
        # List all files in the given path
        for filepath in os.listdir(path):
            
            # Open the file present
            with open(os.path.join(path, filepath), "rb") as file:
                
                # Append the file along with the label
                train_data.append([file.read(), label])
    
    # Building the test set
    for path, label in ((test_neg_path, 0), (test_pos_path, 1)):
        
        # List all files in the given path
        for filepath in os.listdir(path):
            
            # Open the file present
            with open(os.path.join(path, filepath), "rb") as file:
                
                # Append the file along with the label
                test_data.append([file.read(), label])
                
    # Return both train and test set
    return train_data, test_data

In [7]:
# Generating Test and Train set
train_, test_ = read_data_from_files()

In [8]:
len(train_), len(test_)

(25000, 25000)

In [9]:
train_set = np.array(train_)
test_set = np.array(test_)

train_set.shape, test_set.shape

((25000, 2), (25000, 2))

In [10]:
# Getting thw features and labels segregated
X_train, X_test, y_train, y_test = train_set[:, 0], test_set[:, 0], train_set[:, 1], test_set[:, 1]

y_train = y_train.astype(np.uint8)
y_test = y_test.astype(np.uint8)

In [11]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [12]:
X_train[1]

b"Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinlan) & her son. The luxury jetliner takes off as planned but mid-air the plane is hi-jacked by the co-pilot Chambers (Robert Foxworth) & his two accomplice's Banker (Monte Markham) & Wilson (Michael Pataki) who knock the passengers & crew out with sleeping gas, they plan to steal the valuable cargo & land on a disused plane strip on an isolated island but while making his descent Chambers almost hits an oil rig in the Ocean & loses control of the plane sending it crashing into the sea where it sinks to the bottom right bang in the middle of the Bermuda Triangle. With air in short supply, water leaking in & having flown over 200 miles off course the problems mount for

In [13]:
X_test[-1]

b"I've seen this story before but my kids haven't. Boy with troubled past joins military, faces his past, falls in love and becomes a man. The mentor this time is played perfectly by Kevin Costner; An ordinary man with common everyday problems who lives an extraordinary conviction, to save lives. After losing his team he takes a teaching position training the next generation of heroes. The young troubled recruit is played by Kutcher. While his scenes with the local love interest are a tad stiff and don't generate enough heat to melt butter, he compliments Costner well. I never really understood Sela Ward as the neglected wife and felt she should of wanted Costner to quit out of concern for his safety as opposed to her selfish needs. But her presence on screen is a pleasure. The two unaccredited stars of this movie are the Coast Guard and the Sea. Both powerful forces which should not be taken for granted in real life or this movie. The movie has some slow spots and could have used the 

In [14]:
y_train[-1]

1

In [15]:
y_test[-1]

1

## Data Cleaning

In [16]:
"""
Convert all unneccesary characters like special characters,
numbers and html tags to mere spaces.
"""

from sklearn.base import BaseEstimator, TransformerMixin

class ConvertAllToTextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, remove_html = True, keep_only_alphabets = True, lower_characters = True):
        self.lower_characters = lower_characters
        self.remove_html = remove_html
        self.keep_only_alphabets = keep_only_alphabets
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X, y = None):
        
        # Final transformed text array.
        transformed = []
        
        # Looping over all reviews.
        for review in X:
            
            # Creating a copy
            text = review
            
            # Case 1: Lowering all characters
            if self.lower_characters:
                text = text.lower()
                
            # Case 2: Remove all possible HTML tags
            if self.remove_html:
                text = re.sub(b"</?[a-z]*\s*/?>", b' ', text, flags= re.IGNORECASE | re.MULTILINE)
                
            # Case 3: Remove all special characters and numbers.
            if self.keep_only_alphabets:
                text = re.sub(b"[^a-z]", b' ', text, flags= re.IGNORECASE)
            
            # Replace all extra spaces to single space
            text = re.sub(b"\s+", b' ', text)
            
            # Append to the transformed array.
            transformed.append(text)
        
        # Return clean data
        return np.array(transformed)

In [17]:
X_few = X_train[:3]
transformer = ConvertAllToTextTransformer()
X_few_transformed = transformer.fit_transform(X_few)

In [18]:
X_few

array([b"Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.",
       b"Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinl

In [19]:
X_few_transformed

array([b'story of a man who has unnatural feelings for a pig starts out with a opening scene that is a terrific example of absurd comedy a formal orchestra audience is turned into an insane violent mob by the crazy chantings of it s singers unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting even those from the era should be turned off the cryptic dialogue would make shakespeare seem easy to a third grader on a technical level it s better than you might think with some good cinematography by future great vilmos zsigmond future stars sally kirkland and frederic forrest can be seen briefly ',
       b'airport starts as a brand new luxury plane is loaded up with valuable paintings such belonging to rich businessman philip stevens james stewart who is flying them a bunch of vip s to his estate in preparation of it being opened to the public as a museum also on board is stevens daughter julie kathleen quinlan her son the luxury je

In [20]:
"""
Converting all words to vectors
of counts of words in the sentence
using CountVectorizer
"""

from sklearn.feature_extraction.text import CountVectorizer

#Creating a CountVectorizer object
count_vect = CountVectorizer()

# Transforming data
X_few_counts = count_vect.fit_transform(X_few_transformed)

# Displaying shape
X_few_counts.shape

(3, 506)

In [21]:
"""
Converting all occurences to
frequencies with TfidfTransformer
"""

from sklearn.feature_extraction.text import TfidfTransformer

#Creating a TfidfTransformer object
tfidf_ = TfidfTransformer()

# Transforming data
X_few_tfidf= tfidf_.fit_transform(X_few_counts)

# Displaying shape
X_few_tfidf.shape

(3, 506)

## Building A Model

In [22]:
"""
Building a pipeline for data cleaning
and prediction to take place simultaneously
"""

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

# Building a text classification pipeline
text_clf = Pipeline([
    ('convert_all_2_text', ConvertAllToTextTransformer()),
    ('count_vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

In [23]:
# Training the classifier
text_clf.fit(X_train, y_train)

Pipeline(steps=[('convert_all_2_text', ConvertAllToTextTransformer()),
                ('count_vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

In [24]:
from sklearn.metrics import accuracy_score

# Getting predicted data
y_pred = text_clf.predict(X_test)

# Accuracy Score
accuracy_score(y_pred=y_pred, y_true=y_test)

0.83108

In [25]:
from sklearn.metrics import classification_report

print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.80      0.89      0.84     12500
           1       0.87      0.77      0.82     12500

    accuracy                           0.83     25000
   macro avg       0.84      0.83      0.83     25000
weighted avg       0.84      0.83      0.83     25000



In [26]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[11116,  1384],
       [ 2839,  9661]], dtype=int64)

In [27]:
from sklearn.linear_model import SGDClassifier

sgd_clf = Pipeline([
    ('convert_all_2_text', ConvertAllToTextTransformer()),
    ('count_vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge',
                          penalty='l2',
                          alpha=1e-3,
                          random_state=42,
                          max_iter=5,
                          tol=None)),
])

In [28]:
# Training the classifier
sgd_clf.fit(X_train, y_train)

Pipeline(steps=[('convert_all_2_text', ConvertAllToTextTransformer()),
                ('count_vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [29]:
from sklearn.metrics import accuracy_score

# Getting predicted data
y_pred = sgd_clf.predict(X_test)

# Accuracy Score
accuracy_score(y_pred=y_pred, y_true=y_test)

0.84464

In [30]:
from sklearn.ensemble import RandomForestClassifier

for_clf = Pipeline([
    ('convert_all_2_text', ConvertAllToTextTransformer()),
    ('count_vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=500))
])

In [31]:
# Training the classifier
for_clf.fit(X_train, y_train)

Pipeline(steps=[('convert_all_2_text', ConvertAllToTextTransformer()),
                ('count_vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(n_estimators=500))])

In [32]:
from sklearn.metrics import accuracy_score

# Getting predicted data
y_pred = for_clf.predict(X_test)

# Accuracy Score
accuracy_score(y_pred=y_pred, y_true=y_test)

0.852

In [33]:
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     12500
           1       0.85      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



In [34]:
confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[10673,  1827],
       [ 1873, 10627]], dtype=int64)

## Hyperparameter Tuning

In [35]:
# Creating a parameter search space
params = {
    'clf__n_estimators': [50, 100, 200, 300, 400, 500, 600, 700],
    'clf__max_depth': range(10, 110, 10),
    'clf__min_samples_split': range(1, 11),
    'clf__min_samples_leaf': range(1, 11),
}

In [38]:
from sklearn.model_selection import RandomizedSearchCV

# Getting the best model out of the specified parameter search space
final_clf = RandomizedSearchCV(for_clf, param_distributions=params, scoring='roc_auc', cv=3, n_jobs=-1, verbose=3)

# Training
final_clf.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  30 | elapsed:  4.8min remaining:   44.4s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  5.1min finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('convert_all_2_text',
                                              ConvertAllToTextTransformer()),
                                             ('count_vect', CountVectorizer()),
                                             ('tfidf', TfidfTransformer()),
                                             ('clf',
                                              RandomForestClassifier(n_estimators=500))]),
                   n_jobs=-1,
                   param_distributions={'clf__max_depth': range(10, 110, 10),
                                        'clf__min_samples_leaf': range(1, 11),
                                        'clf__min_samples_split': range(1, 11),
                                        'clf__n_estimators': [50, 100, 200, 300,
                                                              400, 500, 600,
                                                              700]},
                   scoring='roc_auc

In [39]:
# Getting the best classifier
classifier = final_clf.best_estimator_

In [40]:
# Getting predicted data
y_pred = classifier.predict(X_test)

# Accuracy Score
accuracy_score(y_pred=y_pred, y_true=y_test)

0.8488

In [41]:
confusion_matrix(y_pred=y_pred, y_true=y_test)

array([[10612,  1888],
       [ 1892, 10608]], dtype=int64)

In [42]:
print(classification_report(y_pred=y_pred, y_true=y_test))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85     12500
           1       0.85      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



## Persisting the model

In [43]:
import pickle

classifier_pickle_path = 'random_forest_classifier_20202911.pkl'
classifier_pickle = open(classifier_pickle_path, 'wb')
pickle.dump(classifier, classifier_pickle)

classifier_pickle.close()