# Movie Review Classifier

In [1]:
import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import nltk
nltk.download("wordnet")
nltk.download("punkt")
nltk.download("omw-1.4")
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud

from sklearn.experimental import enable_halving_search_cv
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, make_scorer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.model_selection import HalvingGridSearchCV, GridSearchCV, train_test_split, cross_val_score

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/ikollipara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/ikollipara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/ikollipara/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv("movie.csv")
print(df.shape)
df.head(5)

(40000, 2)


Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## Feature Extraction

In [3]:
X = df['text']
y = df['label']

In [4]:
%%time

lemmatizer = WordNetLemmatizer()
analyzer = CountVectorizer().build_analyzer()

with open("stop-words.txt") as f:
    stop_words = [lemmatizer.lemmatize(w.strip('\n')) for w in f]

def process(review: str):
    lemmatized_review = (lemmatizer.lemmatize(w) for w in nltk.wordpunct_tokenize(review))
    return ' '.join(w for w in lemmatized_review if not w.isdigit())

X = X.map(lambda text: process(text))


binary_vector = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words, binary=True)
count_vector = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)

X_binary = binary_vector.fit_transform(X)
X_count = binary_vector.fit_transform(X)

CPU times: user 48.1 s, sys: 574 ms, total: 48.6 s
Wall time: 48.7 s


In [5]:
Xb_train, Xb_test, yb_train, yb_test = train_test_split(X_binary, y)
Xc_train, Xc_test, yc_train, yc_test = train_test_split(X_count, y)

In [6]:
%%time

param_grid = {"alpha": [0.001, 0.01, 0.1, 0.2, 0.5, 1, 1.5, 2]}

bernoulli_clf = GridSearchCV(BernoulliNB(), param_grid).fit(Xb_train, yb_train)

print(f"Best Score: {bernoulli_clf.best_score_}")

print(f"Optimal Values: {bernoulli_clf.best_params_}\n")

bernoulli_clf = GridSearchCV(BernoulliNB(), param_grid).fit(Xc_train, yc_train)

print(f"Best Score: {bernoulli_clf.best_score_}")

print(f"Optimal Values: {bernoulli_clf.best_params_}\n")

Best Score: 0.8659666666666667
Optimal Values: {'alpha': 0.2}

Best Score: 0.8701666666666666
Optimal Values: {'alpha': 0.2}

CPU times: user 13.2 s, sys: 4.29 s, total: 17.5 s
Wall time: 17.6 s


In [7]:
ideal_clf_bernoulli = BernoulliNB(alpha=0.1).fit(Xb_train, yb_train)

y_pred = ideal_clf_bernoulli.predict(Xb_test)

print(confusion_matrix(yb_test, y_pred))
print(classification_report(yb_test, y_pred))

ideal_clf_bernoulli = BernoulliNB(alpha=0.1).fit(Xc_train, yc_train)

y_pred = ideal_clf_bernoulli.predict(Xc_test)
print(confusion_matrix(yc_test, y_pred))
print(classification_report(yc_test, y_pred))

[[4532  465]
 [ 812 4191]]
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      4997
           1       0.90      0.84      0.87      5003

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

[[4569  451]
 [ 837 4143]]
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      5020
           1       0.90      0.83      0.87      4980

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [8]:
%%time

param_grid = {"alpha": [0.001, 0.01, 0.1, 0.2, 0.5, 1, 1.05, 1.1, 1.5, 2]}

multinomial_clf = GridSearchCV(MultinomialNB(), param_grid).fit(Xb_train, yb_train)

print(f"Best Score: {multinomial_clf.best_score_}")

print(f"Optimal Values: {multinomial_clf.best_params_}\n")

multinomial_clf = GridSearchCV(MultinomialNB(), param_grid).fit(Xc_train, yc_train)

print(f"Best Score: {multinomial_clf.best_score_}")

print(f"Optimal Values: {multinomial_clf.best_params_}\n")

Best Score: 0.8838666666666667
Optimal Values: {'alpha': 2}

Best Score: 0.8832666666666666
Optimal Values: {'alpha': 2}

CPU times: user 11.4 s, sys: 3.7 s, total: 15.1 s
Wall time: 15.2 s


In [9]:
ideal_clf_multinomial = MultinomialNB(alpha=1.1).fit(Xb_train, yb_train)

y_pred = ideal_clf_multinomial.predict(Xb_test)

print(confusion_matrix(yb_test, y_pred))
print(classification_report(yb_test, y_pred))

ideal_clf_multinomial = MultinomialNB(alpha=1.1).fit(Xc_train, yc_train)

y_pred = ideal_clf_multinomial.predict(Xc_test)

print(confusion_matrix(yc_test, y_pred))
print(classification_report(yc_test, y_pred))

[[4502  495]
 [ 669 4334]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      4997
           1       0.90      0.87      0.88      5003

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

[[4535  485]
 [ 656 4324]]
              precision    recall  f1-score   support

           0       0.87      0.90      0.89      5020
           1       0.90      0.87      0.88      4980

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [11]:
%%time

decision_tree_clf = RandomForestClassifier().fit(Xb_train, yb_train)

print(f"Best Score: {decision_tree_clf.best_score_}")

print(f"Optimal Values: {decision_tree_clf.best_params_}\n")

decision_tree_clf = RandomForest().fit(Xc_train, yc_train)

print(f"Best Score: {decision_tree_clf.best_score_}")

print(f"Optimal Values: {decision_tree_clf.best_params_}\n")

Fitting 5 folds for each of 2 candidates, totalling 10 fits


KeyboardInterrupt: 