<a href="https://colab.research.google.com/github/josevlal/CapstoneF21_MovieReviewClassification/blob/main/Sentiment_Analysis_Final_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
from numpy.lib.utils import info
import pandas as pd
import numpy as np
import csv
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer
from yellowbrick.text import freqdist
from yellowbrick.datasets import load_hobbies
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier


# Load the text data
df = pd.read_csv("IMDB Dataset.csv")

print(df.head())

#Total number of positive and negative sentiments reviews in the dataset:
print(df.sentiment.value_counts())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
negative    25000
positive    25000
Name: sentiment, dtype: int64


In [6]:
#--------------------Pre-processing--------------------#

#Make lowercase
df['new_review'] = df['review'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#Remove the line break character from text
df['new_review'] = df['new_review'].str.replace("<br />",'')

#Remove other characters
df['new_review'] = df['new_review'].str.replace('[^\w\s]','')

#Pre-processed data:
print(df.head())

                                              review  ...                                         new_review
0  One of the other reviewers has mentioned that ...  ...  one of the other reviewers has mentioned that ...
1  A wonderful little production. <br /><br />The...  ...  a wonderful little production the filming tech...
2  I thought this was a wonderful way to spend ti...  ...  i thought this was a wonderful way to spend ti...
3  Basically there's a family where a little boy ...  ...  basically theres a family where a little boy j...
4  Petter Mattei's "Love in the Time of Money" is...  ...  petter matteis love in the time of money is a ...

[5 rows x 3 columns]


In [7]:
#--------------------Text-Cleaning--------------------#

#Remove numbers
df['new_review'] = df['new_review'].str.replace('\d+','', regex = True)
df['new_review'] = df['new_review'].str.replace('_', '')

print(df.head())

                                              review  ...                                         new_review
0  One of the other reviewers has mentioned that ...  ...  one of the other reviewers has mentioned that ...
1  A wonderful little production. <br /><br />The...  ...  a wonderful little production the filming tech...
2  I thought this was a wonderful way to spend ti...  ...  i thought this was a wonderful way to spend ti...
3  Basically there's a family where a little boy ...  ...  basically theres a family where a little boy j...
4  Petter Mattei's "Love in the Time of Money" is...  ...  petter matteis love in the time of money is a ...

[5 rows x 3 columns]


In [8]:
#--------------------Tokens--------------------#

vectorizer = CountVectorizer(stop_words='english')

df_neg = df[df['sentiment'] == "negative"]

df_pos = df[df['sentiment'] == "positive"]

docs_n      = vectorizer.fit_transform(df_neg['new_review'])
features   = vectorizer.get_feature_names()

tokens_n = pd.DataFrame({'features': features})

docs_p      = vectorizer.fit_transform(df_pos['new_review'])
features   = vectorizer.get_feature_names()

tokens_p = pd.DataFrame({'features': features})

df_new = pd.DataFrame({'Overlap_tokens': list(set(tokens_n.features) & set(tokens_p.features))})

#Tokens Summarized:
print(tokens_p)
print(tokens_n)
print(df_new)



           features
0                aa
1               aaa
2       aaaaaaaargh
3           aaaaagh
4       aaaaatchkah
...             ...
137565     übervamp
137566         ünel
137567   ünfaithful
137568           ýs
137569          þór

[137570 rows x 1 columns]
                          features
0                               aa
1                              aaa
2       aaaaaaaaaaaahhhhhhhhhhhhhh
3                         aaaaaaah
4                 aaaaaaahhhhhhggg
...                            ...
133992                  üvegtigris
133993                      üzümcü
133994                 þorleifsson
133995                        יגאל
133996                       כרמון

[133997 rows x 1 columns]
      Overlap_tokens
0              icons
1       dramaanother
2             shiner
3                wht
4            spiders
...              ...
57022          dense
57023         thelma
57024    predictable
57025      repertory
57026     wodehouses

[57027 rows x 1 columns]


In [9]:
#--------------------Multinomial_Naive_Bayes--------------------#
#tokenized similar to above, for the bag of words upto bigrams are considered

tokens = RegexpTokenizer('[a-zA-Z0-9]+')
token_doc = CountVectorizer(stop_words='english', ngram_range=(2,2), tokenizer=tokens.tokenize)
token_count = token_doc.fit_transform(df['new_review'])

#--------------------Train_Test_split--------------------#

X_train, X_test, Y_train, Y_test = train_test_split(token_count, df['sentiment'], test_size = 0.20, random_state=5)

#--------------------Fitting_the_model--------------------#

MNB = MultinomialNB()
MNB.fit(X_train, Y_train)

#--------------------Evalutating_the_model--------------------#

predicted = MNB.predict(X_test)
accuracy_score = metrics.accuracy_score(predicted, Y_test)

conf_mat = confusion_matrix(y_true=Y_test, y_pred=predicted)

#Confusion Matrix
print(conf_mat)

#Accuracy
print("Accuracy:", str('{:04.2f}'.format(accuracy_score*100))+'%')

[[4443  457]
 [ 860 4240]]
Accuracy: 86.83%


In [15]:
#--------------------K Fold Cross Validation--------------------#

from sklearn.model_selection import cross_val_score
cross_val_score(MNB, X_train, Y_train, cv = 10)

array([0.864  , 0.8655 , 0.8605 , 0.86125, 0.86125, 0.86825, 0.87025,
       0.8615 , 0.86325, 0.873  ])

In [None]:
#--------------Decision_Tree_Model---------------#

X_train, X_test, Y_train, Y_test = train_test_split(token_count, df['sentiment'], test_size = 0.20, random_state=5)

classifier = DecisionTreeClassifier()
classifier.fit(X_train, Y_train)
predicted = classifier.predict(X_test)

print(confusion_matrix(Y_test, predicted))
print(classification_report(Y_test, predicted))


[[3507 1393]
 [1283 3817]]
              precision    recall  f1-score   support

    negative       0.73      0.72      0.72      4900
    positive       0.73      0.75      0.74      5100

    accuracy                           0.73     10000
   macro avg       0.73      0.73      0.73     10000
weighted avg       0.73      0.73      0.73     10000

