## STAT451 Project: IMDb movie review classification and prediction

**Group 12: Isabella Xue, Freya Wan, Shuyuan Jia :)**

In [1]:
import numpy as np # provides a high-performance multidimensional array and tools for its manipulation
import pandas as pd # for data munging, it contains manipulation tools designed to make data analysis fast and easy
import re # Regular Expressions - useful for extracting information from text 
import nltk # Natural Language Tool Kit for symbolic and statistical natural language processing
import spacy # processing and understanding large volumes of text
import string # String module contains some constants, utility function, and classes for string manipulation
import re

# For viz
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
from collections import Counter

In [2]:
# set the font globally
plt.rcParams.update({'font.family':'sans-serif'})


In [3]:
df = pd.read_csv('IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [46]:
df.head(20)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [47]:
new_clean_df.head(20)

Unnamed: 0,review,description_lengths,scores,compound,Sentiment
0,One of the other reviewers has mentioned that ...,315,"{'neg': 0.203, 'neu': 0.749, 'pos': 0.048, 'co...",-0.9951,negative
1,A wonderful little production. The filming tec...,160,"{'neg': 0.055, 'neu': 0.768, 'pos': 0.177, 'co...",0.9641,positive
2,I thought this was a wonderful way to spend ti...,168,"{'neg': 0.096, 'neu': 0.708, 'pos': 0.196, 'co...",0.9605,positive
3,Basically there is a family where a little boy...,135,"{'neg': 0.141, 'neu': 0.792, 'pos': 0.067, 'co...",-0.9213,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",228,"{'neg': 0.053, 'neu': 0.795, 'pos': 0.152, 'co...",0.9744,positive
5,"Probably my all-time favorite movie, a story o...",125,"{'neg': 0.017, 'neu': 0.763, 'pos': 0.221, 'co...",0.9828,positive
6,I sure would like to see a resurrection of a u...,163,"{'neg': 0.024, 'neu': 0.865, 'pos': 0.111, 'co...",0.9081,positive
7,"This show was an amazing, fresh & innovative i...",178,"{'neg': 0.147, 'neu': 0.658, 'pos': 0.195, 'co...",0.8596,positive
8,Encouraged by the positive comments about this...,128,"{'neg': 0.167, 'neu': 0.66, 'pos': 0.173, 'com...",0.2362,negative
9,If you like original gut wrenching laughter yo...,32,"{'neg': 0.098, 'neu': 0.511, 'pos': 0.391, 'co...",0.9149,positive


In [52]:
sentiment_analyzer_score(new_clean_df['review'][10])

Phil the Alien is one of those quirky films where the humour is based around the oddness of everything rather than actual punchlines.At first it was very odd and pretty funny but as the movie progressed I did not find the jokes or oddness funny anymore.Its a low budget film (that is never a problem in itself), there were some pretty interesting characters, but eventually I just lost interest.I imagine this film would appeal to a stoner who is currently partaking.For something similar but better try "Brother from another planet" {'neg': 0.088, 'neu': 0.679, 'pos': 0.233, 'compound': 0.9482}


#### VADER sentiment analysis

In [5]:
import nltk
import ssl
#ssl._create_default_https_context = ssl._create_unverified_context
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

In [42]:
# Using polarity scores for knowing the polarity of each text
def sentiment_analyzer_score(sentence):
    score = analyser.polarity_scores(sentence)
    print("{:-<40} {}".format(sentence, str(score)))
    
#testing the function
tweet  = "the movie is visually stunning to watch and the casting was just perfect!!"
tweet2 = "I can't believe that I spend frikin 4 hours watching this horrible movie. A waste of my time!!"
print(sentiment_analyzer_score(tweet))
print(sentiment_analyzer_score(tweet2))

I can't believe that I spend frikin 4 hours watching this horrible movie. A waste of my time!! {'neg': 0.385, 'neu': 0.615, 'pos': 0.0, 'compound': -0.8181}
None
the movie is visually stunning to watch and the casting was just perfect!! {'neg': 0.0, 'neu': 0.615, 'pos': 0.385, 'compound': 0.7835}
None


### Data processing

In [7]:
import contractions  # This package is used to replace the contractions in English language with their actual forms
from tqdm import tqdm  # tqdm is used to display the percentage of work done by a for loop.
import re
clean_reviews=[]

for i in tqdm(df['review']):
    # Regular expression that removes all the html tags pressent in the reviews
    i = re.sub('(<[\w\s]*/?>)',"",i)
    # Expanding all the contractions present in the review to is respective actual form
    i = contractions.fix(i)
    clean_reviews.append(i)
    
# Create a new datafram using the clean_reviews
new_clean_df = pd.DataFrame({'review':clean_reviews})
new_clean_df.head()

100%|██████████| 50000/50000 [00:03<00:00, 14002.77it/s]


Unnamed: 0,review
0,One of the other reviewers has mentioned that ...
1,A wonderful little production. The filming tec...
2,I thought this was a wonderful way to spend ti...
3,Basically there is a family where a little boy...
4,"Petter Mattei's ""Love in the Time of Money"" is..."


In [8]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
words_descriptions = new_clean_df['review'].apply(tokenizer.tokenize)
words_descriptions.head()

0    [One, of, the, other, reviewers, has, mentione...
1    [A, wonderful, little, production, The, filmin...
2    [I, thought, this, was, a, wonderful, way, to,...
3    [Basically, there, is, a, family, where, a, li...
4    [Petter, Mattei, s, Love, in, the, Time, of, M...
Name: review, dtype: object

In [9]:
#When we split description into individual words, we have to create vocabulary and additionaly 
#we can add new feature - description lengths
all_words = [word for tokens in words_descriptions for word in tokens]
new_clean_df['description_lengths']= [len(tokens) for tokens in words_descriptions]
VOCAB = sorted(list(set(all_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(VOCAB)))

11770417 words total, with a vocabulary size of 132125


### Model

In [11]:
new_clean_df['scores'] = new_clean_df['review'].apply(lambda review: analyser.polarity_scores(review))

In [16]:
new_clean_df.head()

Unnamed: 0,review,description_lengths,scores,compound,Sentiment
0,One of the other reviewers has mentioned that ...,315,"{'neg': 0.203, 'neu': 0.749, 'pos': 0.048, 'co...",-0.9951,negative
1,A wonderful little production. The filming tec...,160,"{'neg': 0.055, 'neu': 0.768, 'pos': 0.177, 'co...",0.9641,positive
2,I thought this was a wonderful way to spend ti...,168,"{'neg': 0.096, 'neu': 0.708, 'pos': 0.196, 'co...",0.9605,positive
3,Basically there is a family where a little boy...,135,"{'neg': 0.141, 'neu': 0.792, 'pos': 0.067, 'co...",-0.9213,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",228,"{'neg': 0.053, 'neu': 0.795, 'pos': 0.152, 'co...",0.9744,positive


In [13]:
new_clean_df['compound']  = new_clean_df['scores'].apply(lambda score_dict: score_dict['compound'])
new_clean_df['Sentiment'] = new_clean_df['compound'].apply(lambda c: 'positive' if c >=0.3  else 'negative')

new_clean_df.head(10)

Unnamed: 0,review,description_lengths,scores,compound,Sentiment
0,One of the other reviewers has mentioned that ...,315,"{'neg': 0.203, 'neu': 0.749, 'pos': 0.048, 'co...",-0.9951,negative
1,A wonderful little production. The filming tec...,160,"{'neg': 0.055, 'neu': 0.768, 'pos': 0.177, 'co...",0.9641,positive
2,I thought this was a wonderful way to spend ti...,168,"{'neg': 0.096, 'neu': 0.708, 'pos': 0.196, 'co...",0.9605,positive
3,Basically there is a family where a little boy...,135,"{'neg': 0.141, 'neu': 0.792, 'pos': 0.067, 'co...",-0.9213,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",228,"{'neg': 0.053, 'neu': 0.795, 'pos': 0.152, 'co...",0.9744,positive
5,"Probably my all-time favorite movie, a story o...",125,"{'neg': 0.017, 'neu': 0.763, 'pos': 0.221, 'co...",0.9828,positive
6,I sure would like to see a resurrection of a u...,163,"{'neg': 0.024, 'neu': 0.865, 'pos': 0.111, 'co...",0.9081,positive
7,"This show was an amazing, fresh & innovative i...",178,"{'neg': 0.147, 'neu': 0.658, 'pos': 0.195, 'co...",0.8596,positive
8,Encouraged by the positive comments about this...,128,"{'neg': 0.167, 'neu': 0.66, 'pos': 0.173, 'com...",0.2362,negative
9,If you like original gut wrenching laughter yo...,32,"{'neg': 0.098, 'neu': 0.511, 'pos': 0.391, 'co...",0.9149,positive


In [14]:
from sklearn import metrics
print('Accuracy:  {:2.2%} '.format(metrics.accuracy_score(df['sentiment'], new_clean_df['Sentiment'])))
print('Precision: {:2.2%} '.format(metrics.precision_score(df['sentiment'], new_clean_df['Sentiment'], average='weighted')))
print('Recall:    {:2.2%} '.format(metrics.recall_score(df['sentiment'], new_clean_df['Sentiment'], average='weighted')))
print('F1 Score:  {:2.2%} '.format(metrics.f1_score(df['sentiment'], new_clean_df['Sentiment'], average='weighted')))

Accuracy:  70.52% 
Precision: 72.02% 
Recall:    70.52% 
F1 Score:  70.01% 


In [53]:
print(metrics.classification_report(y_true=df['sentiment'], y_pred=new_clean_df['Sentiment'], target_names=['positive', 'negative']) )

              precision    recall  f1-score   support

    positive       0.78      0.57      0.66     25000
    negative       0.66      0.84      0.74     25000

    accuracy                           0.71     50000
   macro avg       0.72      0.71      0.70     50000
weighted avg       0.72      0.71      0.70     50000

