# Sentiment Analysis of IMDB Movie Reviews


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
import os
import warnings

In [2]:
dataset=pd.read_csv('IMDB Dataset.csv')
dataset.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
print(dataset.shape)

(50000, 2)


In [4]:
dataset.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [5]:
dataset['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

# 01)Cleaning AND Normalizing Data

In [6]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

#Removing the noisy text
def denoise_text(text):
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    return text



dataset['review']=dataset['review'].apply(denoise_text)

In [7]:
#Define function for removing special characters
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text




dataset['review']=dataset['review'].apply(remove_special_characters)

In [8]:
#Stemming the text
def simple_stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text= ' '.join([ps.stem(word) for word in text.split()])
    return text




dataset['review']=dataset['review'].apply(simple_stemmer)

In [9]:
#removing the stopwords
tokenizer=ToktokTokenizer()
stopword_list=nltk.corpus.stopwords.words('english')

stop=set(stopwords.words('english'))
print(stop)

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text



dataset['review']=dataset['review'].apply(remove_stopwords)

{"she's", 'out', 'here', 'which', 'himself', 'during', 'theirs', 'myself', 'until', 'that', 'down', 'all', 'mightn', "should've", 'its', 'into', 'any', 'for', "wasn't", 'him', 'they', 'itself', 'so', 'over', 'o', 'is', 'i', 'can', "shouldn't", 'these', 'this', 'after', 'each', 'of', 'd', 'at', 'she', 'who', 'having', 'if', "mustn't", 'same', 'then', 'their', 'your', 'didn', 'from', 'and', 'ours', 'than', 'should', 'yourselves', "you're", 'yours', 'again', 'under', "hasn't", "that'll", "isn't", "wouldn't", 'won', 'whom', 'more', 's', "you've", 'll', 'have', 'are', 't', 'm', "needn't", 'few', 'isn', 'shouldn', 'be', 'about', "haven't", 'why', 'doing', "it's", 'nor', 'has', 'hasn', 'up', 'doesn', 'ourselves', 'before', 'some', 'you', 'just', 'yourself', 'below', 'on', 'an', 'there', 'with', 'a', 'was', 'other', 'shan', 'mustn', 'it', 'once', 'by', 'wasn', 'aren', 'weren', 'been', 'not', "mightn't", 've', 'own', 'needn', 'hers', 'while', 'my', "you'll", 'most', 'no', 'themselves', 'couldn'

# Spliting train & test data

In [10]:
features = dataset['review']
target = dataset['sentiment']

In [11]:
from sklearn.model_selection import train_test_split
X_train,X_test = train_test_split(features, test_size = 0.2, random_state=2022)

In [12]:
print("X_train : ",X_train.shape)
print("X_test : ",X_test.shape)

X_train :  (40000,)
X_test :  (10000,)


# Vectorization

In [15]:
# #Count vectorizer for bag of words
# cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
# #transformed train reviews
# cv_X_train=cv.fit_transform(X_train)
# #transformed test reviews
# cv_X_test=cv.transform(X_test)

# print('BOW_cv_train:',cv_X_train.shape)
# print('BOW_cv_test:',cv_X_test.shape)



In [16]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_X_train=tv.fit_transform(X_train)
#transformed test reviews
tv_X_test=tv.transform(X_test)
print('Tfidf_train:',tv_X_train.shape)
print('Tfidf_test:',tv_X_test.shape)

Tfidf_train: (40000, 6221597)
Tfidf_test: (10000, 6221597)


In [17]:
#labeling the sentient data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(target)
print(sentiment_data.shape)

(50000, 1)


In [18]:
y_train,y_test = train_test_split(sentiment_data, test_size = 0.2, random_state=2022)

In [19]:
print("y_train : ",y_train.shape)
print("y_test : ",y_test.shape)

y_train :  (40000, 1)
y_test :  (10000, 1)


# Modeling the dataset

### -Logistic regression is a good model because it trains quickly even on large datasets and provides very robust results. 
### -Other good model choices include SVMs, Random Forests, and Naive Bayes.

In [20]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=2022)

# #Fitting the model for Bag of words
# lr_bow=lr.fit(cv_X_train,y_train)
# print(lr_bow)

#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_X_train,y_train)
print(lr_tfidf)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1, max_iter=500, random_state=2022)


In [21]:
# #Predicting the model for bag of words
# lr_bow_predict=lr.predict(cv_X_test)
# print(lr_bow_predict)

##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_X_test)
print(lr_tfidf_predict)

[0 0 1 ... 0 1 0]


In [22]:
# #Accuracy score for bag of words
# lr_bow_score=accuracy_score(y_test,lr_bow_predict)
# print("lr_bow_score :",lr_bow_score)

#Accuracy score for tfidf features
lr_tfidf_score=accuracy_score(y_test,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_tfidf_score : 0.7451


In [23]:
# #Classification report for bag of words 
# lr_bow_report=classification_report(y_test,lr_bow_predict,target_names=['Positive','Negative'])
# print(lr_bow_report)

# #Classification report for tfidf features
lr_tfidf_report=classification_report(y_test,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.76      0.72      0.74      5043
    Negative       0.73      0.77      0.75      4957

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.74     10000



In [24]:
# #confusion matrix for bag of words
# cm_bow=confusion_matrix(y_test,lr_bow_predict,labels=[1,0])
# print(cm_bow)

#confusion matrix for tfidf features
cm_tfidf=confusion_matrix(y_test,lr_tfidf_predict,labels=[1,0])
print(cm_tfidf)

[[3808 1149]
 [1400 3643]]
