<a href="https://colab.research.google.com/github/het1752/Natural-Language-Processing/blob/main/IMDB_Review_Analyis_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Dataset Link : https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

About Dataset:

IMDB dataset having 50K movie reviews for natural language processing or Text analytics.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training and 25,000 for testing. So, predict the number of positive and negative reviews using either classification or deep learning algorithms.

For more dataset information, please go through the following link,
http://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('/content/IMDB Dataset.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
data.shape

(50000, 2)

In [4]:
data['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [5]:
data['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [6]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [7]:
data.duplicated().sum()

418

In [8]:
data.drop_duplicates(inplace=True)

In [9]:
#check it
data.duplicated().sum()

0

In [10]:
#remove tags in dataset
import re
def remove_tags(text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', text)
    return cleaned_text

In [11]:
data['review'] = data['review'].apply(remove_tags)

In [12]:
#convert into lowetext
data['review'] = data['review'].apply(lambda x:x.lower())

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
#remove stopwords
from nltk.corpus import stopwords
sw_list = stopwords.words('english')
data['review'] = data['review'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [15]:
#-remove punctuation
import string
punc=string.punctuation
#print("punctuation:",punc)
def rempunc(text):
  for x in punc:
    text=text.replace(x,'')
  return text

In [16]:
data['review'] = data['review'].apply(rempunc)

In [17]:
#-Remove URLs:
paturl = re.compile('\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*') 
def removeurl(text):
  cleantxt = re.sub(paturl, '', text)
  return cleantxt

In [18]:
data['review'] = data['review'].apply(removeurl)

In [19]:
x = data.iloc[:,0:1]
y = data['sentiment']

In [20]:
x.head()

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production filming technique ...
2,thought wonderful way spend time hot summer we...
3,basically theres family little boy jake thinks...
4,petter matteis love time money visually stunni...


In [21]:
y.head()

0    positive
1    positive
2    positive
3    negative
4    positive
Name: sentiment, dtype: object

In [22]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [23]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=40)

In [24]:
x_train.shape,y_train.shape,x_test.shape,y_test.shape

((39665, 1), (39665,), (9917, 1), (9917,))

In [25]:
# Applying Bag of words 
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000)

In [26]:
x_train_bow = cv.fit_transform(x_train['review']).toarray()
x_test_bow = cv.transform(x_test['review']).toarray()

In [27]:
import sklearn
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
model0 = RandomForestClassifier()
model0.fit(x_train_bow,y_train)
pred_y0=model0.predict(x_test_bow)
print(confusion_matrix(y_test,pred_y0))
pred_df=pd.DataFrame({"Actual Value ":y_test,"Predicted Value":pred_y0})
print(pred_df)
print('Precision:', sklearn.metrics.precision_score(y_test,pred_y0))
print('recall:', sklearn.metrics.recall_score(y_test,pred_y0))
print('f1_score:', sklearn.metrics.f1_score(y_test,pred_y0))

[[4149  782]
 [ 796 4190]]
      Actual Value   Predicted Value
0                 1                1
1                 0                0
2                 0                0
3                 1                1
4                 1                1
...             ...              ...
9912              0                0
9913              0                0
9914              0                0
9915              0                0
9916              0                0

[9917 rows x 2 columns]
Precision: 0.8427192276749799
recall: 0.8403529883674288
f1_score: 0.8415344446676039


In [28]:
#decision tree
from sklearn.tree import DecisionTreeClassifier
model1=DecisionTreeClassifier()
model1.fit(x_train_bow,y_train)
pred_y1=model1.predict(x_test_bow)
print(confusion_matrix(y_test,pred_y1))
pred_df1=pd.DataFrame({"Actual Value ":y_test,"Predicted Value":pred_y1})
print(pred_df1)
print('Precision:', sklearn.metrics.precision_score(y_test,pred_y1))
print('recall:', sklearn.metrics.recall_score(y_test,pred_y1))
print('f1_score:', sklearn.metrics.f1_score(y_test,pred_y1))

[[3513 1418]
 [1469 3517]]
      Actual Value   Predicted Value
0                 1                1
1                 0                0
2                 0                1
3                 1                1
4                 1                1
...             ...              ...
9912              0                0
9913              0                0
9914              0                0
9915              0                0
9916              0                0

[9917 rows x 2 columns]
Precision: 0.7126646403242148
recall: 0.7053750501403931
f1_score: 0.7090011087591978


In [29]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB

In [30]:
#gausiian naive bias classifier
model3 = GaussianNB()
model3.fit(x_train_bow,y_train)
pred_y3=model3.predict(x_test_bow)
print(confusion_matrix(y_test,pred_y3))
pred_df2=pd.DataFrame({"Actual Value ":y_test,"Predicted Value":pred_y3})
print(pred_df2)
print('Precision:', sklearn.metrics.precision_score(y_test,pred_y3,average='weighted'))
print('recall:', sklearn.metrics.recall_score(y_test,pred_y3,average='weighted'))
print('f1_score:', sklearn.metrics.f1_score(y_test,pred_y3,average='weighted'))

[[4172  759]
 [1625 3361]]
      Actual Value   Predicted Value
0                 1                1
1                 0                0
2                 0                0
3                 1                0
4                 1                1
...             ...              ...
9912              0                0
9913              0                1
9914              0                0
9915              0                0
9916              0                0

[9917 rows x 2 columns]
Precision: 0.7679961172299591
recall: 0.7596047191691035
f1_score: 0.7578764225783817


In [31]:
#multinomial naive bias classifier
model4 = MultinomialNB()
model4.fit(x_train_bow,y_train)
pred_y4=model4.predict(x_test_bow)
print(confusion_matrix(y_test,pred_y4))
pred_df3=pd.DataFrame({"Actual Value ":y_test,"Predicted Value":pred_y4})
print(pred_df3)
print('Precision:', sklearn.metrics.precision_score(y_test,pred_y4,average='weighted'))
print('recall:', sklearn.metrics.recall_score(y_test,pred_y4,average='weighted'))
print('f1_score:', sklearn.metrics.f1_score(y_test,pred_y4,average='weighted'))

[[4157  774]
 [ 772 4214]]
      Actual Value   Predicted Value
0                 1                1
1                 0                0
2                 0                0
3                 1                1
4                 1                1
...             ...              ...
9912              0                0
9913              0                0
9914              0                0
9915              0                0
9916              0                0

[9917 rows x 2 columns]
Precision: 0.8441057877084033
recall: 0.8441060804678834
f1_score: 0.8441058997555687


In [32]:
#bernuli naive bias classifier
model5 = BernoulliNB()
model5.fit(x_train_bow,y_train)
pred_y5=model5.predict(x_test_bow)
print(confusion_matrix(y_test,pred_y5))
pred_df4=pd.DataFrame({"Actual Value ":y_test,"Predicted Value":pred_y5})
print(pred_df4)
print('Precision:', sklearn.metrics.precision_score(y_test,pred_y5,average='weighted'))
print('recall:', sklearn.metrics.recall_score(y_test,pred_y5,average='weighted'))
print('f1_score:', sklearn.metrics.f1_score(y_test,pred_y5,average='weighted'))

[[4167  764]
 [ 728 4258]]
      Actual Value   Predicted Value
0                 1                1
1                 0                0
2                 0                0
3                 1                1
4                 1                1
...             ...              ...
9912              0                0
9913              0                0
9914              0                0
9915              0                0
9916              0                0

[9917 rows x 2 columns]
Precision: 0.8495636450317278
recall: 0.8495512755873752
f1_score: 0.8495462636187172
