# Importing libraries

In [64]:
# importing the libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import string
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split as tts

In [65]:
df = pd.read_csv('Imdb_dataset.csv',delimiter='\t')
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
24994,I have seen this movie at the cinema many year...,negative
24995,This movie was a real torture fest to sit thro...,negative
24996,John Wayne & Albert Dekker compete for oil rig...,negative
24997,Tarantino once remarked on a melodrama from th...,positive


In [66]:
df.shape

(24999, 2)

In [67]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [68]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [69]:
df.dtypes

review       object
sentiment    object
dtype: object

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24999 entries, 0 to 24998
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     24999 non-null  object
 1   sentiment  24999 non-null  object
dtypes: object(2)
memory usage: 390.7+ KB


In [71]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [72]:
df.sentiment.value_counts()

negative    12525
positive    12474
Name: sentiment, dtype: int64

In [73]:
df.sentiment.replace('positive',1,inplace=True)
df.sentiment.replace('negative',0,inplace=True)

In [74]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


# Data Preprocessing

In [75]:
df.describe()

Unnamed: 0,sentiment
count,24999.0
mean,0.49898
std,0.500009
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


# Defining the model

In [76]:
def clean_text1(text):
    text=text.lower()
    text=re.sub('\[.*?\]','',text)
    text=re.sub('[%s]'%re.escape(string.punctuation),'',text)
    text=re.sub('\w*\d\w*','',text)
    return text

cleaned1=lambda x:clean_text1(x)

In [77]:
df['review']=pd.DataFrame(df.review.apply(cleaned1))

In [79]:
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


In [80]:
# second round of cleaning
def clean_text2(text):
    text=re.sub('[''"",,,]','',text)
    text=re.sub('\n','',text)
    return text

cleaned2=lambda x:clean_text2(x)

In [81]:
df['review']=pd.DataFrame(df.review.apply(cleaned2))
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,1
1,a wonderful little production br br the filmin...,1
2,i thought this was a wonderful way to spend ti...,1
3,basically theres a family where a little boy j...,0
4,petter matteis love in the time of money is a ...,1


# Splitting the Data¶

In [82]:
x = df.iloc[0:,0].values
y = df.iloc[0:,1].values

In [83]:
xtrain,xtest,ytrain,ytest = tts(x,y,test_size = 0.25,random_state = 225)

# Extracting the features

In [84]:
tf = TfidfVectorizer()
from sklearn.pipeline import Pipeline

# Building the Model

In [85]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
model=Pipeline([('vectorizer',tf),('classifier',classifier)])

model.fit(xtrain,ytrain)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [86]:

ypred=model.predict(xtest)

In [87]:
# model score
accuracy_score(ypred,ytest)

0.88448

In [88]:
# confusion matrix
A=confusion_matrix(ytest,ypred)
print(A)

[[2735  374]
 [ 348 2793]]


In [89]:
# f1 score
recall=A[0][0]/(A[0][0]+A[1][0])
precision=A[0][0]/(A[0][0]+A[0][1])
F1=2*recall*precision/(recall+precision)
print(F1)

0.8833979328165376
