In [16]:
# importing the libraries
import re
import string

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split as tts
from sklearn.pipeline import Pipeline

In [19]:
# Load the tab-separated review dataset and display it.
df = pd.read_csv('IMDB-Dataset2.csv', sep='\t')
df

Unnamed: 0,review,sentiment
0,This film should have never been made. Honestl...,negative
1,This movie was bad from the start. The only pu...,negative
2,"God, I never felt so insulted in my whole life...",negative
3,Not being a fan of the Coen Brothers or George...,positive
4,The movie Andaz Apna Apna in my books is the t...,positive
...,...,...
24996,I thought this movie did a down right good job...,positive
24997,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
24998,I am a Catholic taught in parochial elementary...,negative
24999,I'm going to have to disagree with the previou...,negative


In [20]:
# Number of rows and columns in the loaded frame.
df.shape

(25001, 2)

In [21]:
# Preview the first few rows.
df.head()

Unnamed: 0,review,sentiment
0,This film should have never been made. Honestl...,negative
1,This movie was bad from the start. The only pu...,negative
2,"God, I never felt so insulted in my whole life...",negative
3,Not being a fan of the Coen Brothers or George...,positive
4,The movie Andaz Apna Apna in my books is the t...,positive


In [22]:
# Column labels of the frame.
df.columns

Index(['review', 'sentiment'], dtype='object')

In [23]:
# Count missing values in each column.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [24]:
# Storage dtype of each column.
df.dtypes

review       object
sentiment    object
dtype: object

In [25]:
# Index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25001 entries, 0 to 25000
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     25001 non-null  object
 1   sentiment  25001 non-null  object
dtypes: object(2)
memory usage: 390.8+ KB


In [26]:
# Class balance: number of reviews per sentiment label.
df.sentiment.value_counts()

positive    12526
negative    12475
Name: sentiment, dtype: int64

In [27]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
# The result is assigned back to the column rather than calling
# `df.sentiment.replace(..., inplace=True)`: an in-place call on a column
# pulled out of the frame is chained assignment, which does not update `df`
# when pandas Copy-on-Write is active.
# The explicit cast makes the integer dtype independent of `replace`'s
# downcasting behaviour; it raises if a label other than these two is present.
df['sentiment'] = (
    df['sentiment']
    .replace({'positive': 1, 'negative': 0})
    .astype('int64')
)

In [28]:
# Check the sentiment column after encoding.
df.head()

Unnamed: 0,review,sentiment
0,This film should have never been made. Honestl...,0
1,This movie was bad from the start. The only pu...,0
2,"God, I never felt so insulted in my whole life...",0
3,Not being a fan of the Coen Brothers or George...,1
4,The movie Andaz Apna Apna in my books is the t...,1


# Data Preprocessing

In [29]:
# Summary statistics of the numeric columns.
df.describe()

Unnamed: 0,sentiment
count,25001.0
mean,0.50102
std,0.500009
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


# Cleaning the review text

In [30]:
def clean_text1(text):
    """First cleaning pass over a single review.

    Lower-cases the text, then removes, in order: any ``[...]`` bracketed
    span, every character in ``string.punctuation``, and every word that
    contains a digit.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string. Removed spans are deleted rather than replaced
        with a space, so runs of whitespace may remain.
    """
    text = text.lower()
    # Patterns are raw strings: '\[', '\w' and '\d' inside a plain literal
    # are invalid escape sequences that newer Python versions warn about.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


# Alias kept so existing `cleaned1` callers keep working; a lambda that only
# forwards its argument added nothing.
cleaned1 = clean_text1

In [31]:
# Apply the first cleaning pass to every review. `Series.apply` already
# returns a Series aligned with `df`, so it is assigned directly without
# wrapping it in a DataFrame.
df['review'] = df['review'].apply(cleaned1)

In [32]:
# Inspect the reviews after the first cleaning pass.
df.head()

Unnamed: 0,review,sentiment
0,this film should have never been made honestly...,0
1,this movie was bad from the start the only pur...,0
2,god i never felt so insulted in my whole life ...,0
3,not being a fan of the coen brothers or george...,1
4,the movie andaz apna apna in my books is the t...,1


In [33]:
# second round of cleaning
def clean_text2(text):
    """Second cleaning pass: strip leftover quotes, commas and newlines.

    Args:
        text: A review string, normally the output of the first pass.

    Returns:
        The string with every apostrophe, double quote, comma and newline
        character removed.
    """
    # The previous literal '[''"",,,]' was two adjacent string literals that
    # Python concatenates, so the apostrophes never reached the character
    # class. Spell the class out explicitly instead.
    # NOTE(review): typographic (curly) quotes and the ellipsis character are
    # not in string.punctuation and are not removed here either -- confirm
    # whether they should be stripped as well.
    text = re.sub(r'[\'",]', '', text)
    text = re.sub('\n', '', text)
    return text


# Alias kept so existing `cleaned2` callers keep working.
cleaned2 = clean_text2

In [34]:
# Apply the second cleaning pass to every review, then preview the result.
# The Series returned by `apply` is assigned directly; no DataFrame wrapper
# is needed.
df['review'] = df['review'].apply(cleaned2)
df.head()

Unnamed: 0,review,sentiment
0,this film should have never been made honestly...,0
1,this movie was bad from the start the only pur...,0
2,god i never felt so insulted in my whole life ...,0
3,not being a fan of the coen brothers or george...,1
4,the movie andaz apna apna in my books is the t...,1


# Splitting the Data

In [35]:
# Features are the cleaned review texts; targets are the encoded sentiment
# labels. Columns are selected by name so the split does not depend on the
# column order of the CSV.
x = df['review'].to_numpy()
y = df['sentiment'].to_numpy()

In [36]:
# Hold out 25% of the reviews for evaluation; the fixed seed keeps the split
# reproducible across runs.
xtrain, xtest, ytrain, ytest = tts(
    x,
    y,
    test_size=0.25,
    random_state=225,
)

# Extracting the features

In [37]:
# TF-IDF vectorizer with default settings; it becomes the first step of the
# model pipeline. (`Pipeline` is imported in the imports cell at the top.)
tf = TfidfVectorizer()

# Building the Model

In [38]:
# Chain vectorization and classification in one estimator, so that fitting
# the pipeline fits the TF-IDF vectorizer on the training reviews only.
# (`LogisticRegression` is imported in the imports cell at the top.)
classifier = LogisticRegression()
model = Pipeline([('vectorizer', tf), ('classifier', classifier)])

model.fit(xtrain, ytrain)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [39]:
# Predict sentiment labels for the held-out reviews.
ypred=model.predict(xtest)

In [40]:
# model score: fraction of held-out reviews classified correctly.
# Arguments follow sklearn's (y_true, y_pred) order; accuracy is symmetric,
# so the value is the same as with the arguments reversed.
accuracy_score(ytest, ypred)

0.8884978403455447

In [41]:
# confusion matrix
# Called as confusion_matrix(y_true, y_pred): per the scikit-learn docs, rows
# are the true labels and columns the predicted labels.
A=confusion_matrix(ytest,ypred)
print(A)

[[2755  422]
 [ 275 2799]]


In [42]:
# f1 score for the negative class (label 0)
# In A, rows are true labels and columns are predicted labels, so A[0][0]
# counts reviews that are truly 0 and predicted 0.
#   precision = A[0][0] / everything predicted 0 (column 0)
#   recall    = A[0][0] / everything truly 0     (row 0)
# The two names were previously attached to each other's formula; F1 is
# symmetric in them, so the printed value is unchanged.
# For the positive class, use index 1 instead of index 0.
precision=A[0][0]/(A[0][0]+A[1][0])
recall=A[0][0]/(A[0][0]+A[0][1])
F1=2*recall*precision/(recall+precision)
print(F1)

0.8877074270984373
