In [1]:
# read data

import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
data = pd.read_csv("IMDB Dataset.csv",encoding='latin1')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [2]:
# clean text

# lowercase
data['sentiment'] = data['sentiment'].str.lower()

# change n't to not
data['sentiment'] = data['sentiment'].str.replace(r"\b\w+n't\b",'not')

# remove punctuation
data['sentiment'] = data['sentiment'].str.replace(r'[^\w\s]', ' ')

# remove \n
data['sentiment'] = data['sentiment'].str.replace("\n", "")

# remove <br />
data['sentiment'] = data['sentiment'].str.replace('<br />','')

data.head()

  import sys
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# remove stopword

stopword = stopwords.words('english')

cleaned_reviews = []

for line in data.review:
    words = line.split(' ')
    new_words = []
    for word in words:
        if word in stopword:
            continue
        new_words.append(word)
    cleaned_review = " ".join(new_words)
    cleaned_reviews.append(cleaned_review)

data['cleaned_review'] = cleaned_reviews


In [4]:
data.head()

Unnamed: 0,review,sentiment,cleaned_review
0,One of the other reviewers has mentioned that ...,positive,One reviewers mentioned watching 1 Oz episode ...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...,positive,I thought wonderful way spend time hot summer ...
3,Basically there's a family where a little boy ...,negative,Basically there's family little boy (Jake) thi...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love Time Money"" visually stu..."


In [5]:
# Lemmatization

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

lemm = []

for i in data.cleaned_review:
    word = i.split(' ')
    new = []
    
    for k in word:
        new.append(lemmatizer.lemmatize(k))
        
    lemm.append(" ".join(new))

data['lemmatized'] = lemm

data.head()

Unnamed: 0,review,sentiment,cleaned_review,lemmatized
0,One of the other reviewers has mentioned that ...,positive,One reviewers mentioned watching 1 Oz episode ...,One reviewer mentioned watching 1 Oz episode h...
1,A wonderful little production. <br /><br />The...,positive,A wonderful little production. <br /><br />The...,A wonderful little production. <br /><br />The...
2,I thought this was a wonderful way to spend ti...,positive,I thought wonderful way spend time hot summer ...,I thought wonderful way spend time hot summer ...
3,Basically there's a family where a little boy ...,negative,Basically there's family little boy (Jake) thi...,Basically there's family little boy (Jake) thi...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,"Petter Mattei's ""Love Time Money"" visually stu...","Petter Mattei's ""Love Time Money"" visually stu..."


In [6]:
# CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words="english", 
                             max_features=3000, token_pattern = '(?u)\\b[a-zA-Z][a-zA-Z]+\\b') 

X = vectorizer.fit_transform(data['lemmatized'])
vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
print(f"Shape of dataframe is {vectorized_df.shape}")
print(f"Total number of occurences: {vectorized_df.sum().sum()}")

y = data['sentiment'].values
X = X.toarray()
indices = np.arange(len(X))

Shape of dataframe is (50000, 3000)
Total number of occurences: 3993511


In [7]:
# logistic regression

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 1)
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 500)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)*100

87.33999999999999

In [8]:
#svm

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 1)

from sklearn.linear_model import SGDClassifier

svm=SGDClassifier(loss='hinge',max_iter=500,random_state=1)
svm.fit(X_train, y_train)
svm.score(X_test, y_test)*100

87.27000000000001

In [9]:
#naive bayes
from sklearn.naive_bayes import MultinomialNB

mnb=MultinomialNB()
mnb.fit(X_train, y_train)
mnb.score(X_test, y_test)*100

84.17

In [10]:
# TfidfVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tf = TfidfVectorizer(ngram_range=(1,1),
                             token_pattern='(?u)\\b[a-zA-Z][a-zA-Z]+\\b',
                             stop_words="english", max_features=3000) 

X = vectorizer_tf.fit_transform(data['lemmatized'])
vectorized_df = pd.DataFrame(X.toarray(), columns=vectorizer_tf.get_feature_names())
print(f"Shape of dataframe is {vectorized_df.shape}")
print(f"Total number of occurences: {vectorized_df.sum().sum()}")

y = data['sentiment'].values

Shape of dataframe is (50000, 3000)
Total number of occurences: 334718.05204056855


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 1)

In [12]:
# logistic regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(max_iter = 500)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)*100

88.06

In [13]:
#svm

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 1)

from sklearn.linear_model import SGDClassifier

svm=SGDClassifier(loss='hinge',max_iter=500,random_state=1)
svm.fit(X_train, y_train)
svm.score(X_test, y_test)*100

87.92

In [14]:
#naive bayes
from sklearn.naive_bayes import MultinomialNB

mnb=MultinomialNB()
mnb.fit(X_train, y_train)
mnb.score(X_test, y_test)*100

84.82