In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
import re
# Load the IMDB reviews CSV (path is relative to the working directory).
df = pd.read_csv("IMDB_Dataset.csv")
# Preview the first rows to confirm the `review` and `sentiment` columns loaded.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Check the size of the dataset as (rows, columns).
df.shape

(50000, 2)

In [None]:
# The full dataset has 50,000 rows; work on a 1,000-row random subset so the
# rest of the notebook runs quickly.
# random_state pins the draw, so Restart & Run All reproduces the same subset
# (and therefore the same downstream results). The index is reset in the same
# expression instead of mutating the frame in place.
df = df.sample(1000, random_state=42).reset_index(drop=True)
# sample dataset size
df.shape

(1000, 2)

In [None]:
# Encode the target as binary integers: positive -> 1, negative -> 0.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the `df['sentiment']` selection: that form
# operates on an intermediate object, pandas warns about it, and with
# Copy-on-Write enabled it leaves `df` unchanged.
# astype(int) keeps the labels numeric regardless of how replace infers the
# resulting dtype.
df['sentiment'] = df['sentiment'].replace({'positive': 1, 'negative': 0}).astype(int)
df.head()

Unnamed: 0,review,sentiment
0,"So much for JUDGE AND JURY, which lives up to ...",0
1,I'm going to have to disagree with the previou...,0
2,This was the third Muppet movie and the last o...,1
3,"Inspired casting, charming and witty throughou...",1
4,This sitcom was a big crowd puller in the year...,1


In [None]:
# functions to remove noise
# remove html tags
def clean_html(text):
  """Strip HTML-style tags from `text`.

  Every non-greedy ``<...>`` span (for example ``<br />``) is deleted; the
  characters between tags are left untouched.
  """
  return re.sub('<.*?>', '', text)

In [None]:
# remove brackets
def remove_brackets(text):
  """Delete every square-bracketed span, brackets included, from `text`.

  A span runs from a '[' to the next ']'. An opening bracket with no closing
  bracket after it is left in place.
  """
  # Raw string literal: in a plain string '\[' is an invalid escape sequence,
  # which newer Python versions flag with a warning. The regex is unchanged.
  return re.sub(r'\[[^]]*\]', '', text)

In [None]:
# lower the cases
def lower_cases(text):
  """Return a lowercase copy of `text`."""
  lowered = text.lower()
  return lowered

In [None]:
# remove special characters
def remove_char(text):
  """Remove every character that is not an ASCII letter, ASCII digit or whitespace.

  Args:
    text: the string to clean.

  Returns:
    `text` with punctuation and other special characters deleted; letters,
    digits 0-9 and whitespace are kept unchanged.
  """
  # Two fixes over the previous character class [^a-zA-z0–9\s]:
  #  * 'A-z' -> 'A-Z': the old range also covered [ \ ] ^ _ ` so those
  #    characters were never removed.
  #  * '0–9' was written with an en dash, so the class contained only the
  #    literal characters '0', '–' and '9' and digits 1-8 were stripped from
  #    the text. A plain hyphen makes it a real 0-9 range.
  pattern = r'[^a-zA-Z0-9\s]'
  return re.sub(pattern, '', text)

In [None]:
# remove noise(combine above functions)
def remove_noise(text):
  """Run `text` through the full cleaning pipeline and return the result.

  The steps are applied in this order: clean_html, remove_brackets,
  lower_cases, remove_char. Each step receives the previous step's output.
  """
  for step in (clean_html, remove_brackets, lower_cases, remove_char):
    text = step(text)
  return text

In [None]:
# Clean every review with remove_noise and overwrite the `review` column.
df['review']=df['review'].apply(remove_noise)
# Display the cleaned column to spot-check the result.
df['review']

Unnamed: 0,review
0,so much for judge and jury which lives up to i...
1,im going to have to disagree with the previous...
2,this was the third muppet movie and the last o...
3,inspired casting charming and witty throughout...
4,this sitcom was a big crowd puller in the year...
...,...
995,watching the last episodes i remembered a tv ...
996,hitchcocks remake of his 9 film concerns about...
997,i sat down to watch midnight cowboy thinking i...
998,lucille ball cannot sing or act or dance this ...


In [None]:
from nltk.stem.porter import PorterStemmer
def stem_words(text):
  """Return `text` with every whitespace-separated word Porter-stemmed.

  The stemmed words are re-joined with single spaces, so runs of whitespace
  in the input collapse to one space.

  NOTE(review): this is applied before stopword removal; stemmed forms may no
  longer match entries of the stopword list, so consider removing stopwords
  first.
  """
  ps = PorterStemmer()
  # Stem whole words. The previous code iterated over `text` itself, i.e. one
  # character at a time, and joined the pieces with '', so no word was ever
  # stemmed and the computed stem list was discarded.
  return ' '.join(ps.stem(word) for word in text.split())
# Run stem_words over every review and overwrite the `review` column.
df['review'] = df['review'].apply(stem_words)

In [None]:
# NLTK (Natural Language Toolkit) supplies the English stopword list used below.
import nltk
# Download the 'stopwords' corpus into the local NLTK data directory.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# English stopwords from NLTK; remove_stopwords (defined below) reads this name.
stopword_list = stopwords.words('english')

# removing the stopwords from review
def remove_stopwords(text):
    """Split `text` on whitespace and drop every token found in `stopword_list`.

    Returns the surviving tokens as a new list, in their original order.
    The result is a list of words, not a joined string.
    """
    return [token for token in text.split() if token not in stopword_list]

# Replace each review with the list of its non-stopword tokens.
df['review'] = df['review'].apply(remove_stopwords)
# Display the column; each entry is now a list of words rather than a string.
df['review']


Unnamed: 0,review
0,"[much, judge, jury, lives, nonsense, title, go..."
1,"[im, going, disagree, previous, comment, side,..."
2,"[third, muppet, movie, last, one, jim, henson,..."
3,"[inspired, casting, charming, witty, throughou..."
4,"[sitcom, big, crowd, puller, year, 99that, tim..."
...,...
995,"[watching, last, episodes, remembered, tv, add..."
996,"[hitchcocks, remake, 9, film, concerns, known,..."
997,"[sat, watch, midnight, cowboy, thinking, would..."
998,"[lucille, ball, cannot, sing, act, dance, make..."


In [None]:
# join back all words as single paragraph
def join_back(text):
  """Concatenate the strings in `text` into one string, separated by single spaces."""
  return str.join(' ', text)
# Turn each review's token list back into a single space-separated string.
df['review'] = df['review'].apply(join_back)

In [None]:
# Preview the frame to confirm the cleaned reviews and encoded sentiment are in place.
df.head()

Unnamed: 0,review,sentiment
0,much judge jury lives nonsense title good ligh...,0
1,im going disagree previous comment side maltin...,0
2,third muppet movie last one jim henson around ...,1
3,inspired casting charming witty throughout muc...,1
4,sitcom big crowd puller year 99that time peopl...,1


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with its vocabulary capped at 800 features.
cv = CountVectorizer(max_features=800)


In [None]:
# Vectorize the cleaned reviews into a dense count matrix X (the predictors).
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split further down, so the vocabulary is chosen using the test reviews too.
# Consider fitting on the training split only.
X = cv.fit_transform(df['review']).toarray()
# Display the predictor matrix.
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 4, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:

# Target vector: values of the frame's last column, selected by position
# (`sentiment` in the frame displayed above).
y = df.iloc[:, -1].values
# Confirm there is one label per row.
y.shape


(1000,)

In [None]:
# Split into train and test sets: 20% of the rows are held out for testing,
# and random_state=42 makes the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Finally, let's fit the Naive Bayes classifiers.

In [None]:
from sklearn.naive_bayes import GaussianNB,   MultinomialNB, BernoulliNB


In [None]:
# One estimator per Naive Bayes variant, each constructed with no arguments.
gnb= GaussianNB()
mnb = MultinomialNB()
bnb= BernoulliNB()

In [None]:
# Fit the Gaussian model on the training split, then predict the test split.
gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)

In [None]:
# Fit the Multinomial model on the training split, then predict the test split.
mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)

In [None]:
# Fit the Bernoulli model on the training split, then predict the test split.
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)

In [None]:
#accuracy scores
from sklearn.metrics import accuracy_score
# Print the test-set accuracy of each classifier, one per line.
for label, predictions in (
    ("Gaussian", y_pred_gnb),
    ("Multinomial", y_pred_mnb),
    ("Bernoulli", y_pred_bnb),
):
    print(label, accuracy_score(y_test, predictions))


Gaussian 0.72
Multinomial 0.755
Bernoulli 0.795
