In [None]:
import pandas as pd
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, accuracy_score
from sklearn.metrics import classification_report

Data Exploration

In [None]:
# Read the reviews CSV into a DataFrame and preview its first ten rows
data = pd.read_csv(filepath_or_buffer="/imdb.csv")
data.head(n=10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [None]:
# Summary statistics (count, unique, top, freq) for the columns of `data`
data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [None]:
# Number of reviews per sentiment label
data["sentiment"].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [None]:
#Checking for null values
null_values = data.isna()  # boolean mask: True where a cell is missing
null_values.sum()  # per-column count of missing cells

review       0
sentiment    0
dtype: int64

In [None]:
#Taking one review as an example

# Row label 4, column "review"
example_text = data.loc[4, "review"]
example_text

'Petter Mattei\'s "Love in the Time of Money" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter. <br /><br />This being a variation on the Arthur Schnitzler\'s play about the same theme, the director transfers the action to the present time New York where all these different characters meet and connect. Each one is connected in one way, or another to the next person, but no one seems to know the previous point of contact. Stylishly, the film has a sophisticated luxurious look. We are taken to see how these people live and the world they live in their own habitat.<br /><br />The only thing one gets out of all these souls in the picture is the different stages of loneliness each one inhabits. A big city is not exactly the best place in which human relations find sincere fulfillment, as one discerns is the case wit

Data preparation
(Cleaning and stemming reviews)

In [None]:
#Function for removing HTML markup and non-alphanumeric characters
def clean_text(text):
    """Strip HTML markup and punctuation from a review and lowercase it.

    Parameters
    ----------
    text : str
        Raw review text, possibly containing HTML tags such as ``<br />``.

    Returns
    -------
    str
        Lowercased text made up only of the letters a-z, digits and
        whitespace.
    """
    #removing HTML tags; the separator keeps the words on either side of a
    #tag (e.g. "habitat.<br /><br />The") from being glued into one token
    soup = BeautifulSoup(text, "html.parser")
    new_text = soup.get_text(separator=" ")
    #removing characters; the class must be A-Z -- the range A-z also
    #covers [ \ ] ^ _ ` and would let those symbols through
    pattern = r'[^a-zA-Z0-9\s]'
    new_text = re.sub(pattern, '', new_text)
    new_text = new_text.lower()
    return new_text

In [None]:
# Shared stemmer instance used by tokenize_stem
ps = PorterStemmer()
# Resources needed below: the stopword corpus and the Punkt tokenizer data
nltk.download('stopwords')
# Recent NLTK releases load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' package; fetch both so word_tokenize works on either version
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
#Function for tokenization and stemming of the text
# Stopword sets keyed by language, filled lazily by _get_stopwords
_STOPWORD_CACHE = {}


def tokenize_stem(text):
    """Tokenize text, drop English stopwords and stem the remaining tokens.

    Parameters
    ----------
    text : str
        Text to process (the notebook passes the output of clean_text).

    Returns
    -------
    str
        The stemmed, non-stopword tokens joined by single spaces.
    """
    tokens = word_tokenize(text)

    # Cached frozenset: avoids re-reading the corpus on every call and makes
    # each membership test O(1) instead of a scan over a list
    stopword_set = _get_stopwords('english')

    # Remove stopwords and stem what is left
    return ' '.join(ps.stem(word) for word in tokens if word not in stopword_set)


def _get_stopwords(language):
    """Return the NLTK stopwords for `language` as a frozenset, loading them once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]

In [None]:
#Let's check our functions with an example review
# Run the example review through clean_text
new_text = clean_text(example_text)

In [None]:
# Tokenize and stem the text held in new_text, then display the result
new_text = tokenize_stem(new_text)
new_text

'petter mattei love time money visual stun film watch mr mattei offer us vivid portrait human relat movi seem tell us money power success peopl differ situat encount variat arthur schnitzler play theme director transfer action present time new york differ charact meet connect one connect one way anoth next person one seem know previou point contact stylishli film sophist luxuri look taken see peopl live world live habitatth thing one get soul pictur differ stage loneli one inhabit big citi exactli best place human relat find sincer fulfil one discern case peopl encounterth act good mr mattei direct steve buscemi rosario dawson carol kane michael imperioli adrian grenier rest talent cast make charact come alivew wish mr mattei good luck await anxious next work'

Applying cleaning and stemming functions to all reviews

In [None]:
reviews = data["review"]
#Save all cleaned reviews in the list below
# Iterate over the values directly: indexing with reviews[i] is a label
# lookup, which fails or picks the wrong row when the index is not 0..n-1
# (e.g. after filtering or shuffling the frame).
cleaned_reviews = [tokenize_stem(clean_text(review)) for review in reviews]

  soup = BeautifulSoup(text, "html.parser")


Using Bag of Words

In [None]:
# Creating a bag of words model
# Keep only the 2500 most frequent terms as features
cv = CountVectorizer(max_features=2500)
# NOTE(review): .toarray() materialises a dense (n_reviews x up to 2500)
# matrix; consider keeping the sparse matrix if memory becomes a problem.
x = cv.fit_transform(cleaned_reviews).toarray()

# Encode the labels explicitly: 1 = positive, 0 = negative.
# This matches the previous get_dummies(...).iloc[:, 1] result but does not
# depend on dummy-column order, and stays integer on pandas >= 2.0 where
# get_dummies returns boolean columns.
y = (data['sentiment'] == 'positive').astype(int).values

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

Random Forest Classifier


In [None]:
#Using random forest algorithm from sklearn
# Fixed seed so the fitted forest (and the score reported below) is
# reproducible from one run to the next
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)
y_prediction_1 = rf_classifier.predict(X_test)

In [None]:
# Accuracy, then per-class precision/recall/F1, for the random forest predictions
rf_score = accuracy_score(y_true=y_test, y_pred=y_prediction_1)
print(rf_score)
print(classification_report(y_true=y_test, y_pred=y_prediction_1))

0.8404
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      5035
           1       0.84      0.83      0.84      4965

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000



Naive Bayes Classifier


In [None]:
#Using Naive Bayes Algorithm from sklearn
# Fit a multinomial naive Bayes model on the training split, then predict the test split
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)
y_prediction_2 = nb_classifier.predict(X_test)

In [None]:
# Accuracy, then per-class precision/recall/F1, for the naive Bayes predictions
nb_score = accuracy_score(y_true=y_test, y_pred=y_prediction_2)
print(nb_score)
print(classification_report(y_true=y_test, y_pred=y_prediction_2))

0.8346
              precision    recall  f1-score   support

           0       0.83      0.84      0.84      5035
           1       0.84      0.83      0.83      4965

    accuracy                           0.83     10000
   macro avg       0.83      0.83      0.83     10000
weighted avg       0.83      0.83      0.83     10000



Logistic Regression


In [None]:
#Using Logistic Regression from sklearn
# L2-regularised model; fit returns the estimator itself, so `lr` and
# `lr_classifier` refer to the same fitted object
lr = LogisticRegression(
    penalty='l2',
    C=1,
    max_iter=500,
    random_state=42,
)
lr_classifier = lr.fit(X_train, y_train)
y_prediction_3 = lr_classifier.predict(X_test)

In [None]:
# Accuracy, then per-class precision/recall/F1, for the logistic regression predictions
lr_score = accuracy_score(y_true=y_test, y_pred=y_prediction_3)
print(lr_score)
print(classification_report(y_true=y_test, y_pred=y_prediction_3))

0.8709
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      5035
           1       0.87      0.87      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



Logistic Regression achieves the highest accuracy (~87%), with per-class precision of ~87–88%.

Random Forest and Naive Bayes reach accuracies of ~84% and ~83%, respectively.