In [18]:
# Must libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

In [19]:
# Import data
imdb_data=pd.read_csv('../../data/IMDB_reviews.csv')

# print(imdb_data.shape,'\n')
# print(imdb_data.head(10),'\n')
# print(imdb_data['sentiment'].value_counts())

### Step 1 - Preprocessing Text to tokens

In [20]:
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.stem.wordnet import WordNetLemmatizer

In [21]:
%time
# Pre-processing text data

def preprocessing(text):
    soup = BeautifulSoup(text, "html.parser")
    text = soup.get_text()     # Step 1 - Remove html tags
    pattern = r'[^a-zA-z0-9\s]' 
    text = re.sub(pattern,'',text)   # Step 2 - Remove punctuation
    text = text.lower()     # Step 3 - Make text lowercase
    tokenizer=ToktokTokenizer()
    tokens = tokenizer.tokenize(text)    # Step 4 - Tokenizing
    lemma = WordNetLemmatizer()          # Step 5 - Lemmatizing
    stopword_list=nltk.corpus.stopwords.words('english')
    stop=set(stopwords.words('english'))
    filtered_tokens = [lemma.lemmatize(word).strip() for word in tokens if word not in stopword_list]    # Step 6 - Removing stopwords
    text = TreebankWordDetokenizer().detokenize(filtered_tokens)
    return text

#Apply function on review column
imdb_data['review'] = imdb_data['review'].apply(preprocessing)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 11 µs


In [23]:
imdb_data.to_csv('IMDB_reviews_preprocessed.csv',index=False)

### Step 2 - Tokens to features and train-test split

In [27]:
imdb_data = pd.read_csv('IMDB_reviews_preprocessed.csv')
imdb_data.head()

Unnamed: 0,review,sentiment
0,one reviewer mentioned watching 1 oz episode y...,positive
1,wonderful little production filming technique ...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there family little boy jake think t...,negative
4,petter matteis love time money visually stunni...,positive


In [6]:
norm_train_reviews=imdb_data.review[:40000]
norm_test_reviews=imdb_data.review[40000:]

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

def dummy(doc):
    return doc

cv = CountVectorizer(
        tokenizer=dummy,
        preprocessor=dummy
    )  
# #Count vectorizer for bag of words
# cv=CountVectorizer(tokenizer=None,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(norm_train_reviews)
cv_test_reviews=cv.transform(norm_test_reviews)

In [16]:
#labeling the sentiment data
lb=LabelBinarizer()
#transformed sentiment data
sentiment_data=lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data.shape)
#Spliting the sentiment data
train_sentiments=sentiment_data[:40000]
test_sentiments=sentiment_data[40000:]

(50000, 1)


### Step 3 - Fit and predict

In [17]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentiments)
print(lr_bow)

LogisticRegression(C=1, max_iter=500, random_state=42)


In [21]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
# ##Predicting the model for tfidf features
# lr_tfidf_predict=lr.predict(tv_test_reviews)
# print(lr_tfidf_predict)

[0 0 1 ... 0 1 0]


### Step 4 - Metrics Evaluation

In [22]:
#Accuracy score for bag of words
lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

# #Accuracy score for tfidf features
# lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)
# print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.8851


In [23]:
#Classification report for bag of words 
lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

# #Classification report for tfidf features
# lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
# print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.89      0.88      0.88      4993
    Negative       0.88      0.89      0.89      5007

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [24]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,lr_bow_predict,labels=[1,0])
print(cm_bow)

# #confusion matrix for tfidf features
# cm_tfidf=confusion_matrix(test_sentiments,lr_tfidf_predict,labels=[1,0])
# print(cm_tfidf)

[[4442  565]
 [ 584 4409]]


In [25]:
#training the model
mnb=MultinomialNB()
#fitting the svm for bag of words
mnb_bow=mnb.fit(cv_train_reviews,train_sentiments)
print(mnb_bow)
# #fitting the svm for tfidf features
# mnb_tfidf=mnb.fit(tv_train_reviews,train_sentiments)
# print(mnb_tfidf)

MultinomialNB()


In [26]:
#Predicting the model for bag of words
mnb_bow_predict=mnb.predict(cv_test_reviews)
print(mnb_bow_predict)
# #Predicting the model for tfidf features
# mnb_tfidf_predict=mnb.predict(tv_test_reviews)
# print(mnb_tfidf_predict)

[0 0 0 ... 1 1 0]


In [27]:
#Accuracy score for bag of words
mnb_bow_score=accuracy_score(test_sentiments,mnb_bow_predict)
print("mnb_bow_score :",mnb_bow_score)
# #Accuracy score for tfidf features
# mnb_tfidf_score=accuracy_score(test_sentiments,mnb_tfidf_predict)
# print("mnb_tfidf_score :",mnb_tfidf_score)

mnb_bow_score : 0.8568


In [28]:
#Classification report for bag of words 
mnb_bow_report=classification_report(test_sentiments,mnb_bow_predict,target_names=['Positive','Negative'])
print(mnb_bow_report)
# #Classification report for tfidf features
# mnb_tfidf_report=classification_report(test_sentiments,mnb_tfidf_predict,target_names=['Positive','Negative'])
# print(mnb_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.84      0.88      0.86      4993
    Negative       0.87      0.84      0.85      5007

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [29]:
#confusion matrix for bag of words
cm_bow=confusion_matrix(test_sentiments,mnb_bow_predict,labels=[1,0])
print(cm_bow)
# #confusion matrix for tfidf features
# cm_tfidf=confusion_matrix(test_sentiments,mnb_tfidf_predict,labels=[1,0])
# print(cm_tfidf)

[[4192  815]
 [ 617 4376]]
