In [1]:
import numpy as np 
import pandas as pd
import re 
import nltk
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem.porter import *
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from bs4 import BeautifulSoup
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from wordcloud import WordCloud, STOPWORDS
import pickle

In [2]:
df = pd.read_csv("/kaggle/input/imdb-dataset/IMDB Dataset.csv")

In [3]:
df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


#### seeing if dataset is balanced

In [4]:
df.groupby("sentiment").count()

Unnamed: 0_level_0,review
sentiment,Unnamed: 1_level_1
negative,25000
positive,25000


#### encoding sentiment values

In [5]:
df["sentiment_encoding"] = df["sentiment"].apply(lambda x: 1 if x=="positive" else 0)
df.head(3)

Unnamed: 0,review,sentiment,sentiment_encoding
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1


#### splitting dataset

In [6]:
train_dataset = df.sample(frac=0.8,random_state=0)
test_dataset = df.drop(train_dataset.index)

train_reviews = train_dataset.review
train_sentis = train_dataset.sentiment_encoding

test_reviews = test_dataset.review
test_sentis = test_dataset.sentiment_encoding

print(train_reviews.shape, train_sentis.shape)
print(test_reviews.shape, test_sentis.shape)

(40000,) (40000,)
(10000,) (10000,)


#### text preprocessing

In [7]:
tokenizer = ToktokTokenizer()

# remove html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# keep only alphabetic words
def alpha_text(text):
    return re.sub('^[a-z]+$', '', text)

def clean_words(text):
    text = alpha_text(strip_html(text))
    return text

df["review"] = df["review"].apply(clean_words)



### removing stopwords, stemming

In [8]:
stop_list = stopwords.words("english")

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stop_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stop_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

df["review"] = df["review"].apply(remove_stopwords)

In [9]:
df["review"]

0        One reviewers mentioned watching 1 Oz episode ...
1        wonderful little production. filming technique...
2        thought wonderful way spend time hot summer we...
3        Basically ' family little boy ( Jake ) thinks ...
4        Petter Mattei ' " Love Time Money " visually s...
                               ...                        
49995    thought movie right good job. ' creative origi...
49996    Bad plot , bad dialogue , bad acting , idiotic...
49997    Catholic taught parochial elementary schools n...
49998    ' going disagree previous comment side Maltin ...
49999    one expects Star Trek movies high art , fans e...
Name: review, Length: 50000, dtype: object

In [10]:
def stemmer(text):
    stemmer = PorterStemmer()
    doc_stemmed = [stemmer.stem(w) for w in text.split()]
    return doc_stemmed

df["review"] = df["review"].apply(stemmer)

#### creating bags of words model

In [11]:
#Count vectorizer for bag of words
cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))
#transformed train reviews
cv_train_reviews=cv.fit_transform(train_reviews)
#transformed test reviews
cv_test_reviews=cv.transform(test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)


BOW_cv_train: (40000, 6193147)
BOW_cv_test: (10000, 6193147)


#### creating TFIDF matrix

In [12]:
#Tfidf vectorizer
tv=TfidfVectorizer(min_df=0,max_df=1,use_idf=True,ngram_range=(1,3))
#transformed train reviews
tv_train_reviews=tv.fit_transform(train_reviews)
#transformed test reviews
tv_test_reviews=tv.transform(test_reviews)
print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 6193147)
Tfidf_test: (10000, 6193147)


#### building logistic regression model

In [13]:
#training the model
lr=LogisticRegression(penalty='l2',max_iter=500,C=1,random_state=42)
#Fitting the model for Bag of words
lr_bow=lr.fit(cv_train_reviews,train_sentis)

#Fitting the model for tfidf features
lr_tfidf=lr.fit(tv_train_reviews,train_sentis)

In [14]:
#Predicting the model for bag of words
lr_bow_predict=lr.predict(cv_test_reviews)
print(lr_bow_predict)
##Predicting the model for tfidf features
lr_tfidf_predict=lr.predict(tv_test_reviews)
print(lr_tfidf_predict)

[0 1 0 ... 1 0 0]
[0 1 0 ... 1 0 0]


#### evaluating the model

In [15]:
#Classification report for bag of words 
lr_bow_report=classification_report(test_sentis,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)

#Classification report for tfidf features
lr_tfidf_report=classification_report(test_sentis,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

              precision    recall  f1-score   support

    Positive       0.77      0.76      0.77      5013
    Negative       0.77      0.77      0.77      4987

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000

              precision    recall  f1-score   support

    Positive       0.77      0.77      0.77      5013
    Negative       0.77      0.77      0.77      4987

    accuracy                           0.77     10000
   macro avg       0.77      0.77      0.77     10000
weighted avg       0.77      0.77      0.77     10000

