In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline

In [2]:
data = pd.read_csv("data/IMDB Dataset.csv")
print (data.shape)
data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [4]:
data.sentiment.value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
def remove_html(text):
    bs = BeautifulSoup(text, "html.parser")
    return ' ' + bs.get_text() + ' '

def keep_only_letters(text):
    text=re.sub(r'[^a-zA-Z\s]','',text)
    return text

def convert_to_lowercase(text):
    return text.lower()

def clean_reviews(text):
    text = remove_html(text)
    text = keep_only_letters(text)
    text = convert_to_lowercase(text)
    return text

In [6]:
data['review'] = data['review'].apply(lambda review: clean_reviews(review))

In [7]:
english_stop_words = nltk.corpus.stopwords.words('english')
print(len(english_stop_words))
print (english_stop_words[:20])

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [8]:
def remove_stop_words(text):
    for stopword in english_stop_words:
        stopword = ' ' + stopword + ' '
        text = text.replace(stopword, ' ')
    return text

data['review'] = data['review'].apply(remove_stop_words)

In [9]:
def text_stemming(text):
    stemmer = nltk.porter.PorterStemmer()
    stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
    return stemmed

data['review'] = data['review'].apply(text_stemming)

In [10]:
data.head(10)

Unnamed: 0,review,sentiment
0,one review mention watch oz episod youll hook ...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visual stun film...,positive
5,probabl alltim favorit movi stori selfless sac...,positive
6,sure would like see resurrect date seahunt ser...,positive
7,show amaz fresh innov idea first air first yea...,negative
8,encourag posit comment film look forward watch...,negative
9,like origin gut wrench laughter like movi youn...,positive


## Splitting into Training and Test data

In [11]:
imdb_train = data[:40000]
imdb_test = data[40000:]

## Term Frequency Features

In [12]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,1))
tf_features_train = vectorizer.fit_transform(imdb_train['review'])
tf_features_test = vectorizer.transform(imdb_test['review'])
print (tf_features_train.shape, tf_features_test.shape)

(40000, 150374) (10000, 150374)


## Output Encoding

In [13]:
train_labels = [1 if sentiment=='positive' else 0 for sentiment in imdb_train['sentiment']]
test_labels = [1 if sentiment=='positive' else 0 for sentiment in imdb_test['sentiment']]
print (len(train_labels), len(test_labels))

40000 10000


## Logistic Regression Classifier

In [14]:
clf = sklearn.linear_model.LogisticRegression()
clf.fit(tf_features_train, train_labels)
print (clf)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


## Results

In [19]:
predictions = clf.predict(tf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

              precision    recall  f1-score   support

    Negative       0.88      0.88      0.88      4993
    Positive       0.88      0.88      0.88      5007

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

[[4398  595]
 [ 581 4426]]


## Bigram features

In [20]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,2))
tf_features_train = vectorizer.fit_transform(imdb_train['review'])
tf_features_test = vectorizer.transform(imdb_test['review'])
print (tf_features_train.shape, tf_features_test.shape)

clf = sklearn.linear_model.LogisticRegression()
clf.fit(tf_features_train, train_labels)

predictions = clf.predict(tf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

(40000, 2494028) (10000, 2494028)




              precision    recall  f1-score   support

    Negative       0.90      0.89      0.90      4993
    Positive       0.89      0.90      0.90      5007

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[4457  536]
 [ 496 4511]]


In [21]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))
tf_features_train = vectorizer.fit_transform(imdb_train['review'])
tf_features_test = vectorizer.transform(imdb_test['review'])
print (tf_features_train.shape, tf_features_test.shape)

clf = sklearn.linear_model.LogisticRegression()
clf.fit(tf_features_train, train_labels)

predictions = clf.predict(tf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

(40000, 6802553) (10000, 6802553)




              precision    recall  f1-score   support

    Negative       0.90      0.89      0.90      4993
    Positive       0.89      0.90      0.90      5007

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[4452  541]
 [ 500 4507]]


## MultiNomial Naive Bayes

In [16]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))
tf_features_train = vectorizer.fit_transform(imdb_train['review'])
tf_features_test = vectorizer.transform(imdb_test['review'])
print (tf_features_train.shape, tf_features_test.shape)

clf = MultinomialNB()
clf.fit(tf_features_train, train_labels)

predictions = clf.predict(tf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

(40000, 6802553) (10000, 6802553)
              precision    recall  f1-score   support

    Negative       0.88      0.89      0.89      4993
    Positive       0.89      0.88      0.88      5007

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000

[[4456  537]
 [ 614 4393]]


# LSVM 

In [24]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))
tf_features_train = vectorizer.fit_transform(imdb_train['review'])
tf_features_test = vectorizer.transform(imdb_test['review'])
print (tf_features_train.shape, tf_features_test.shape)

clf = sklearn.svm.LinearSVC()
clf.fit(tf_features_train, train_labels)

predictions = clf.predict(tf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

(40000, 6802553) (10000, 6802553)
              precision    recall  f1-score   support

    Negative       0.90      0.89      0.89      4993
    Positive       0.89      0.90      0.90      5007

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[4459  534]
 [ 513 4494]]




## TF-IDF feature Creation

In [26]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=True,ngram_range=(1,1))
tfidf_features_train = vectorizer.fit_transform(imdb_train['review'])
tfidf_features_test = vectorizer.transform(imdb_test['review'])
print (tfidf_features_train.shape, tfidf_features_test.shape)

(40000, 150374) (10000, 150374)


# On TFIDF features

## LR

In [30]:
#Create features
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=True,ngram_range=(2,3))
tfidf_features_train = vectorizer.fit_transform(imdb_train['review'])
tfidf_features_test = vectorizer.transform(imdb_test['review'])
print (tfidf_features_train.shape, tfidf_features_test.shape)

#train model
clf = sklearn.linear_model.LogisticRegression()
clf.fit(tfidf_features_train, train_labels)

#evaluation
predictions = clf.predict(tfidf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

(40000, 6652179) (10000, 6652179)




              precision    recall  f1-score   support

    Negative       0.86      0.85      0.86      4993
    Positive       0.85      0.87      0.86      5007

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000

[[4247  746]
 [ 666 4341]]


## LSVM

In [33]:
#Create features
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=True,ngram_range=(1,3))
tfidf_features_train = vectorizer.fit_transform(imdb_train['review'])
tfidf_features_test = vectorizer.transform(imdb_test['review'])
print (tfidf_features_train.shape, tfidf_features_test.shape)

#train model
clf = sklearn.svm.LinearSVC()
clf.fit(tfidf_features_train, train_labels)

#evaluation
predictions = clf.predict(tfidf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

(40000, 6802553) (10000, 6802553)
              precision    recall  f1-score   support

    Negative       0.90      0.89      0.90      4993
    Positive       0.89      0.91      0.90      5007

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000

[[4444  549]
 [ 468 4539]]


## MNB

In [36]:
#Create features
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=True,ngram_range=(1,3))
tfidf_features_train = vectorizer.fit_transform(imdb_train['review'])
tfidf_features_test = vectorizer.transform(imdb_test['review'])
print (tfidf_features_train.shape, tfidf_features_test.shape)

#train model
clf = sklearn.naive_bayes.MultinomialNB()
clf.fit(tfidf_features_train, train_labels)

#evaluation
predictions = clf.predict(tfidf_features_test)
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))
print(sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1]))

(40000, 6802553) (10000, 6802553)
              precision    recall  f1-score   support

    Negative       0.88      0.89      0.89      4993
    Positive       0.89      0.88      0.89      5007

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

[[4464  529]
 [ 604 4403]]
