In [64]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import nltk
import sklearn
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
%matplotlib inline

In [51]:
imdb_data = pd.read_csv("data/IMDB Dataset.csv")
print (data.shape)
data.head(10)

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [52]:
imdb_data.isnull().sum()

review       0
sentiment    0
dtype: int64

In [53]:
imdb_data.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [54]:
def remove_html(text):
    bs = BeautifulSoup(text, "html.parser")
    return ' ' + bs.get_text() + ' '

def keep_only_letters(text):
    text=re.sub(r'[^a-zA-Z\s]','',text)
    return text

def convert_to_lowercase(text):
    return text.lower()

def clean_reviews(text):
    text = remove_html(text)
    text = keep_only_letters(text)
    text = convert_to_lowercase(text)
    return text

In [55]:
imdb_data['review'] = imdb_data['review'].apply(lambda review: clean_reviews(review))

In [56]:
english_stop_words = nltk.corpus.stopwords.words('english')
print(len(english_stop_words))
english_stop_words[:20]

179


['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [57]:
def remove_stop_words(text):
    for stopword in english_stop_words:
        stopword = ' ' + stopword + ' '
        text = text.replace(stopword, ' ')
    return text

imdb_data['review'] = imdb_data['review'].apply(remove_stop_words)

In [61]:
def text_stemming(text):
    stemmer = nltk.porter.PorterStemmer()
    stemmed = ' '.join([stemmer.stem(token) for token in text.split()])
    return stemmed

imdb_data['review'] = imdb_data['review'].apply(text_stemming)

In [63]:
imdb_train = imdb_data[:40000]
imdb_test = imdb_data[40000:]

In [95]:
vectorizer = sklearn.feature_extraction.text.CountVectorizer(binary=False,ngram_range=(1,3))
tf_features_train = vectorizer.fit_transform(imdb_train['review'])
tf_features_test = vectorizer.transform(imdb_test['review'])
print (tf_features_train.shape, tf_features_test.shape)

(40000, 6802553) (10000, 6802553)


In [96]:
vectorizer = sklearn.feature_extraction.text.TfidfVectorizer(use_idf=True,ngram_range=(1,3))
tfidf_features_train = vectorizer.fit_transform(imdb_train['review'])
tfidf_features_test = vectorizer.transform(imdb_test['review'])
print (tf_features_train.shape, tf_features_test.shape)

(40000, 6802553) (10000, 6802553)


In [97]:
train_labels = [1 if sentiment=='positive' else 0 for sentiment in imdb_train['sentiment']]
test_labels = [1 if sentiment=='positive' else 0 for sentiment in imdb_test['sentiment']]
print (len(train_labels), len(test_labels))

40000 10000


In [98]:
clf = sklearn.linear_model.LogisticRegression()
clf.fit(tf_features_train, train_labels)
print (clf)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [99]:
predictions = clf.predict(tf_features_test)

In [100]:
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.90      0.89      0.90      4993
    Positive       0.89      0.90      0.90      5007

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [101]:
clf = sklearn.linear_model.LogisticRegression()
clf.fit(tfidf_features_train, train_labels)
print (clf)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [102]:
predictions = clf.predict(tf_features_test)

In [103]:
print(sklearn.metrics.classification_report(test_labels, predictions, target_names=['Negative', 'Positive']))

              precision    recall  f1-score   support

    Negative       0.86      0.86      0.86      4993
    Positive       0.86      0.86      0.86      5007

    accuracy                           0.86     10000
   macro avg       0.86      0.86      0.86     10000
weighted avg       0.86      0.86      0.86     10000



In [107]:
sklearn.metrics.confusion_matrix(test_labels, predictions, labels=[0, 1])

array([[4285,  708],
       [ 701, 4306]])

In [108]:
imdb_data.head(10)

Unnamed: 0,review,sentiment
0,one review mention watch oz episod youll hook ...,positive
1,wonder littl product film techniqu unassum old...,positive
2,thought wonder way spend time hot summer weeke...,positive
3,basic there famili littl boy jake think there ...,negative
4,petter mattei love time money visual stun film...,positive
5,probabl alltim favorit movi stori selfless sac...,positive
6,sure would like see resurrect date seahunt ser...,positive
7,show amaz fresh innov idea first air first yea...,negative
8,encourag posit comment film look forward watch...,negative
9,like origin gut wrench laughter like movi youn...,positive
