In [1]:
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import re
from nltk.stem.porter import PorterStemmer
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.linear_model import LogisticRegression

In [2]:
imdb_data = pd.read_csv("IMDB_Dataset[1].csv")
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb_data.describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [4]:
y = np.zeros((len(imdb_data["sentiment"]),))
    
for i in range(len(imdb_data["sentiment"])):
    if imdb_data["sentiment"][i] == "positive" : y[i] = 1
    else: y[i] = 0

print(imdb_data["sentiment"].head(15))
y[ :15]

0     positive
1     positive
2     positive
3     negative
4     positive
5     positive
6     positive
7     negative
8     negative
9     positive
10    negative
11    negative
12    negative
13    negative
14    positive
Name: sentiment, dtype: object


array([1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1.])

In [5]:
X = imdb_data["review"]
X_train = X[ :40000].reset_index()["review"]
X_test = X[40000 :].reset_index()["review"]
y_train = y[ :40000]
y_test = y[40000: ]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(40000,) (40000,)
(10000,) (10000,)


In [6]:
#sentiment count
imdb_data['sentiment'].value_counts()

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [7]:
def process_review(review):

    #Removing the html strips
    soup = BeautifulSoup(review, "html.parser")
    review = soup.get_text()
    review = re.sub('\[[^]]*\]', '', review)

    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    # remove 
    pattern = r'[^a-zA-z0-9\s]'
    review = re.sub(pattern,'', review)

    # tokenize reviews
    tokenizer = ToktokTokenizer()
    review_tokens = tokenizer.tokenize(review)

    review_clean = []
    for word in review_tokens:
        if (word not in stopwords_english and  # remove stopwords
                word not in string.punctuation):  # remove punctuation
            
            stem_word = stemmer.stem(word)  # stemming word
            review_clean.append(stem_word)

    return review_clean

In [8]:
def build_freqs(reviews, ys):
    
    # Convert np array to list since zip needs an iterable.
    # The squeeze is necessary or the list ends up with one element.
    # Also note that this is just a NOP if ys is already a list.
    yslist = np.squeeze(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, review in zip(yslist, reviews):
        for word in process_review(review):
            pair = (word, y)
            if pair in freqs:
                freqs[pair] += 1
            else:
                freqs[pair] = 1

    return freqs


In [9]:
freqs = build_freqs(X_train, y_train)

print("type(freqs) = " + str(type(freqs)))
print("len(freqs) = " + str(len(freqs.keys())))

type(freqs) = <class 'dict'>
len(freqs) = 193363


In [10]:
print('\nThis is an example of the processed version of the review: \n', process_review(X_train[5]))


This is an example of the processed version of the review: 
 ['probabl', 'alltim', 'favorit', 'movi', 'stori', 'selfless', 'sacrific', 'dedic', 'nobl', 'caus', 'preachi', 'bore', 'It', 'never', 'get', 'old', 'despit', 'seen', '15', 'time', 'last', '25', 'year', 'paul', 'luka', 'perform', 'bring', 'tear', 'eye', 'bett', 'davi', 'one', 'truli', 'sympathet', 'role', 'delight', 'the', 'kid', 'grandma', 'say', 'like', 'dressedup', 'midget', 'children', 'make', 'fun', 'watch', 'and', 'mother', 'slow', 'awaken', 'what', 'happen', 'world', 'roof', 'believ', 'startl', 'If', 'I', 'dozen', 'thumb', 'theyd', 'movi']


In [11]:
def extract_features(review, freqs):
    
    # process_review tokenizes, stems, and removes stopwords
    word_l = process_review(review)
    
    # 3 elements in the form of a 1 x 3 vector
    x = np.zeros((1, 3)) 
    
    #bias term is set to 1
    x[0,0] = 1 
    
    # loop through each word in the list of words
    for word in word_l:
        
        # increment the word count for the positive label 1
        x[0,1] += freqs.get((word, 1), 0)
        
        # increment the word count for the negative label 0
        x[0,2] += freqs.get((word, 0), 0)
        
    assert(x.shape == (1, 3))
    return x

In [12]:
X_train_fin = np.zeros((len(X_train), 3))

for i in range(len(X_train)):
    X_train_fin[i, :]= extract_features(X_train[i], freqs)

clf = LogisticRegression().fit(X_train_fin, y_train)


In [13]:
clf

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
X_test_fin = np.zeros((len(X_test), 3))

for i in range(len(X_test)):
    X_test_fin[i, :]= extract_features(X_test[i], freqs)

In [15]:
predictions = clf.predict(X_test_fin)

In [16]:
from sklearn.metrics import accuracy_score
lr_score=accuracy_score(predictions, y_test)
lr_score

0.6909

In [17]:
def predict_review(my_review):
    my_review_fin = np.zeros((1, 3))
    my_review_fin = extract_features(my_review, freqs)
    y_hat = clf.predict(my_review_fin)

    if y_hat == 1:
        return 'Positive sentiment'
    else: 
        return 'Negative sentiment'

In [None]:
my_review = input("Enter your review: ")
print(predict_review(my_review))