# IMDB dataset | Sentiment Analysis

## Imports

In [33]:
import math
import numpy as np
import pandas as pd
import nltk
import random
nltk.download('stopwords')
import re
import string 

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/irenzo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')

## Data check

In [4]:
# Check head of data
train.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


In [6]:
train.describe()

Unnamed: 0,label
count,40000.0
mean,0.499525
std,0.500006
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [7]:
test.describe()

Unnamed: 0,label
count,5000.0
mean,0.501
std,0.500049
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [28]:
# Is our train dataset balanced? *spoiler -> yes
label_counts = train['label'].value_counts()
print(f'% of 0s: {round((label_counts.get(0,0)/len(train))*100)}')
print(f'% of 1s: {round((label_counts.get(1,0)/len(train))*100)}')

% of 0s: 50
% of 1s: 50


In [29]:
# Is our test dataset balanced? *spoiler -> yes
label_counts = test['label'].value_counts()
print(f'% of 0s: {round((label_counts.get(0,0)/len(test))*100)}')
print(f'% of 1s: {round((label_counts.get(1,0)/len(test))*100)}')

% of 0s: 50
% of 1s: 50


## Data split (x/y)

In [42]:
train_x = train['text']
train_y = train['label']

test_x = test['text']
test_y = test['label']

## Process text

In [43]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
stemmer = PorterStemmer()
stop_words = stopwords.words('english')

In [44]:
def process_text(text):
    # Check if the tweet is a string
    if not isinstance(text, str):
        return []

    # Remove hyperlinks
    tweet = re.sub(r'https?://[^\s\n\r]+', '', text)
    # Remove hashtags
    tweet = re.sub(r'#', '', text)
    # Tokenize tweet
    tokens = tokenizer.tokenize(text)
    # Remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # Filter out stop words
    words = [w for w in words if not w in stop_words]
    # Stemming
    stemmed = [stemmer.stem(word) for word in words]
    return stemmed

In [45]:
custom_text = "I h@te this movie so much! #worst_movie_ever https://reviews.com @movie_maker"
print(process_text(custom_text))

['h', 'te', 'movi', 'much', 'worstmovieev', 'httpsreviewscom']


## Count words

In [46]:
def count_words(result, text, ys):
    if result is None:
        result = {}
    for y, t in zip(ys, text):
        for word in process_text(t):
            # define the key, which is the word and label tuple
            pair = (word, y)
            # if the key exists in the dictionary, increment the count
            if pair in result:
                result[pair] += 1
            # else, if the key is new, add it to the dictionary and set the count to 1
            else:
                result[pair] = 1
    return result

In [47]:
result = {}
text = ['i am happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 0, 0, 0]
count_words(result, text, ys)

{('happi', 1): 1, ('trick', 0): 1, ('sad', 0): 1, ('tire', 0): 2}

## Frequency dict
How often each word occur in train set + sentiment of the word

In [48]:
freqs = count_words({}, train_x, train_y)
freqs

{('grew', 0): 134,
 ('b', 0): 752,
 ('watch', 0): 11929,
 ('love', 0): 4572,
 ('thunderbird', 0): 65,
 ('mate', 0): 144,
 ('school', 0): 1442,
 ('play', 0): 5771,
 ('lunch', 0): 50,
 ('want', 0): 5687,
 ('virgil', 0): 15,
 ('scott', 0): 361,
 ('one', 0): 21114,
 ('alan', 0): 195,
 ('count', 0): 453,
 ('becam', 0): 475,
 ('art', 0): 1020,
 ('form', 0): 591,
 ('took', 0): 908,
 ('children', 0): 1039,
 ('see', 0): 10391,
 ('movi', 0): 46032,
 ('hope', 0): 2058,
 ('would', 0): 10913,
 ('get', 0): 12187,
 ('glimps', 0): 126,
 ('child', 0): 869,
 ('bitterli', 0): 24,
 ('disappoint', 0): 1992,
 ('high', 0): 1563,
 ('point', 0): 3603,
 ('snappi', 0): 26,
 ('theme', 0): 694,
 ('tune', 0): 220,
 ('could', 0): 7245,
 ('compar', 0): 815,
 ('origin', 0): 3443,
 ('score', 0): 777,
 ('thank', 0): 745,
 ('earli', 0): 1073,
 ('saturday', 0): 174,
 ('morn', 0): 212,
 ('televis', 0): 601,
 ('channel', 0): 546,
 ('still', 0): 3619,
 ('rerun', 0): 77,
 ('seri', 0): 1914,
 ('gerri', 0): 18,
 ('anderson', 0)

# Train NB

In [49]:
def train_naive_bayes(freqs, train_x, train_y):
    loglikelihood = {}
    logprior = 0

    # Calculate V, the number of unique words in the vocabulary
    vocab = set([key[0] for key in freqs.keys()])
    V = len(vocab)

    # Calculate N_pos and N_neg
    N_pos = N_neg = 0
    for (word, sentiment), count in freqs.items():
        if sentiment == 1:
            N_pos += count
        else:
            N_neg += count

    # Calculate D, the number of documents
    D = len(train_x)

    # Calculate D_pos and D_neg, the number of positive and negative documents
    D_pos = sum(train_y)
    D_neg = D - D_pos

    # Calculate logprior
    logprior = np.log(D_pos) - np.log(D_neg)

    for word in vocab:
        freq_pos = freq_neg = 0
        for (w, sentiment), count in freqs.items():
            if w == word:
                if sentiment == 1:
                    freq_pos += count
                else:
                    freq_neg += count

        # calculate the probability that each word is positive, and negative
        p_w_pos = (freq_pos + 1) / (N_pos + V)
        p_w_neg = (freq_neg + 1) / (N_neg + V)

        # calculate the log likelihood of the word
        loglikelihood[word] = np.log(p_w_pos) - np.log(p_w_neg)

    return logprior, loglikelihood

In [50]:
logprior, loglikelihood = train_naive_bayes(freqs, train_x, train_y)
print(logprior)
print(len(loglikelihood))

-0.001900000571582794
89541


# Test NB

In [51]:
def naive_bayes_predict(text, logprior, loglikelihood):
    # process the review to get a list of words
    word_l = process_text(text)
    p = 0
    p += logprior

    for word in word_l:

        # check if the word exists in the loglikelihood dictionary
        if word in loglikelihood:
            # add the log likelihood of that word to the probability
            p += (loglikelihood[word])

    return p

In [52]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict):
    accuracy = 0 

    y_hats = []
    for j in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(j, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)

    # error is the average of the absolute values of the differences between y_hats and test_y
    
    errors = sum([1 for y_hat, y in zip(y_hats, test_y) if y_hat != y])
    
    # Total number of reviews
    total_reviews = len(test_y)

    # Calculate error as the number of wrong predictions divided by the total number of reviews
    error = errors / total_reviews

    # Accuracy is 1 minus the error
    accuracy = 1 - error

    return accuracy

## Results

In [59]:
results = test_naive_bayes(test_x, test_y, logprior, loglikelihood, naive_bayes_predict=naive_bayes_predict)
print(f'Accuracy of the model: {results}')

Accuracy of the model: 0.8606


In [54]:
my_review = 'I would say that this was the most awe... wait for it.. some move of all the times!'

p = naive_bayes_predict(my_review, logprior, loglikelihood)
print(p)

0.43732479375137334


In [55]:
my_review = 'Hate this film. Waste of time:(((('

p = naive_bayes_predict(my_review, logprior, loglikelihood)
print(p)

-2.5587916725212834


In [58]:
my_review = 'good movie'

p = naive_bayes_predict(my_review, logprior, loglikelihood)
print(p)

-0.2927880261807836
