## Dataset: https://www.kaggle.com/datasets/yasserh/imdb-movie-ratings-sentiment-analysis

In [1]:
import pandas as pd
import string
import re
import nltk
import time
from nltk.corpus import stopwords
from zipfile import ZipFile

In [2]:
with ZipFile('imdb ratings.zip') as zf:
    f = zf.open('movie.csv')
    df = pd.read_csv(f)

In [3]:
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,0
1,"When I put this movie in my DVD player, and sa...",0
2,Why do people who do not know what a particula...,0
3,Even though I have great interest in Biblical ...,0
4,Im a die hard Dads Army fan and nothing will e...,1


## cleaning the review strings

In [4]:
print(df.shape) #before removing duplicates
df = df.drop_duplicates(ignore_index = True)
print(df.shape) #after removing duplicates

(40000, 2)
(39723, 2)


In [5]:
len(df[:2000]) 

2000

In [6]:
#changing the label: 0 = negative, 1 = positive
df.loc[df['label'] == 0, 'label'] = 'neg'
df.loc[df['label'] == 1, 'label'] = 'pos'
df.head()

Unnamed: 0,text,label
0,I grew up (b. 1965) watching and loving the Th...,neg
1,"When I put this movie in my DVD player, and sa...",neg
2,Why do people who do not know what a particula...,neg
3,Even though I have great interest in Biblical ...,neg
4,Im a die hard Dads Army fan and nothing will e...,pos


In [7]:
# define a function that cleans up each string
def word_processor(text):
    text = "".join([i.lower() for i in text.replace('<br', '') if i not in string.punctuation]) # lower case, remove <br, remove punctuation, remove white spaces
    text = re.sub('\s+',' ', text)
    #text = ' '.join([PorterStemmer().stem(word) for word in text.split()])  #DO I NEED TO STEM THE WORDS?
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

In [8]:
#make a list of tuples --> [([words], corresponding class label)] to train the model on

test_list = []

start = time.time()
for i in range(len(df[:2000])):
    new_string = word_processor(df.iloc[i]['text'])
    sentiment = df.iloc[i]['label']
    test_list.append((new_string.split(' '), sentiment))
end = time.time()

In [9]:
#seeing how long it takes to prepare my list
duration = end - start
print(duration)

91.63279128074646


## defining a feature extractor to use to train my model

In [10]:
import random
random.shuffle(test_list)

In [11]:
#get the 2000 most common words out of the first 2000 rows from the original df
review_words = [i[0] for i in test_list]
words_flat = [word for rev in review_words for word in rev] #merging the list of lists into one giant list

all_words = nltk.FreqDist(w.lower() for w in words_flat) 
comm_words = list(all_words)[:2000]

In [12]:
#feature extractor

def review_features(review):
    words = set(review) #decomposing the review into unique words
    features = {}
    for w in comm_words:
        features[f'contains({w})'] = (w in words)
    return features

## training Naive Bayes classifier

In [13]:
#featuresets will be used to train the model that if a list of words contains some words and does not contain other words
#from the 2000 most common words, that list of words should be paired with its corresponding sentiment.
featuresets = [(review_features(rev), label) for (rev, label) in test_list]

train_set, test_set = featuresets[:1500], featuresets[1500:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [14]:
print('NB model accuracy:', nltk.classify.accuracy(classifier, test_set))

NB model accuracy: 0.834


In [15]:
classifier.show_most_informative_features(5)

Most Informative Features
        contains(wasted) = True              neg : pos    =     15.2 : 1.0
          contains(mess) = True              neg : pos    =     12.2 : 1.0
         contains(worst) = True              neg : pos    =     11.8 : 1.0
         contains(waste) = True              neg : pos    =     11.6 : 1.0
        contains(finest) = True              pos : neg    =     11.3 : 1.0


In [16]:
#Question: do I need to store this classifier by pickling it? 