<a href="https://colab.research.google.com/github/hayden-huynh/Rotten-Tomatoes-Review-Classifier/blob/master/RT_Reviews_NaiveBayes_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment Process

1. Download the Rotten Tomatoes Reviews [dataset](https://www.kaggle.com/datasets/ulrikthygepedersen/rotten-tomatoes-reviews)
2. Text data pre-processing:
  - Lower-casing
  - Punctuation Removal
  - Tokenization
3. Split the original dataset into smaller *train* (70%), *dev* (10%), and *test* (20%) datasets
4. Train the classifier using *train* dataset
  - Calculate and store P(fresh) and P(rotten) priors
  - Calculate and store likelihoods of words
5. Improve the classifier using *dev* dataset
  - Smoothing
  - Float Probability vs Log Probability
  - Stemming and Lemmatization (?)
6. Test accuracy of the classifier using *test* dataset 

# Download Dataset from Kaggle

In [221]:
# Download the Rotten Tomatoes Reviews dataset from Kaggle
# Reference 1 (Ref 1): https://www.analyticsvidhya.com/blog/2021/06/how-to-load-kaggle-datasets-directly-into-google-colab/

# Ref 1 starts =====
! pip install kaggle
! mkdir ~/.kaggle
! cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle datasets download ulrikthygepedersen/rotten-tomatoes-reviews
! unzip rotten-tomatoes-reviews.zip
# ===== Ref 1 ends

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
mkdir: cannot create directory ‘/root/.kaggle’: File exists
rotten-tomatoes-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  rotten-tomatoes-reviews.zip
replace rt_reviews.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: rt_reviews.csv          


# Text Pre-processing

In [237]:
# Reference 2 (Ref 2): https://www.analyticsvidhya.com/blog/2021/06/text-preprocessing-in-nlp-with-python-codes/
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

reviews = pd.read_csv("/content/rt_reviews.csv", encoding="latin-1")
nltk.download('stopwords')
nltk.download('wordnet')

print("----- Samples -----")
print(reviews.head(5))

print("\n----- Summary -----")
print(reviews.describe())

print("\n----- NLTK Stopwords -----")
nltk_stop_words = stopwords.words('english')
stop_words = ['the','a','and','of','is','to','it','in','that','its']
stop_words += ['with','but','this','for','as','an','on','be']
stop_words += ['film','movie','not','you','at','by','from','are','has','more','like','than']
stop_words += ['one','about','his','all','if','have','so','or']
stop_words += ['story','what','into','just','up','even']
stop_words += ['i','good','films','some','out']
# stop_words += ['much','was','which','who','will','can','time','their','too','only','there','doesnt','make']
# stop_words += ['full','way','while','when','makes','been','characters','comedy','most','any','theres']
# stop_words += ['no','life','review','director','movies','us','feels','enough','would','they','isnt']
# stop_words += ['may','he','work','also','could','really','thats','action','how']
# stop_words += ['very','we','drama','still','get','here','plot','do']
# stop_words += ['spanish','performance','performances','might','many','nothing','two','something','should']
# stop_words += ['her','your','made','cant','them','off','does','first','never','little']
# stop_words += ['½ï','both','see','seems','being','through','script','over','dont']
# stop_words += ['new','world','those','end','long','funny','well','character','without']
print(stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


----- Samples -----
  Freshness                                             Review
0     fresh   Manakamana doesn't answer any questions, yet ...
1     fresh   Wilfully offensive and powered by a chest-thu...
2    rotten   It would be difficult to imagine material mor...
3    rotten   Despite the gusto its star brings to the role...
4    rotten   If there was a good idea at the core of this ...

----- Summary -----
       Freshness                    Review
count     480000                    480000
unique         2                    339697
top        fresh   Parental Content Review
freq      240000                       166

----- NLTK Stopwords -----
['the', 'a', 'and', 'of', 'is', 'to', 'it', 'in', 'that', 'its', 'with', 'but', 'this', 'for', 'as', 'an', 'on', 'be', 'film', 'movie', 'not', 'you', 'at', 'by', 'from', 'are', 'has', 'more', 'like', 'than', 'one', 'about', 'his', 'all', 'if', 'have', 'so', 'or', 'story', 'what', 'into', 'just', 'up', 'even', 'i', 'good', 'films', 'some

In [238]:
# Lower-case all words
reviews["Review"] = reviews["Review"].apply(lambda r: r.lower())


# Remove Punctuations
def remove_punctuations_1(text):
  punc_free = "".join([char for char in text if char not in string.punctuation])
  return punc_free

def remove_punctuations_2(words):
  for i, w in enumerate(words):
    punc_free = "".join([char for char in w if char not in string.punctuation])
    words[i] = punc_free
  return words

reviews["Review_tokens"] = reviews["Review"].apply(lambda r: remove_punctuations_1(r))

# nltk_stop_words = list(map(lambda r: remove_punctuations_1(r), nltk_stop_words))
# stop_words += nltk_stop_words


# Tokenizing and Removing Duplicate Words
def tokenize(text):
  tokens = re.split("\W+", text)
  tokens = list(filter(None, tokens))
  return sorted(list(set(tokens)))

reviews["Review_tokens"] = reviews["Review_tokens"].apply(lambda r: tokenize(r))


# Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatize(words):
  lemm_words = [wordnet_lemmatizer.lemmatize(word) for word in words]
  return list(set(lemm_words))

reviews["Review_tokens"] = reviews["Review_tokens"].apply(lambda words: lemmatize(words))


# Remove stopwords
def remove_stopwords(words):
  output = [w for w in words if w not in stop_words]
  return output

reviews["Review_tokens"] = reviews["Review_tokens"].apply(lambda words: remove_stopwords(words))


reviews.sample(5)

Unnamed: 0,Freshness,Review,Review_tokens
432675,fresh,the best thing about the ultimately involving...,"[make, outcome, already, best, end, look, enou..."
117502,rotten,a typically underwhelming effort from david o...,"[underwhelming, effort, russell, typically, da..."
65329,fresh,exactly the biopic barnum would've made about...,"[forget, barnum, wouldve, aisle, biopic, way, ..."
188852,fresh,"it's enjoyable and fun to watch, but just bar...","[own, watch, enjoyable, barely, fun, surface, ..."
46632,rotten,at least with the jackass films you could sen...,"[work, could, roller, broken, down, poorly, se..."


# Split the Original Dataset

In [239]:
# Reference 3 (Ref 3): https://stackoverflow.com/questions/43777243/how-to-split-a-dataframe-in-pandas-in-predefined-percentages 

# Ref 3 starts =====
def split_by_fractions(df, fracs, random_state=0):
    remain = df.index.copy().to_frame()
    res = []
    for i in range(len(fracs)):
        fractions_sum = sum(fracs[i:])
        frac = fracs[i]/fractions_sum
        idxs = remain.sample(frac=frac, random_state=random_state).index
        remain=remain.drop(idxs)
        res.append(idxs)
    return [df.loc[idxs] for idxs in res]
# Ref 3 ends =====

random_state = 1
train, dev, test = split_by_fractions(reviews, [0.7, 0.1, 0.2], random_state)
print(train.shape, dev.shape, test.shape)

(336000, 3) (48000, 3) (96000, 3)


# Training

In [240]:
from decimal import Decimal

train_fresh = train.loc[train["Freshness"] == "fresh"]
train_rotten = train.loc[train["Freshness"] == "rotten"]

# P(fresh) and P(rotten) priors
p_fresh = Decimal(len(train_fresh) / len(train))
p_rotten = Decimal(len(train_rotten) / len(train))

print(f'P(fresh) = {p_fresh}')
print(f'P(rotten) = {p_rotten}')

P(fresh) = 0.50085119047619042209618100969237275421619415283203125
P(rotten) = 0.499148809523809522392667759049800224602222442626953125


In [241]:
# Count word occurences
occ_fresh = {}
occ_rotten = {}

for words in train_fresh.loc[:,"Review_tokens"]:
  for w in words:
    if w not in occ_fresh.keys():
      occ_fresh[w] = 1
    else:
      occ_fresh[w] += 1

for words in train_rotten.loc[:,"Review_tokens"]:
  for w in words:
    if w not in occ_rotten.keys():
      occ_rotten[w] = 1
    else:
      occ_rotten[w] += 1

In [242]:
# Calculate word probabilities given fresh or rotten
probs_fresh = {}
probs_rotten = {}

def calc_word_likelihood(count, alpha, h):
  if h == "fresh":
    return Decimal((count + alpha) / (len(train_fresh) + alpha * len(occ_fresh)))
  elif h == "rotten":
    return Decimal((count + alpha) / (len(train_rotten) + alpha * len(occ_rotten)))

def calc_prob(alpha=0):
  for word, count in occ_fresh.items():
    probs_fresh[word] = calc_word_likelihood(count, alpha, "fresh")
  
  for word, count in occ_rotten.items():
    probs_rotten[word] = calc_word_likelihood(count, alpha, "rotten")

alpha = 1

calc_prob(alpha)



# Derive top 10 most influential words for each class

probs_fresh_sorted = dict(sorted(probs_fresh.items(), key=lambda item: item[1], reverse=True))
probs_rotten_sorted = dict(sorted(probs_rotten.items(), key=lambda item: item[1], reverse=True))

print("----- Fresh Top 10 -----")
i = 0
for key, value in probs_fresh_sorted.items():
  print(f"{key}: {value}")
  i += 1
  if i == 10:
    break

print("\n----- Rotten Top 10 -----")
j = 0
for key, value in probs_rotten_sorted.items():
  print(f"{key}: {value}")
  j += 1
  if j == 10:
    break

----- Fresh Top 10 -----
ha: 0.0465814100097428351876516217089374549686908721923828125
make: 0.038643862421299975118405001239807461388409137725830078125
most: 0.034124495460164890048293528934664209373295307159423828125
performance: 0.033690570733824025395630741286367992870509624481201171875
there: 0.032986466460893558438893791162627167068421840667724609375
who: 0.031762471242252808545369902049060328863561153411865234375
time: 0.0303256072899354030270036020056068082340061664581298828125
character: 0.030034959595876896398802813337169936858117580413818359375
will: 0.028692249121916472975879486284611630253493785858154296875
best: 0.0274641602737819394419727103695549885742366313934326171875

----- Rotten Top 10 -----
there: 0.0448562471038602694761721068061888217926025390625
ha: 0.04412568933343909149957795534646720625460147857666015625
much: 0.03860267258905498988230675649901968427002429962158203125
make: 0.0370998108899028566920463845235644839704036712646484375
too: 0.03701631857328329527

# Experimenting with *dev* dataset

In [243]:
import csv
import os

# Function to classify a review
def classify(review_words, alpha):
  for w in review_words:
    if w not in probs_fresh.keys():
      probs_fresh[w] = calc_word_likelihood(0, alpha, "fresh")
    if w not in probs_rotten.keys():
      probs_rotten[w] = calc_word_likelihood(0, alpha, "rotten")
  
  chance_fresh = p_fresh
  chance_rotten = p_rotten
  for w in review_words:
    chance_fresh = chance_fresh * probs_fresh[w]
    chance_rotten = chance_rotten * probs_rotten[w]
  
  if chance_fresh > chance_rotten:
    return "fresh"
  else:
    return "rotten"


# Function to classify a review, log10 applied to avoid underflowing floats
def classify_log(review_words, alpha):
  for w in review_words:
    if w not in probs_fresh.keys():
      probs_fresh[w] = calc_word_likelihood(0, alpha, "fresh")
    if w not in probs_rotten.keys():
      probs_rotten[w] = calc_word_likelihood(0, alpha, "rotten")

  chance_fresh = p_fresh.log10()
  chance_rotten = p_rotten.log10()
  for w in review_words:
    chance_fresh = chance_fresh + probs_fresh[w].log10()
    chance_rotten = chance_rotten + probs_rotten[w].log10()

  if chance_fresh > chance_rotten:
    return "fresh"
  else:
    return "rotten"


# Function to test entire dataset given
def test_accuracy(dataset, alpha, use_log=False, csv_writer=None):
  correct = 0
  
  for index, row in dataset.loc[:,["Freshness", "Review_tokens"]].iterrows():
    result = ""

    if use_log:
      result = classify_log(row["Review_tokens"], alpha)
    else:
      result = classify(row["Review_tokens"], alpha)
    
    if row["Freshness"] == result:
      correct += 1
  
  accuracy = round(correct / len(dataset) * 100, 4)

  if csv_writer != None:
    csv_writer.writerow([alpha, accuracy])
  
  print(f"Successfully classified {correct}/{len(dataset)} ({accuracy}%) correctly")



# Experiment with Smoothing

# dev_smoothing = open(f"dev_smoothing_log.csv", "a", newline='')
# dev_smoothing_writer = csv.writer(dev_smoothing)
# if (os.path.getsize(f"/content/dev_smoothing_log.csv") == 0):
#   dev_smoothing_writer.writerow(["alpha", "accuracy"])

test_accuracy(dev, alpha)
# dev_smoothing.flush()

Successfully classified 38561/48000 (80.3354%) correctly


# Final Accuracy with *test* dataset

In [244]:
# test_accuracy(test, alpha)