# Rotten Tomatoes Sentiment Classification
Rotten Tomatoes Dataset (https://www.kaggle.com/datasets/andrezaza/clapper-massive-rotten-tomatoes-movies-and-reviews).


## Section 0: Importing libraries

Add all of the imports you will use in this analysis here.  If you choose to install any other libraries, make sure to keep that code in this section.

In [1]:
!pip install gdown
!pip install sweetviz
!pip install fast-langdetect
!pip install langdetect
!pip install contractions
!pip install spacy
!pip install joblin

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Downloading Spacy
!python -m spacy download es_core_news_sm
!python -m spacy download fr_core_news_sm

# Downloading NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')

from concurrent.futures import ProcessPoolExecutor
from collections import Counter
from IPython.display import display, HTML # Need for inline HTML display
from fast_langdetect import detect # for speed
from langdetect import LangDetectException
from textblob import TextBlob
from spacy.util import minibatch
from joblib import Parallel, delayed # for speed
from pprint import pprint
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score as sklearn_f1_score, fbeta_score, precision_recall_fscore_support
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, precision_recall_curve, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import fasttext
import fasttext.util

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gdown
import numpy as np
import scipy.stats as stats
import sweetviz as sv
import string
import re
import html
import contractions
import textblob
import spacy
import pickle
import torch
import requests
import tabulate

Collecting sweetviz
  Downloading sweetviz-2.3.1-py3-none-any.whl.metadata (24 kB)
Downloading sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sweetviz
Successfully installed sweetviz-2.3.1
Collecting fast-langdetect
  Downloading fast_langdetect-0.2.2-py3-none-any.whl.metadata (5.3 kB)
Collecting fasttext-wheel>=0.9.2 (from fast-langdetect)
  Downloading fasttext_wheel-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting robust-downloader>=0.0.2 (from fast-langdetect)
  Downloading robust_downloader-0.0.2-py3-none-any.whl.metadata (4.1 kB)
Collecting pybind11>=2.2 (from fasttext-wheel>=0.9.2->fast-langdetect)
  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Collecting colorlog (from robust-downloader>=0.0.2->fast-langdetect)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


## Section 1: Getting Data

1) Download data from [here](https://drive.google.com/file/d/1DbyBgw3riR__gdEk1y2Xpo2VbBbXn3Eh/view?usp=sharing).  Note: this may take 15-20 seconds or so.

2) Upload the data in this Colab environment (heads up: you will lose this dataset when you restart your session)

3) Read the dataset into this notebook

In [2]:
# Download ID for Rotten Tomatoes Reviews
download_id = '1DbyBgw3riR__gdEk1y2Xpo2VbBbXn3Eh'
download_url = f'https://drive.google.com/uc?id={download_id}'

# Downloading to output path
output_path = 'tomato_reviews.csv'
gdown.download(download_url, output_path, quiet=False)
RT_df = pd.read_csv(output_path) # Rotten Tomatoes Review DF
RT_df.head()

Downloading...
From: https://drive.google.com/uc?id=1DbyBgw3riR__gdEk1y2Xpo2VbBbXn3Eh
To: /content/tomato_reviews.csv
100%|██████████| 94.4M/94.4M [00:01<00:00, 72.8MB/s]


Unnamed: 0,id,reviewId,creationDate,criticName,isTopCritic,originalScore,reviewState,publicatioName,reviewText,scoreSentiment,reviewUrl
0,small_town_wisconsin,102711819,2022-07-22,Peter Gray,False,,fresh,This is Film,Small Town Wisconsin could hit some home truth...,POSITIVE,https://thisisfilm.com/review/small-town-wisco...
1,small_town_wisconsin,102711545,2022-07-22,Tim Grierson,True,,fresh,Screen International,This low-key drama has lovely interludes and s...,POSITIVE,https://www.screendaily.com/reviews/small-town...
2,small_town_wisconsin,102700937,2022-06-16,Sumner Forbes,False,8.5/10,fresh,Film Threat,Small Town Wisconsin is a success in almost ev...,POSITIVE,https://filmthreat.com/reviews/small-town-wisc...
3,small_town_wisconsin,102699897,2022-06-14,Tara McNamara,False,3/5,fresh,Common Sense Media,Just like Wayne&#44; Small Town Wisconsin has ...,POSITIVE,https://www.commonsensemedia.org/movie-reviews...
4,small_town_wisconsin,102698744,2022-06-10,Rob Thomas,False,3/4,fresh,"Capital Times (Madison, WI)",It&#8217;s a movie with its heart in the right...,POSITIVE,https://captimes.com/entertainment/screens/sma...


In [6]:
# Set the option to display the full content of the 'reviewText' column
pd.set_option('display.max_colwidth', None)

# Filter rows containing the word "arrival" in the 'id' column (case insensitive)
arrival_ids = RT_df[RT_df['id'].str.contains('arrival', case=False, na=False)]

# Show only the 'publicationName' and 'reviewText' columns
arrival_ids_filtered = arrival_ids[['publicatioName', 'reviewText']]

# Pretty print the result using tabulate
print(arrival_ids_filtered.to_string(index=False))

                           publicatioName                                                                                                                                                                                                                                                                 reviewText
                      The Virginian-Pilot                                                                                                                                                      E.T. would find it false propaganda, but for fans of the genre, it's standard sci-fi with all the needed ingredients.
                      Gone With The Twins                                                                                                                    The film starts to fall apart at the introduction of the aliens themselves, which suffer from dated special effects and an immediate aura of disbelief.
                               Cinegarage                                

4) Print the column names and number of rows/columns in the dataset

In [3]:
# Printing the RT_df column names with formatting
print("Rotten Tomato Columns:")
pprint(list(RT_df.columns))

Rotten Tomato Columns:
['id',
 'reviewId',
 'creationDate',
 'criticName',
 'isTopCritic',
 'originalScore',
 'reviewState',
 'publicatioName',
 'reviewText',
 'scoreSentiment',
 'reviewUrl']


In [4]:
# Printing the RT_df number of rows/columns with formatting
rows, cols = RT_df.shape
print(f"The DataFrame has {rows} rows and {cols} columns.")

The DataFrame has 294679 rows and 11 columns.


## EDA

1) What time frame is represented in this dataset?

In [5]:
# What are the review states/is there a reviewed state?
print(RT_df['reviewState'].value_counts()) # All completed reviews

reviewState
fresh     219628
rotten     75051
Name: count, dtype: int64


In [6]:
# Converting to time format
RT_df['creationDate'] = pd.to_datetime(RT_df['creationDate'])

# Generating first review, last review, and total time range
# First review (earliest creationDate)
first_review_time = RT_df['creationDate'].min().date()
# Last review (latest creationDate)
last_review_time = RT_df['creationDate'].max().date()

time_range = (last_review_time - first_review_time).days

print(f"First Review Time: {first_review_time}")
print(f"Last Review Time: {last_review_time}")
print(f"Total Time Range: {time_range} days")

First Review Time: 2020-01-02
Last Review Time: 2023-04-08
Total Time Range: 1192 days


2) How many top critics are there?

In [7]:
# What are the itTopCritic Values
RT_df['isTopCritic'].value_counts()

Unnamed: 0_level_0,count
isTopCritic,Unnamed: 1_level_1
False,233965
True,60714


In [8]:
# How Many Top Critics are there
print(f"There are {RT_df['isTopCritic'].sum()} top critics")

There are 60714 top critics


3) Do top Critics rate more favorably than regular critics?  Hint: Use `scoreSentiment` to answer

In [9]:
def confidence_interval(data, confidence_level=0.95):
    # Calculate sample mean, standard deviation, and sample size
    mean = np.mean(data)
    std = np.std(data, ddof=1)  # Sample standard deviation
    n = len(data)

    # Calculate the margin of error
    # Using a PPF for a point % vs. cumulative distribution < or >
    z_score = stats.norm.ppf(1 - (1 - confidence_level) / 2)
    margin_of_error = z_score * (std / np.sqrt(n))

    # Calculate confidence interval
    lower_bound = mean - margin_of_error
    upper_bound = mean + margin_of_error

    return lower_bound, upper_bound, mean, margin_of_error

# Need to turn sentiment into numerical encoding
sentiment_mapping = {
    'POSITIVE': 1,
    'NEGATIVE': -1
}

RT_df['scoreSentimentNumeric'] = RT_df['scoreSentiment'].map(sentiment_mapping).astype(int)

In [None]:
# Filter the data for top critics and regular critics
top_critics = RT_df[RT_df['isTopCritic'] == True]['scoreSentimentNumeric']
regular_critics = RT_df[RT_df['isTopCritic'] == False]['scoreSentimentNumeric']

top_critics_ci_lower, top_critics_ci_upper, top_critics_mean, top_critics_ME = confidence_interval(top_critics)
regular_critics_ci_lower, regular_critics_ci_upper, regular_critics_mean, regular_critics_ME = confidence_interval(regular_critics)

print(f"Top Critics - Mean sentiment: {top_critics_mean:.2f}, CI: ({top_critics_ci_lower:.3f}, {top_critics_ci_upper:.3f})")
print(f"Regular Critics - Mean sentiment: {regular_critics_mean:.2f}, CI: ({regular_critics_ci_lower:.3f}, {regular_critics_ci_upper:.3f})")

In [None]:
# Using the Confidence Interval to make a more attuned reccomendation
# Compare the confidence intervals
if top_critics_ci_upper < regular_critics_ci_lower:
    print("Regular critics rate more favorably (no overlap in CI).")
elif regular_critics_ci_upper < top_critics_ci_lower:
    print("Top critics rate more favorably (no overlap in CI).")
else:
    print("There is overlap in the confidence intervals, so we cannot "
    "confidently say whether top or regular critics rate more favorably.")

4) (Not required) Is there anything else you want to explore? Feel free to add more cells as needed!

## Visualization

1. Visualize the top 10 publications by number of reviews

In [None]:
# Getting the top 10 publications ordered by # Reviews
publication_counts = RT_df['publicatioName'].value_counts().head(10)
publication_counts = publication_counts.round().astype(int)
publication_counts

In [None]:
# Setting up the visualizations
plt.figure(figsize=(10, 6))
sns.set(style="whitegrid")
# Create a bar plot
ax = sns.barplot(x=publication_counts.values, y=publication_counts.index, palette="coolwarm")

plt.title('Top 10 Publications by Number of Reviews', fontsize=16, weight='bold', color='navy') # Navy color to match coolwarm
plt.xlabel('Number of Reviews', fontsize=14, weight='bold', color='darkslategray')
plt.ylabel('Publication Name', fontsize=14, weight='bold', color='darkslategray')

# Labels on each bar
for p in ax.patches:
    # .get_height() / 2 to place label in middle of bar
    # .get_width() used to get right edge of each bar
    ax.annotate(f'{int(p.get_width())}', (p.get_width() + 0.2, p.get_y() + p.get_height() / 2),
                ha='center', va='center', fontsize=12, color='black')

plt.tight_layout() # Avoids overlapping margins and white space
plt.show()

2. (Not required) Is there anything else you would like to visulalize? Feel free to add more cells as needed!

## Sweetviz Overview
Sweetviz is a fantastic tool for accelerating the EDA process by providing rich, intuitive visualizations, helping data scientists and analysts understand their datasets more effectively.

It can also compare two datasets to see differences quickly for intricate drift analysis. Supports addition of a target variable as well.

## Takeaway from the Sweetviz Report
**Multiple Rating Systems in the 'Original Score' Column**:

 - The **'Original Score'** column appears to contain different reveiw scales, indicating a need for **standardization**. This is worthy of additional exploration.

 - The **'Original Score'** column is also missing a high-degree of data (39%)

In [None]:
# Sweet Visualizer Report - addition of target of scoreSentiment
report = sv.analyze(RT_df)

In [None]:
# Generate HTML report
report.show_html('RT_df_report.html')  # Save report as HTML file

# Use ipython functions to visualize in notebook
display(HTML('RT_df_report.html'))  # Sweetviz .show_notbook() might be possible

## NLP

# Text Preprocessing Pipeline

### Pre-Language Detection Text Cleaning Overview

1. **HTML Encoding Removal**:
   - The function decodes HTML entities (e.g., `&#46;` becomes `.`) to ensure that the review text is in a readable format for all downstream steps
   
2. **Non-ASCII Character Removal**:
   - It removes non-ASCII characters (such as hidden or special whitespace characters) to avoid issues with unexpected or non-standard characters.
   
3. **New Line Character Handling**:
   - Multiple new lines are replaced with a single space to prevent fragmented sentences and maintain the text flow.

4. **Lower Case, Contraction Expansion**
   - Tested as increasing performance of langdetect

5. **Bracket Removal**
   - This is normally done for spelling correction in journalism

### Confidence Score for English Classification in `langdetect`

When using the **langdetect** library to classify whether a review is in English, a **confidence score** is returned along with the detected language. This score indicates the level of certainty the language detection algorithm has about the classification. Lang Detect is used to remove stop words in the appropriate language (english, french, and spanish supported)

5.  **Stop Word Removal**
    - Stop word removal is done across **English, Spanish, and French**

6. **Lemmatization**
    - This will convert words to their base form. This increases success of bi-gram calculation and sentiment analysis as it preserves meaning (e.g. 'Man is running' and 'Man ran' are the same)
    - Considered retaining ? and ! for sentiment analysis, but will do future improvement

7. **Original Score Clean-up**
    - Handling a variety of diverse scores and cleaning them up into a 0-1 scale for downstream regression



In [None]:
# Function to clean the text: designed for preprocessing to language detection
def clean_text_lang(text):
    if isinstance(text, str):
        # Convert HTML entities to their corresponding characters
        text = html.unescape(text)

        # Expand contractions
        text = contractions.fix(text)

        # Convert text to lowercase
        # Tested as appropriate with langdetect and improves performance.
        text = text.lower()

        # Remove any non-ASCII characters (e.g., special characters, zero-width spaces)
        text = re.sub(r'[^\x00-\x7F]+', '', text)

        # Remove newline characters and carriage returns
        text = re.sub(r'\n+', ' ', text)  # Replaces newlines with space
        text = re.sub(r'\r+', ' ', text)  # Removes carriage returns

        # Remove square brackets but keep anything between them - this is usually a spell correction
        text = re.sub(r'\[|\]', '', text)  # Removes '[' and ']'

        # Clean up extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()

        return text
    else:
        return ''  # Return an empty string for non-string values

# Get lang_code and is_english features by first cleaning the text
RT_df['cleaned_reviewText'] = RT_df['reviewText'].apply(lambda x: clean_text_lang(x))

In [None]:
# Function to detect the language of the review text optimized for large volumes
def detect_language(text):
    try:
        # Detect the language: dictionary with 'lang' and 'score'
        result = detect(text)
        lang = result['lang']
        score = result['score']
        is_english = lang == 'en'  # Check if the language is English
        return lang, score, is_english
    except LangDetectException as e:  # Handle langdetect-specific exceptions
        print(f"Error detecting language for text: {text[:100]}...")
        return None, 0, False

RT_df[['lang_code','lang_score','is_english']] = RT_df['cleaned_reviewText'].apply(lambda x: pd.Series(detect_language(x)))

non_english_rows = RT_df[RT_df['is_english'] == False]

# Checking total number of english detected rows
RT_df['is_english'].value_counts()

In [None]:
# Want only high confidence classifcation of FR and ES
RT_df['final_lang'] = RT_df.apply(
    lambda row: 'es' if row['lang_score'] > 0.8 and row['lang_code'] == 'es' else
                ('fr' if row['lang_score'] > 0.8 and row['lang_code'] == 'fr' else 'en'),
    axis=1
)

### Text Cleaning and English Classification Check

Below is a scrollable table that displays the original review text, the cleaned review text, and whether the review is in English or not. This allows for easy inspection of how the reviews were cleaned and if the language detection was accurate.


In [None]:
# Small number of spanish review and french reviews
for index, row in non_english_rows.iterrows():
    print(f"Lang Code: {row['lang_code']}")
    print(f"Lang Score: {row['lang_score']}")
    print(f"Is English: {row['is_english']}")
    print(f"Review Text: {row['reviewText']}")
    print(f"Cleaned Text: {row['cleaned_reviewText']}")
    print("\n" + "-"*50 + "\n")  # Separator between reviews

1) Remove all stop words from `reviewText`

In [None]:
# Load stopword lists for English, Spanish, and French
english_stopwords = set(stopwords.words('english'))
spanish_stopwords = set(stopwords.words('spanish'))
french_stopwords = set(stopwords.words('french'))
all_stopwords = english_stopwords.union(spanish_stopwords, french_stopwords)

# Function to expand contractions (It's) and stop words
def remove_stopwords(text):
    # Tokenize the text into words
    words = word_tokenize(text)

    # Remove stopwords from the list
    filtered_words = [word for word in words if word.lower() not in all_stopwords]

    # Reassemble the text from the filtered words
    return ' '.join(filtered_words)

RT_df['cleaned_reviewText_no_stopwords'] = RT_df['cleaned_reviewText'].apply(remove_stopwords)

### English Stop Word Removal Check

In [None]:
# Unit testing (randomized) for appropriate stop-code removal
# 5 most common stopwords
stop_words = ['the', 'is', 'in', 'and', 'to']

# Filter rows where at least one stop word is present in the original review text
filtered_rows = RT_df[RT_df['cleaned_reviewText'].apply(lambda x: any(word in x.lower() for word in stop_words))]

# Randomly select 10 rows from the filtered rows
random_filtered_rows = filtered_rows.sample(n=10, random_state=42)  # You can adjust the number of rows as needed

# Loop through the randomly selected rows and display the results
for index, row in random_filtered_rows.iterrows():
    print(f"Original Review Text: {row['reviewText']}")
    print(f"Cleaned Review Text: {row['cleaned_reviewText']}")
    print(f"Cleaned Review Text (No Stopwords): {row['cleaned_reviewText_no_stopwords']}")
    print("\n" + "-"*50 + "\n")  # Separator between reviews

### Spanish Stop Word Removal Check


In [None]:
# Filter the rows where lang_code is 'es' and lang_score is greater than 0.90
filtered_spanish_reviews = RT_df[(RT_df['lang_code'] == 'es') & (RT_df['lang_score'] > 0.90)]

# Loop through the filtered rows and display the review texts and their language scores
for index, row in filtered_spanish_reviews.iterrows():
    print(f"Original Review Text: {row['reviewText']}")
    print(f"Cleaned Review Text: {row['cleaned_reviewText']}")
    print(f"Cleaned Review Text (No Stopwords): {row['cleaned_reviewText_no_stopwords']}")
    print("\n" + "-"*50 + "\n")  # Separator between reviews

2. What is the most common bigram in reviewText (after stopword removal)?

Note: Lemmatization is a higher performance option for better n-gram and sentiment analysis, but could take many hours to run.


In [None]:
# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Function to remove punctuation, digits, and stop words
def remove_unwanted_tokens_with_pos(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Perform POS tagging
    tagged_tokens = nltk.pos_tag(tokens)

    # Keep only the tokens with useful POS tags (e.g., nouns, verbs, adjectives, adverbs)
    useful_pos_tags = {'NN', 'VB', 'JJ', 'RB', 'NNS', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}  # You can add more tags if needed
    #     'NN': 'Singular noun (e.g., dog)',        # Nouns in singular form
    #     'VB': 'Base form verb (e.g., run)',       # Verbs in base form
    #     'JJ': 'Adjective (e.g., big)',            # Adjectives describing nouns
    #     'RB': 'Adverb (e.g., quickly)',           # Adverbs modifying verbs, adjectives, or other adverbs
    #     'NNS': 'Plural noun (e.g., dogs)',       # Nouns in plural form
    #     'VBD': 'Past tense verb (e.g., ran)',     # Verbs in past tense
    #     'VBG': 'Gerund/present participle verb (e.g., running)',  # Verb form used for continuous actions
    #     'VBN': 'Past participle verb (e.g., eaten)',  # Verb form used in perfect tenses
    #     'VBP': 'Non-3rd person singular present verb (e.g., eat)',  # Present tense verb (except for third-person singular)
    #     'VBZ': '3rd person singular present verb (e.g., eats)'  # Present tense verb for third-person singular subjects

    filtered_tokens = [
        token.lower() for token, tag in tagged_tokens
        if tag[:2] in useful_pos_tags
    ]

    return filtered_tokens

# Function to lemmatize a single document
def lemmatize_text(text):
    tokens = remove_unwanted_tokens_with_pos(text)  # Clean and filter tokens
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatize each token
    return " ".join(lemmatized_tokens)  # Return the lemmatized text as a string

# Function to process a list of texts in parallel
def parallel_lemmatization(texts):
    with ProcessPoolExecutor() as executor:
        results = list(executor.map(lemmatize_text, texts))
    return results

# Apply the parallel lemmatization to the 'cleaned_reviewText_no_stopwords' column
texts_to_process = RT_df['cleaned_reviewText_no_stopwords'].astype(str).tolist()  # Convert to list of strings

lemmatized_reviews = parallel_lemmatization(texts_to_process)

# Add the lemmatized text back to the DataFrame
RT_df['lemmatized_reviewText'] = lemmatized_reviews

# Optionally, inspect the result
print(RT_df[['cleaned_reviewText_no_stopwords', 'lemmatized_reviewText']].head())


In [None]:
# Generate bigrams for the cleaned review text
def get_top_bigrams(text_series, top_n=10):
    # Generate bigrams for the cleaned review text
    bigrams = []
    for review in text_series:
        tokens = word_tokenize(review)
        review_bigrams = list(ngrams(tokens, 2))  # Generate bigrams
        bigrams.extend(review_bigrams)  # Add them to the list of all bigrams

    # Count frequency of each bigram
    bigram_counts = Counter(bigrams)

    # Get the top N most common bigrams
    top_bigrams = bigram_counts.most_common(top_n)

    return top_bigrams

top_10_bigrams = get_top_bigrams(RT_df['lemmatized_reviewText'], top_n=10)

# Print the top 10 bigrams
print("Top 10 Most Common Bigrams:")
for bigram, count in top_10_bigrams:
    print(f"Bigram: {bigram}, Count: {count}")

### Top Bi-Gram

In [None]:
# Getting the top bi-gram
top_bigram, top_bigram_count = top_10_bigrams[0]
print(f"Top Bi-Gram: {top_bigram}, Count: {top_bigram_count}")

3. Normalize the `originalScore` column to prepare it for downstream regression modeling.

    One option is the following, though you are free to use any other normalization procedure.

    * Limit the data to only those `originalScore`s with the format `x/y` where `y > x`.
    * Perform the division in the remaining strings to get a number between 0 and 1.

### Overview of Cleaning:
- Handle Case of Letter-plus and Letter-minus conversion
- Handle Case of Conversion of 'OF', 'OUT OF' and 'OUT' to a fraction
- Remove unwanted characters (?, ' for now)
- Remove 'STARS' in prep for other transformations
- Handle European decimal Formatting ',' to '.'; discovered '.' to '/' typo during this
- Convert '\' to '/' which can be a common issue between OS
- Map typed letters to numbers

### Transformation Design:
- Handle raw numeric conversion. Assume 0<=4 is 0-4 scale, 4<=10 is 0-10 scale and 10<=100 0-100 scale. (0-5 not accounted)
- Do a fraction conversion with error catching
- Do a 0-4.0 letter grade GPA transformation

In [None]:
# NOTE: We assume a 5 is on a 0-10 scale in this code, 0-5 scale is nor currently supported

# Word-Grade mapping
word_to_number = {
    'ZERO': 0, 'ONE': 1, 'TWO': 2, 'THREE': 3, 'FOUR': 4, 'FIVE': 5,
    'SIX': 6, 'SEVEN': 7, 'EIGHT': 8, 'NINE': 9, 'TEN': 10
}

# Grade mapping (normalized scores)
grade_mapping = {
    'A+': 4.0, 'A': 4.0, 'A-': 3.7,
    'B+': 3.3, 'B': 3.0, 'B-': 2.7,
    'C+': 2.3, 'C': 2.0, 'C-': 1.7,
    'D+': 1.3, 'D': 1.0, 'D-': 0.7,
    'F': 0.0
}

def normalize_score(score):
    test = 0
    try:
        # Step 1: Convert score to string, strip whitespace and handle uppercase
        score = str(score).upper().strip()
        # Ensure regex is all capital

        # Step 2: Convert 'C-plus' to 'C+' and 'C-minus' to 'C-', 'B-plus' to 'B+', etc.
        score = re.sub(r'([A-F])-PLUS', r'\1+', re.sub(r'([A-F])-MINUS', r'\1-', score))

        # Step 3: Convert 'X of Y' or 'X out of Y' to 'X/Y'
        score = re.sub(r'\s*(OF|OUT OF|OUT)\s*', '/', score)

        # Step 4: Remove unwanted characters (e.g., '?')
        score = re.sub(r'[?]', '', score)  # Remove '?' characters

        # Step 5: Remove 'stars' first, before processing
        score = re.sub(r'\s*STARS?\s*', '', score)  # Remove any spaces around 'stars'

        # Step 6: Convert commas to decimals (e.g., '1,5' to '1.5')
        score = score.replace(',', '.')

        # Step 7: Handle 'backslash' case (convert '\' to '/')
        score = score.replace('\\', '/')

        # Step 8: Handle '4.5.5' case (convert second dot to '/')
        if re.match(r'\d+\.\d+\.\d+', score):  # e.g., '4.5.5'
            score = score.replace('.', '/', 1)  # Replace first dot with '/'

        # Step 9: Remove unnecessary whitespace
        score = score.replace(' ', '')   # Remove spaces completely

        # Step 10: Remove unwanted apostrophes (e.g., '7/1\' to '7/1')
        score = score.replace("'", "")

        # Step 11: Handle written-out numbers (e.g., "three", "three stars")
        score_words = score.split()
        for i, word in enumerate(score_words):
            word_lower = word.lower()
            if word_lower in word_to_number:
                score_words[i] = str(word_to_number[word_lower])

        # Rejoin the words back into a single string
        score = ' '.join(score_words)

        # Step 12: Handle string numeric values like '9.0', '4' or '3.5'
        if isinstance(score, str):
            try:
                score_value = float(score)  # Try to convert
                if 0 <= score_value <= 4:
                    return score_value / 4.0  # Normalize out of 4
                elif 4 < score_value <= 10:
                    return score_value / 10.0  # Normalize out of 10
                elif 10 < score_value <= 100:
                    return score_value / 100.0  # Normalize out of 100
                else:
                    pass
            except ValueError:
                pass  # If conversion fails, proceed with other checks

        # Step 13: Handle #/# format (e.g., '8/10' or '3/4')
        if isinstance(score, str) and '/' in score:
            numerator, denominator = score.split('/')
            try:
                numerator = float(numerator.strip())
                denominator = float(denominator.strip())
                if denominator > 0:
                    return numerator / denominator  # Normalize by dividing
                else:
                    pass
            except ValueError:
                pass

        # Step 14: Handle GPA grades like 'A+', 'B-', etc.
        elif isinstance(score, str):
            # Handle special cases: 'F-' should be considered as 'F'
            if score.startswith('F'):
                score = 'F'

            # Handle valid GPA grades
            if score in grade_mapping:
                value = grade_mapping[score]
                return value / 4.0  # Normalize out of 4
            else:
                return None

        return None

    except Exception as e:
        print(f"Error: {e} for score: {score}")
        return None

# Apply the normalization
RT_df['normalizedScore'] = RT_df['originalScore'].apply(normalize_score)

# Number of NA
na_count = RT_df['normalizedScore'].isna().sum()
print(f"\nNumber of NA values in 'normalizedScore': {na_count}")

print(RT_df[['originalScore', 'normalizedScore']].head())

### HTML Scroller for Normalization Edge Cases Not Caught

In [None]:
# Filter rows where normalizedScore is NaN but originalScore is not
filtered_df = RT_df[RT_df['normalizedScore'].isna() & RT_df['originalScore'].notna()]

# Format the normalizedScore to 3 decimal places
filtered_df['normalizedScore'] = filtered_df['normalizedScore'].apply(lambda x: f"{x:.3f}" if pd.notna(x) else 'NaN')
html_table = filtered_df[['originalScore', 'normalizedScore']].to_html(index=False)

# Create a scrollable HTML div
scrollable_html = f"""
<div style="height: 300px; overflow-y: scroll;">
    {html_table}
</div>
"""

# Display the scrollable table
display(HTML(scrollable_html))

### HTML Scroller for Difficult Normalization String Cases

In [None]:
# Filter rows where normalizedScore is NaN but originalScore is not NaN and originalScore doesn't contain '/'
filtered_df = RT_df[RT_df['normalizedScore'].notna() & RT_df['originalScore'].notna() & ~RT_df['originalScore'].str.contains('/', na=False)]

# Format the normalizedScore to 3 decimal places
filtered_df['normalizedScore'] = filtered_df['normalizedScore'].apply(lambda x: f"{x:.3f}" if pd.notna(x) else 'NaN')

# Convert the filtered dataframe to HTML
html_table = filtered_df[['originalScore', 'normalizedScore']].to_html(index=False)

# Create a scrollable HTML div
scrollable_html = f"""
<div style="height: 300px; overflow-y: scroll;">
    {html_table}
</div>
"""

# Display the scrollable table
display(HTML(scrollable_html))

In [None]:
# Spot-check of C-plus case (we want C+ not C- conversion)
# Filter rows where originalScore is "C-plus"
filtered_df = RT_df[RT_df['originalScore'].str.upper() == 'C-PLUS']

# Display the filtered DataFrame with 'originalScore' and 'normalizedScore'
filtered_df[['originalScore', 'normalizedScore']].head(5)

## Modeling

1) Using any model you're comfortable with, train a sentiment classification model with `reviewText` as the basis for the features, and `scoreSentiment` as the outcome. Report out on the model's performance.  This *does not* need to be an exhaustive hyperparameter tuning job.

## FastText AI model (lightweight)

### Overview
**FastText** is a fast and efficient text representation model developed by Facebook AI. It improves upon Word2Vec by representing words as **character n-grams**, which helps handle rare, misspelled, or out-of-vocabulary words. FastText is optimized for quick training and large datasets, making it ideal for real-time NLP tasks.

### Key Features:
- **Character-level n-grams**: Better handles rare and unseen words. We train on Word n-grams instead.
- **Fast and Efficient**: Scalable for large datasets with minimal memory usage.
- **Text Classification**: Supports sentiment analysis, spam detection, and more.

### Use Cases:
1. **Text Classification**: Ideal for sentiment analysis and topic categorization.
2. **Word Embeddings**: Generates high-quality embeddings for downstream NLP tasks.
3. **Unsupervised Learning**: Works with clustering or sentiment lexicons for unsupervised sentiment analysis.

In [None]:
# Prepare the data for FastText
# Converting back to 0/1 binary for classification
RT_df['scoreSentimentBinary'] = RT_df['scoreSentimentNumeric'].apply(lambda x: 1 if x == 1 else 0)

# FastText requires each text to be labeled with '__label__<label>'
train_df, test_df = train_test_split(RT_df, test_size=0.2, random_state=42)

def prepare_fasttext_input(df, text_column, label_column):
    return df.apply(lambda x: f"__label__{x[label_column]} {x[text_column]}", axis=1).tolist()

train_data = prepare_fasttext_input(train_df, 'lemmatized_reviewText', 'scoreSentimentBinary')
test_data = prepare_fasttext_input(test_df, 'lemmatized_reviewText', 'scoreSentimentBinary')

# Save to text files (FastText requires these files for training and testing)
with open('train.txt', 'w') as f:
    f.write("\n".join(train_data))

with open('test.txt', 'w') as f:
    f.write("\n".join(test_data))

# Define hyperparameters for optimization
lr_values = [0.05]
epoch_values = [50]
wordngrams_values = [2,3,5,6]
bucket_value = 200000

# Function to evaluate the model on the test set using AUC
def evaluate_model(model, test_df):
    y_true = test_df['scoreSentimentBinary']
    y_pred_prob = [model.predict(text)[1][0] for text in test_df['lemmatized_reviewText']]  # Get predicted probabilities

    # Calculate AUC score
    auc_score = roc_auc_score(y_true, y_pred_prob)

    return auc_score

# Grid Search for Hyperparameters
best_auc = 0
best_model = None
best_params = {}

for lr in lr_values:
    for epoch in epoch_values:
        for wordngrams in wordngrams_values:
              # Train FastText model with the current set of hyperparameters
              model = fasttext.train_supervised('train.txt', lr=lr, epoch=epoch, wordNgrams=wordngrams, bucket=bucket_value)

              # Evaluate model performance using AUC
              auc_score = evaluate_model(model, test_df)

              # If this is the best AUC score, save the model and parameters
              if auc_score > best_auc:
                  best_auc = auc_score
                  best_model = model
                  best_params = {'lr': lr, 'epoch': epoch, 'wordNgrams': wordngrams, 'bucket': bucket_value}

print("Best Hyperparameters:", best_params)
print("Best AUC Score on Test Set:", best_auc)

# Optionally, save the best model for later use
best_model.save_model("best_fasttext_model.bin")

Model shows more room to grow in it's hyperparameter optimization: What is the optimal word n-gram?

Chose AUC as optimization metric as determining Beta of F1 can be difficult. Training on F1 was not successful resulting in a redundant threshold of .5 and 294676 : 3 Positive : Negative Sentiment Classification.

Perform threshold optimization with F0.5, F1, and F2 instead.

In [None]:
# best_model = fasttext.load_model("best_fasttext_model.bin")

def plot_pr_curve(precision, recall, thresholds):
    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='b', label='PR Curve')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='best')
    plt.grid(True)
    plt.show()

# Function to calculate Precision, Recall, F0.5, F1, and F2 at a threshold of 0.5
def evaluate_at_threshold(model, test_file, threshold=0.5):
    y_true = []
    y_pred = []
    y_scores = []

    # Read the test file and make predictions
    with open(test_file, 'r') as f:
        for line in f:
            true_label = int(line.split()[0].replace('__label__', ''))
            text = " ".join(line.split()[1:])

            # Predict probabilities
            label, score = model.predict(text, k=-1)  # Get all labels and scores
            predicted_score = score[0]  # The confidence score for the predicted label

            # Append true labels and predicted scores
            y_true.append(true_label)
            y_scores.append(predicted_score)

            # Convert to binary prediction based on threshold
            predicted_label = 1 if predicted_score >= threshold else 0
            y_pred.append(predicted_label)

    # Calculate precision, recall, F0.5 score, F1 score, and F2 score at threshold 0.5
    precision = precision_score(y_true, y_pred, pos_label=1)
    recall = recall_score(y_true, y_pred, pos_label=1)
    f0_5 = fbeta_score(y_true, y_pred, beta=0.5, pos_label=1)  # F0.5 score
    f1 = sklearn_f1_score(y_true, y_pred, pos_label=1)
    f2 = fbeta_score(y_true, y_pred, beta=2, pos_label=1)  # F2 score

    # Calculate AUC score
    auc = roc_auc_score(y_true, y_scores)

    print(f"Precision at threshold {threshold}: {precision:.4f}")
    print(f"Recall at threshold {threshold}: {recall:.4f}")
    print(f"F0.5 Score at threshold {threshold}: {f0_5:.4f}")
    print(f"F1 Score at threshold {threshold}: {f1:.4f}")
    print(f"F2 Score at threshold {threshold}: {f2:.4f}")
    print(f"AUC Score at threshold {threshold}: {auc:.4f}")

    return y_true, y_pred, y_scores

# Function for F0.5, F1, and F2 Threshold Optimization using sklearn
def optimize_thresholds(model, test_file):
    print("Evaluating at threshold 0.5:")
    y_true, y_pred, y_scores = evaluate_at_threshold(model, test_file, threshold=0.5)

    # PR curve for F1 score optimization
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    f1_scores = 2 * (precision * recall) / (precision + recall)

    # Calculate best F1 threshold
    best_f1_threshold = thresholds[np.argmax(f1_scores)]
    best_f1 = f1_scores[np.argmax(f1_scores)]

    # Calculate best F0.5 threshold (more weight on Precision)
    f0_5_scores = (1 + 0.5**2) * (precision * recall) / ((0.5**2) * precision + recall)
    best_f0_5_threshold = thresholds[np.argmax(f0_5_scores)]
    best_f0_5 = f0_5_scores[np.argmax(f0_5_scores)]

    # Calculate best F2 threshold (more weight on Recall)
    f2_scores = 2 * (precision * recall) / ((1 + 2**2) * (precision / 2 + recall))
    best_f2_threshold = thresholds[np.argmax(f2_scores)]
    best_f2 = f2_scores[np.argmax(f2_scores)]

    # Re-evaluate at the best thresholds
    print("\nAfter threshold optimization:")

    print("\nF1 Threshold Optimization:")
    _, _, _ = evaluate_at_threshold(model, test_file, threshold=best_f1_threshold)

    print("\nF0.5 Threshold Optimization:")
    _, _, _ = evaluate_at_threshold(model, test_file, threshold=best_f0_5_threshold)

    print("\nF2 Threshold Optimization:")
    _, _, _ = evaluate_at_threshold(model, test_file, threshold=best_f2_threshold)

    print(f"\nOptimal threshold for F1 score: {best_f1_threshold:.4f}")
    print(f"Best F1 Score: {best_f1:.4f}")

    print(f"Optimal threshold for F0.5 score: {best_f0_5_threshold:.4f}")
    print(f"Best F0.5 Score: {best_f0_5:.4f}")

    print(f"Optimal threshold for F2 score: {best_f2_threshold:.4f}")
    print(f"Best F2 Score: {best_f2:.4f}")

    plot_pr_curve(precision, recall, thresholds)

    return best_f1_threshold, best_f1, best_f0_5_threshold, best_f0_5, best_f2_threshold, best_f2

# Evaluate and optimize F0.5, F1, and F2 scores
best_f1_threshold, best_f1, best_f0_5_threshold, best_f0_5, best_f2_threshold, best_f2 = optimize_thresholds(best_model, 'test.txt')


In [None]:
# Change this to appropriate F1 metric
# Choosing F0.5 to preserve minority negative sentiment class
best_threshold = best_f0_5_threshold

# Function to get predictions for the entire dataset
def predict_and_add_to_df(model, RT_df, threshold=0.5):
    predictions = []

    for text in RT_df['lemmatized_reviewText']:
        # Predict label probabilities
        label, score = model.predict(text, k=-1)
        predicted_score = score[0]  # The confidence score for the predicted label

        # Convert to binary prediction
        predicted_label = 1 if predicted_score >= threshold else -1
        predictions.append(predicted_label)

    # Add the predictions as a new column to the DataFrame
    RT_df['predicted_sentiment'] = predictions
    return RT_df

# Assuming `model` is the trained FastText model and RT_df contains the necessary column
RT_df = predict_and_add_to_df(best_model, RT_df, threshold=best_threshold)

In [None]:
print("Predicted Sentiment Distribution:")
print(RT_df['predicted_sentiment'].value_counts())

Predicted Sentiment Distribution:
predicted_sentiment
 1    287250
-1      7429
Name: count, dtype: int64


## Conclusion
- Chose to optimize threshold on F0.5 to preserve minor class imbalance of Negative Sentiment
- Regardless of this optimization, AUC = .62 is weak. There is a lot of room for growth.
- Model is biased against negative sentiment

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')
RT_df.to_pickle('/content/gdrive/My Drive/RT_df.pkl')

Mounted at /content/gdrive


(Not required) Anything else you'd like to show us with this dataset / modeling effort?

## Pretrained Transformer Sentiment Model (Hugging Face)
Note this model will take a while to perform inference.
https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment

I would also explore the following two Transformers using adversarial training methodologies on IMDB datasets, which makes the model more resilient to real-world movie review noise which we witnessed in the Score Transformation.
 - https://huggingface.co/textattack/roberta-base-imdb
 - https://huggingface.co/textattack/bert-base-uncased-imdb/discussions
 - https://www.labellerr.com/blog/what-are-adversarial-attacks-in-machine-learning-and-how-can-you-prevent-them/

<img src="https://www.labellerr.com/blog/content/images/2024/11/adversarial-attacks-machine-learning.webp" alt="Adversarial Attacks in Machine Learning" width="600"/>

In [None]:
# Bringing back in RT_df to notebook for Transformer
# NOTE: use GPU run-time
from google.colab import drive
drive.mount('/content/gdrive')

RT_df = pd.read_pickle('/content/gdrive/My Drive/RT_df.pkl')

Mounted at /content/gdrive


In [None]:
# Step 1: Loading in a pretrained multilingual transformer
print("Loading pretrained model and tokenizer...")
tokenizer = AutoTokenizer.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')
model = AutoModelForSequenceClassification.from_pretrained('nlptown/bert-base-multilingual-uncased-sentiment')

# Check if GPU is available and move the model to GPU if it is
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)  # Move the model to GPU if available, otherwise CPU

# Set the model to evaluation mode (important for inference)
model.eval()
print(f"Model loaded: {model.config}")

# Step 2: Tokenize and predict sentiment function
def sentiment_score(review):
    try:
        # Tokenizing the review to GPU
        tokens = tokenizer.encode(review, return_tensors='pt').to(device)

        with torch.no_grad():  # No need to compute gradients for inference
            result = model(tokens)

        # Extract the logits and calculate probabilities using softmax
        probabilities = torch.nn.functional.softmax(result.logits, dim=-1)

        # Get the probability of the positive sentiment classes (sentiment 2, 3, and 4)
        # Class 2 is technically neutral
        # This step has to move tensors back into CPU, so process intensive
        positive_probabilities = probabilities.squeeze().cpu().numpy()[2:5].sum()

        # Get the highest logit (index of the most likely sentiment class)
        sentiment = int(torch.argmax(result.logits))

        probabilities_list = probabilities.squeeze().cpu().numpy().tolist()  # Convert tensor to list

        return sentiment, positive_probabilities, probabilities_list
    except Exception as e:
        print(f"Error during sentiment analysis: {e}")
        return None, None, None

# Step 3: Apply the sentiment_score function to the lemmatized review text
print("Applying sentiment analysis to reviews...")
RT_df[['transformer_sentiment', 'transformer_positive_probabilities', 'transformer_predicted_probabilities']] = RT_df['lemmatized_reviewText'].apply(
    lambda review: pd.Series(sentiment_score(review))
)

# Step 4: Convert scaled sentiment into binary outcome
def transform_predicted_sentiment(predicted):
    if predicted is None:  # Handle cases where the prediction might have failed
        return None
    elif predicted in [0, 1]:  # Sentiment 0 or 1 is negative
        return 0
    elif predicted in [3, 4]:  # Sentiment 3 or 4 is positive
        return 1
    else:  # output 2 (Neutral)
        return 1  # Treat Neutral (2) as positive

print("Transforming predicted sentiments to binary outcomes...")
RT_df['binary_transformer_sentiment'] = RT_df['transformer_sentiment'].apply(transform_predicted_sentiment)


Loading pretrained model and tokenizer...
Model loaded: BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "nlptown/bert-base-multilingual-uncased-sentiment",
  "_num_labels": 5,
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "finetuning_task": "sentiment-analysis",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "1 star",
    "1": "2 stars",
    "2": "3 stars",
    "3": "4 stars",
    "4": "5 stars"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "1 star": 0,
    "2 stars": 1,
    "3 stars": 2,
    "4 stars": 3,
    "5 stars": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_he

In [None]:
# Saving off outcomes for analysis
RT_df.to_pickle('/content/gdrive/My Drive/RT_transformer_df.pkl')

In [None]:
print("Transformer Predicted Sentiment Distribution:")
print(RT_df['binary_transformer_sentiment'].value_counts())
print(RT_df['transformer_sentiment'].value_counts())

Transformer Predicted Sentiment Distribution:
binary_transformer_sentiment
1    199348
0     95331
Name: count, dtype: int64
transformer_sentiment
4    95696
3    67127
0    65905
2    36525
1    29426
Name: count, dtype: int64


In [None]:
# Prepare the true labels and predicted labels
y_true = RT_df['scoreSentimentNumeric'].apply(lambda x: 1 if x == 1 else 0)  # Convert 1/-1 to 1/0 for AUC calculation
y_pred = RT_df['binary_transformer_sentiment']  # Predicted sentiments from transformer

# Calculate the probabilities for the positive class
y_prob = RT_df['transformer_positive_probabilities']  # Positive sentiment probabilities

# Precision and Recall (binary classification, pos_label=1 for positive sentiment)
precision = precision_score(y_true, y_pred, pos_label=1)
recall = recall_score(y_true, y_pred, pos_label=1)

# F1 Score (for positive class)
f1 = sklearn_f1_score(y_true, y_pred, pos_label=1)

# F0.5 Score
beta = 0.5
f05 = (1 + beta**2) * (precision * recall) / (beta**2 * precision + recall)

# AUC (Area Under the Curve)
auc = roc_auc_score(y_true, y_prob)

# Print all metrics
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"F0.5 Score: {f05:.4f}")
print(f"AUC Score: {auc:.4f}")

Precision: 0.8424
Recall: 0.7646
F1 Score: 0.8016
F0.5 Score: 0.8256
AUC Score: 0.7542


AUC Score: .7542