## Import Packages

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import fasttext

import warnings 
warnings.filterwarnings("ignore")

## Load Dataset
- The Rotten_Tomatoes_Movies3 data was already preprocessed for the traditional machine-learning task.
- That cleaned data is reused here for the fastText approach.

In [2]:
# Read the previously cleaned workbook and summarise its columns.
source_path = "./df_copy_NlpTask.xlsx"
df_nlp = pd.read_excel(source_path)
df_nlp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16386 entries, 0 to 16385
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   movie_title                  16386 non-null  object        
 1   movie_info                   16386 non-null  object        
 2   movie_info_processed         16386 non-null  object        
 3   movie_info_missing           16386 non-null  int64         
 4   critics_consensus            16386 non-null  object        
 5   critics_consensus_processed  16385 non-null  object        
 6   critics_consensus_missing    16386 non-null  int64         
 7   rating                       16386 non-null  object        
 8   genre                        16386 non-null  object        
 9   genre_missing                16386 non-null  int64         
 10  directors                    16386 non-null  object        
 11  directors_missing            16386 non-nu

## Data Exploration

In [3]:
# Distinct tomatometer_status labels, followed by how often each one occurs.
status_col = df_nlp['tomatometer_status']
print(f"tomatometer_status : {status_col.unique()}\n{status_col.value_counts()}")

tomatometer_status : ['Rotten' 'Certified Fresh' 'Fresh']
Rotten             7160
Fresh              6275
Certified Fresh    2951
Name: tomatometer_status, dtype: int64


In [4]:
# Distinct audience_rating values, the number of distinct values, and their frequencies.
audience_col = df_nlp['audience_rating']
print(f"audience_rating  : {audience_col.unique()}\n{audience_col.nunique()}\n{audience_col.value_counts()}")

audience_rating  : [ 53  64  97  74  37  86  78  87  38  66  79  84  35  57  82  80  89  75
  60  91  63  71  61  30  48  56  83  40  36  88  65  73  85  42  77  67
  94  20  68  58  55  70  93  31  29  24  33  43  59  69  39  52  45  22
  46  50  54  21  27  62  72  32  44  18  47  76  26  90  25  81  28  41
  19  49  11  34  92 100  12  51  95  96  17  23  16  14  15   9  10   5
   7  13   4   0  99   8   6  98]
98
72    322
75    314
78    296
80    289
76    287
     ... 
8       5
7       4
99      3
4       2
6       2
Name: audience_rating, Length: 98, dtype: int64


In [5]:
# Distinct tomatometer_rating values, the number of distinct values, and their frequencies.
critic_score_col = df_nlp['tomatometer_rating']
print(f"tomatometer_rating  : {critic_score_col.unique()}\n{critic_score_col.nunique()}\n{critic_score_col.value_counts()}")

tomatometer_rating  : [ 49  86  68 100  89   8  96  20  90  80  91  25   4  69  67  92  75   0
  31  63  41  93  22  33  21  15  64  32  60  50  24  14  40  99  36  42
  27   9  84  97  87  81  78  54  35  82  58  18  55  88  45  53  13  73
  72  37  46   7  56  34  83  38  26  76  17  16  12  44  79  70  19  61
  62  74  71  29  77   5  51  28  23  10  48  11  95  94  65  30  98  52
  59  85  43  39  57  47  66   6   3   2   1]
101
100    860
50     389
80     388
83     351
67     345
      ... 
99      42
4       39
3       21
2       11
1        6
Name: tomatometer_rating, Length: 101, dtype: int64


In [6]:
# Side-by-side view of the critic status, critic score and audience score columns.
rating_columns = ['tomatometer_status', 'tomatometer_rating', 'audience_rating']
print(df_nlp[rating_columns])

      tomatometer_status  tomatometer_rating  audience_rating
0                 Rotten                  49               53
1        Certified Fresh                  86               64
2                  Fresh                  68               53
3        Certified Fresh                 100               97
4                  Fresh                  89               74
...                  ...                 ...              ...
16381             Rotten                  56               74
16382    Certified Fresh                  97               92
16383              Fresh                  78               87
16384              Fresh                  95               91
16385             Rotten                  57               62

[16386 rows x 3 columns]


In [7]:
# Work on an independent copy so later column additions do not touch df_nlp.
df_nlp_1 = df_nlp.copy(deep=True)
df_nlp_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16386 entries, 0 to 16385
Data columns (total 28 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   movie_title                  16386 non-null  object        
 1   movie_info                   16386 non-null  object        
 2   movie_info_processed         16386 non-null  object        
 3   movie_info_missing           16386 non-null  int64         
 4   critics_consensus            16386 non-null  object        
 5   critics_consensus_processed  16385 non-null  object        
 6   critics_consensus_missing    16386 non-null  int64         
 7   rating                       16386 non-null  object        
 8   genre                        16386 non-null  object        
 9   genre_missing                16386 non-null  int64         
 10  directors                    16386 non-null  object        
 11  directors_missing            16386 non-nu

## Preprocessing & Cleaning

In [8]:
import spacy
nlp = spacy.load("en_core_web_md") 

def preprocess(text):
    """Clean and normalize one piece of text.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    token that is alphabetic, not a stop word and not punctuation is kept,
    replaced by its lemma and lower-cased.

    Parameters
    ----------
    text : str
        Raw text to clean. Anything that is not a string (for example the
        ``None``/``NaN`` of a missing cell) is treated as empty text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; ``""`` when nothing is kept
        or when ``text`` is empty, whitespace-only or not a string.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # column; spaCy only accepts strings, so return an empty result instead
    # of failing part-way through the column.
    if not isinstance(text, str) or not text.strip():
        return ""

    doc = nlp(text)
    return " ".join(
        token.lemma_.lower()
        for token in doc
        if token.is_alpha and not token.is_stop and not token.is_punct
    )

In [9]:
# Pick the movie synopsis at row label 3 as a long sample for the cleaner.
sample_test = df_nlp_1.loc[3, 'movie_info']
sample_test

'A Puerto Rican youth is on trial for murder, accused of knifing his father to death. The twelve jurors retire to the jury room, having been admonished that the defendant is innocent until proven guilty beyond a reasonable doubt. Eleven of the jurors vote for conviction, each for reasons of his own. The sole holdout is Juror #8, played by Henry Fonda. As Fonda persuades the weary jurors to re-examine the evidence, we learn the backstory of each man. Juror #3 (Lee J. Cobb), a bullying self-made man, has estranged himself from his own son. Juror #7 (Jack Warden) has an ingrained mistrust of foreigners; so, to a lesser extent, does Juror #6 (Edward Binns). Jurors #10 (Ed Begley) and #11 (George Voskovec), so certain of the infallibility of the Law, assume that if the boy was arrested, he must be guilty. Juror #4 (E.G. Marshall) is an advocate of dispassionate deductive reasoning. Juror #5 (Jack Klugman), like the defendant a product of "the streets," hopes that his guilty vote will distan

In [10]:
# Clean the sample synopsis to inspect which tokens preprocess() keeps.
preprocess(sample_test)

'puerto rican youth trial murder accuse knife father death juror retire jury room having admonish defendant innocent prove guilty reasonable doubt juror vote conviction reason sole holdout juror play henry fonda fonda persuade weary juror examine evidence learn backstory man juror lee cobb bully self man estrange son juror jack warden ingrained mistrust foreigner less extent juror edward binns juror ed begley george voskovec certain infallibility law assume boy arrest guilty juror marshall advocate dispassionate deductive reasoning juror jack klugman like defendant product street hope guilty vote distance past juror robert webber advertising man understand package market juror martin balsam john fiedler joseph sweeney anxious wave flow excruciatingly hot day drag hot night fonda chip away guilty verdict insist fellow juror bear mind word reasonable doubt pet project henry fonda angry men foray film production actor partner venture reginald rose write television play film base carry tv 

In [11]:
# Row label 66 of critics_consensus; presumably a placeholder for a missing consensus (confirm).
sample_test_1 = df_nlp_1.loc[66, 'critics_consensus']
sample_test_1

'Unknown'

In [12]:
# Check how preprocess() handles this very short consensus value.
preprocess(sample_test_1)

'unknown'

In [13]:
# A full-sentence critics consensus (row label 16382) with hyphens and dashes.
sample_test_2 = df_nlp_1.loc[16382, 'critics_consensus']
sample_test_2

"The brilliantly well-rounded Zootopia offers a thoughtful, inclusive message that's as rich and timely as its sumptuously state-of-the-art animation -- all while remaining fast and funny enough to keep younger viewers entertained."

In [14]:
# Clean the full-sentence consensus sample.
preprocess(sample_test_2)

'brilliantly rounded zootopia offer thoughtful inclusive message rich timely sumptuously state art animation remain fast funny young viewer entertain'

In [15]:
# A critics consensus (row label 485) that starts with a number.
sample_test_3 = df_nlp_1.loc[485, 'critics_consensus']
sample_test_3

'100 Streets strands its talented cast - led by a clearly overqualified Idris Elba - in the midst of a well-meaning but fatally contrived drama.'

In [16]:
# Clean the sample; non-alphabetic tokens are filtered out by preprocess().
preprocess(sample_test_3)

'streets strand talented cast lead clearly overqualified idris elba midst mean fatally contrive drama'

In [17]:
# Same row in the processed column that came with the workbook, for comparison with preprocess().
df_nlp_1.loc[16382, 'critics_consensus_processed']

'brilliantly round zootopia offer thoughtful inclusive message rich timely sumptuously state art animation remain fast funny young viewer entertain'

In [18]:
# Same comparison for row label 485 of the processed column that came with the workbook.
df_nlp_1.loc[485, 'critics_consensus_processed']

'100 streets strand talented cast lead clearly overqualifie idris elba midst mean fatally contrived drama'

In [19]:
# Add the processed columns: each new column holds preprocess() applied
# element-wise to its source text column.
text_columns = {
    'processed_movie_info': 'movie_info',
    'processed_critics_consensus': 'critics_consensus',
}
for target_col, source_col in text_columns.items():
    df_nlp_1[target_col] = df_nlp_1[source_col].apply(preprocess)

## Feature Engineering

In [20]:
# Drop every column that is not used below. What remains is the title, the two
# raw text columns, tomatometer_status, audience_rating and the two freshly
# processed text columns. Each label is listed exactly once.
columns_to_drop = [
    'movie_info_processed', 'movie_info_missing',
    'critics_consensus_processed', 'critics_consensus_missing',
    'rating',
    'genre', 'genre_missing',
    'directors', 'directors_missing',
    'writers', 'writers_missing',
    'cast', 'cast_missing',
    'in_theaters_date', 'in_theaters_date_missing',
    'on_streaming_date', 'on_streaming_date_missing',
    'runtime_in_minutes', 'runtime_in_minutes_missing',
    'studio_name', 'studio_name_missing',
    'tomatometer_rating', 'tomatometer_count',
]
df_new = df_nlp_1.drop(columns=columns_to_drop)

df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16386 entries, 0 to 16385
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   movie_title                  16386 non-null  object
 1   movie_info                   16386 non-null  object
 2   critics_consensus            16386 non-null  object
 3   tomatometer_status           16386 non-null  object
 4   audience_rating              16386 non-null  int64 
 5   processed_movie_info         16386 non-null  object
 6   processed_critics_consensus  16386 non-null  object
dtypes: int64(1), object(6)
memory usage: 896.2+ KB


### Final Sentiment Logic

In [21]:
# Map a tomatometer_status label to a sentiment label
def get_tomatometer_sentiment(tomatometer_status):
    """Return 'positive' for 'Certified Fresh' or 'Fresh', otherwise 'negative'.

    Any value other than the two fresh labels (including 'Rotten') maps to
    'negative'.
    """
    fresh_labels = ('Certified Fresh', 'Fresh')
    return 'positive' if tomatometer_status in fresh_labels else 'negative'

# Bucket a numeric audience_rating into a sentiment label
def get_audience_sentiment(audience_rating):
    """Return the sentiment bucket for an audience rating.

    * rating >= 60        -> 'positive'
    * 40 <= rating < 60   -> 'neutral'
    * anything else       -> 'negative'
    """
    if audience_rating >= 60:
        return 'positive'
    # Reaching this point means the rating is not >= 60, so only the lower bound is left to check.
    if audience_rating >= 40:
        return 'neutral'
    return 'negative'

# Merge the critic-based and audience-based labels into one binary label
def combine_sentiments(tomatometer_sentiment, audience_sentiment):
    """Combine the two per-source sentiment labels into a final label.

    The result is 'positive' only when the tomatometer sentiment is
    'positive' and the audience sentiment is 'positive' or 'neutral'.
    Every other combination, including the cases where the two sources
    disagree, yields 'negative'; 'neutral' is never returned.
    """
    critics_positive = tomatometer_sentiment == 'positive'
    audience_not_negative = audience_sentiment in ('positive', 'neutral')
    if critics_positive and audience_not_negative:
        return 'positive'
    return 'negative'

In [22]:
# Per-source labels: one derived from the critics' status, one from the audience score.
critic_labels = df_new['tomatometer_status'].apply(get_tomatometer_sentiment)
audience_labels = df_new['audience_rating'].apply(get_audience_sentiment)
df_new['tomatometer_sentiment'] = critic_labels
df_new['audience_sentiment'] = audience_labels

# Final label: combine the two per-source labels row by row.
df_new['final_sentiment'] = [
    combine_sentiments(critic_label, audience_label)
    for critic_label, audience_label in zip(df_new['tomatometer_sentiment'], df_new['audience_sentiment'])
]

In [26]:
# Class balance of the combined label that will be used as the training target.
df_new['final_sentiment'].value_counts()

positive    8841
negative    7545
Name: final_sentiment, dtype: int64

In [27]:
# Confirm the three sentiment columns were added and contain no nulls.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16386 entries, 0 to 16385
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   movie_title                  16386 non-null  object
 1   movie_info                   16386 non-null  object
 2   critics_consensus            16386 non-null  object
 3   tomatometer_status           16386 non-null  object
 4   audience_rating              16386 non-null  int64 
 5   processed_movie_info         16386 non-null  object
 6   processed_critics_consensus  16386 non-null  object
 7   tomatometer_sentiment        16386 non-null  object
 8   audience_sentiment           16386 non-null  object
 9   final_sentiment              16386 non-null  object
dtypes: int64(1), object(9)
memory usage: 1.3+ MB


## Create a new column `final_text` with the following format:
- `__label__` + sentiment + " " + preprocessed movie info + " " + preprocessed critics consensus.
- This is the format fastText expects for supervised training.

In [28]:
# One training line per movie: the "__label__<class>" token, then both cleaned texts, space separated.
label_tokens = "__label__" + df_new['final_sentiment']
combined_text = df_new['processed_movie_info'] + " " + df_new['processed_critics_consensus']
df_new['final_text'] = label_tokens + " " + combined_text
df_new.loc[3, 'final_text']

'__label__positive puerto rican youth trial murder accuse knife father death juror retire jury room having admonish defendant innocent prove guilty reasonable doubt juror vote conviction reason sole holdout juror play henry fonda fonda persuade weary juror examine evidence learn backstory man juror lee cobb bully self man estrange son juror jack warden ingrained mistrust foreigner less extent juror edward binns juror ed begley george voskovec certain infallibility law assume boy arrest guilty juror marshall advocate dispassionate deductive reasoning juror jack klugman like defendant product street hope guilty vote distance past juror robert webber advertising man understand package market juror martin balsam john fiedler joseph sweeney anxious wave flow excruciatingly hot day drag hot night fonda chip away guilty verdict insist fellow juror bear mind word reasonable doubt pet project henry fonda angry men foray film production actor partner venture reginald rose write television play f

## Rearrange the Features and Target 

In [29]:
# Target layout: each raw column is followed by the column derived from it,
# and the combined label and the training text come last.
columns_order = [
    'movie_title',
    'movie_info', 'processed_movie_info',
    'critics_consensus', 'processed_critics_consensus',
    'tomatometer_status', 'tomatometer_sentiment',
    'audience_rating', 'audience_sentiment',
    'final_sentiment',
    'final_text',
]

# Select every row with the columns in the order listed above.
df_new = df_new.loc[:, columns_order]

In [30]:
# Verify the new column order and that all 11 columns are fully populated.
df_new.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16386 entries, 0 to 16385
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   movie_title                  16386 non-null  object
 1   movie_info                   16386 non-null  object
 2   processed_movie_info         16386 non-null  object
 3   critics_consensus            16386 non-null  object
 4   processed_critics_consensus  16386 non-null  object
 5   tomatometer_status           16386 non-null  object
 6   tomatometer_sentiment        16386 non-null  object
 7   audience_rating              16386 non-null  int64 
 8   audience_sentiment           16386 non-null  object
 9   final_sentiment              16386 non-null  object
 10  final_text                   16386 non-null  object
dtypes: int64(1), object(10)
memory usage: 1.4+ MB


In [31]:
# Preview the first rows of the final frame before exporting it.
df_new.head()

Unnamed: 0,movie_title,movie_info,processed_movie_info,critics_consensus,processed_critics_consensus,tomatometer_status,tomatometer_sentiment,audience_rating,audience_sentiment,final_sentiment,final_text
0,Percy Jackson & the Olympians: The Lightning T...,A teenager discovers he's the descendant of a ...,teenager discover descendant greek god set adv...,Though it may seem like just another Harry Pot...,like harry potter knockoff percy jackson benef...,Rotten,negative,53,neutral,negative,__label__negative teenager discover descendant...
1,Please Give,Kate has a lot on her mind. There's the ethics...,kate lot mind ethic problem buy furniture chea...,Nicole Holofcener's newest might seem slight i...,nicole holofcener new slight place rendering c...,Certified Fresh,positive,64,positive,positive,__label__positive kate lot mind ethic problem ...
2,10,Blake Edwards' 10 stars Dudley Moore as George...,blake edwards star dudley moore george mancini...,Unknown,unknown,Fresh,positive,53,neutral,positive,__label__positive blake edwards star dudley mo...
3,12 Angry Men (Twelve Angry Men),"A Puerto Rican youth is on trial for murder, a...",puerto rican youth trial murder accuse knife f...,Sidney Lumet's feature debut is a superbly wri...,sidney lumet feature debut superbly write dram...,Certified Fresh,positive,97,positive,positive,__label__positive puerto rican youth trial mur...
4,"20,000 Leagues Under The Sea","This 1954 Disney version of Jules Verne's 20,0...",disney version jules verne leagues sea represe...,"One of Disney's finest live-action adventures,...",disney fine live action adventure leagues sea ...,Fresh,positive,74,positive,positive,__label__positive disney version jules verne l...


## Export Preprocessed Data 
- Next, we will split the data for training the fastText model.

In [32]:
def export_excel(data_set, dataset_name):
    """Save ``data_set`` as an Excel file and print a confirmation.

    Parameters
    ----------
    data_set
        Object exposing ``to_excel`` (a DataFrame in this notebook).
    dataset_name
        Destination passed to ``to_excel``; also echoed in the message.

    The index is not written to the file (``index=False``). Returns ``None``.
    """
    data_set.to_excel(dataset_name, index=False)
    confirmation = f"Dataset successfully saved as {dataset_name}"
    print(confirmation)

In [33]:
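# The export was already run once (see the confirmation message below); uncomment to regenerate the file.
#export_excel(df_new, "Nlpt_task_fasttext.xlsx")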

Dataset successfully saved as Nlpt_task_fasttext.xlsx
