# Exploratory Data Analysis

Questions

- How many Oscar nominations are there in this dataset? Oscar wins?
- Mean and std dev of tokens in the scripts?
- Mean and std dev of tokens in the summary?

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datasets import load_dataset

import scipy
import sklearn 
import statsmodels

import os

In [2]:
import tiktoken

In [3]:
tokenizer = tiktoken.get_encoding("gpt2")

## Load Data

In [None]:
processed_dir = os.path.join('..','data', 'processed')

df_train = pd.read_parquet(os.path.join(processed_dir,'train_clean.parquet'))
df_val = pd.read_parquet(os.path.join(processed_dir,'val_clean.parquet'))
df_test = pd.read_parquet(os.path.join(processed_dir,'test_clean.parquet'))

In [5]:
df_train.head()

Unnamed: 0,movie_name,imdb_id,title,year,summary,script,script_plain,script_clean,nominated,winner
0,Above the Law_1988,tt0094602,Above the Law,1988,"Sergeant Nico Toscani, a native of Palermo, Si...",<script>\n <scene>\n <stage_direction>ABOV...,\n \n ABOVE THE LAW \n TITLES SEQUE...,ABOVE THE LAW\nTITLES SEQUENCE - MONTAGE WITH ...,0,0
1,Fracture_2007,tt0488120,Fracture,2007,"Theodore ""Ted"" Crawford (Anthony Hopkins), a w...",<script>\n <scene>\n <stage_direction>FRAC...,\n \n FRACTURE \n CREDITS SEQUENCE ...,FRACTURE\nCREDITS SEQUENCE : EXTREME CLOSE - U...,0,0
2,She Said_2022,tt11198810,She Said,2022,"In 2017, New York Times reporter Jodi Kantor r...",<script>\n <scene>\n <character>SHE SAID</...,\n \n SHE SAID \n Screenplay by \n ...,SHE SAID\nScreenplay by\nRebecca Lenkiewicz Ba...,0,0
3,Unbroken_2014,tt1809398,Unbroken,2014,During an April 1943 bombing mission against t...,<script>\n <scene>\n <character>UNBROKEN</...,\n \n UNBROKEN \n Screenplay by \n ...,UNBROKEN\nScreenplay by\nJoel Coen &amp; Ethan...,0,0
4,The Bonfire of the Vanities_1990,tt0099165,The Bonfire of the Vanities,1990,Sherman McCoy is a Wall Street bond trader who...,<script>\n <scene>\n <stage_direction>EXT....,\n \n EXT. MANHATTAN SKYLINE - NIGHT \n...,EXT. MANHATTAN SKYLINE - NIGHT\nMOVING IN FAST...,0,0


In [6]:
df_names = ['train', 'val', 'test']

In [7]:
dfs = [df_train, df_val, df_test]

In [8]:
len(df_train),len(df_val),len(df_test)

(1320, 440, 440)

In [9]:
csv_df = pd.read_csv('./oscar_data/oscars.csv',sep='	')
csv_df.head()

Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation
0,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,The Noose|The Patent Leather Kid,tt0019217|tt0018253,Richard Barthelmess,Richard Barthelmess,nm0001932,,Nickie Elkins|The Patent Leather Kid,,
1,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,The Last Command|The Way of All Flesh,tt0019071|tt0019553,Emil Jannings,Emil Jannings,nm0417837,True,General Dolgorucki [Grand Duke Sergius Alexand...,,
2,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,A Ship Comes In,tt0018389,Louise Dresser,Louise Dresser,nm0237571,,Mrs. Pleznik,,
3,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,7th Heaven|Street Angel|Sunrise,tt0018379|tt0019429|tt0018455,Janet Gaynor,Janet Gaynor,nm0310980,True,Diane|Angela|The Wife,,
4,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,Sadie Thompson,tt0019344,Gloria Swanson,Gloria Swanson,nm0841797,,Sadie Thompson,,


## `oscars.csv` 

In [10]:
csv_df['Class'].unique()

array(['Acting', 'Production', 'Directing', 'Title', 'Writing', 'Special',
       'SciTech', 'Music'], dtype=object)

In [11]:
df_filter = csv_df['Class'] == 'Writing'

filter_df = csv_df[df_filter]
filter_df['Category'].unique()

list1 = list(filter_df['Category'].unique())

In [12]:
csv_df['Category'].unique()

array(['ACTOR', 'ACTRESS', 'ART DIRECTION', 'CINEMATOGRAPHY',
       'DIRECTING (Comedy Picture)', 'DIRECTING (Dramatic Picture)',
       'ENGINEERING EFFECTS', 'OUTSTANDING PICTURE',
       'UNIQUE AND ARTISTIC PICTURE', 'WRITING (Adaptation)',
       'WRITING (Original Story)', 'WRITING (Title Writing)',
       'SPECIAL AWARD', 'DIRECTING', 'WRITING', 'OUTSTANDING PRODUCTION',
       'SOUND RECORDING', 'SCIENTIFIC OR TECHNICAL AWARD (Class I)',
       'SCIENTIFIC OR TECHNICAL AWARD (Class II)',
       'SCIENTIFIC OR TECHNICAL AWARD (Class III)',
       'SHORT SUBJECT (Cartoon)', 'SHORT SUBJECT (Comedy)',
       'SHORT SUBJECT (Novelty)', 'ASSISTANT DIRECTOR', 'FILM EDITING',
       'MUSIC (Scoring)', 'MUSIC (Song)', 'DANCE DIRECTION',
       'WRITING (Screenplay)', 'ACTOR IN A SUPPORTING ROLE',
       'ACTRESS IN A SUPPORTING ROLE', 'SHORT SUBJECT (Color)',
       'SHORT SUBJECT (One-reel)', 'SHORT SUBJECT (Two-reel)',
       'IRVING G. THALBERG MEMORIAL AWARD', 'MUSIC (Original Scor

In [13]:
writing_categories = [a for a in csv_df['Category'].unique() if 'WRITING' in a]
writing_categories

['WRITING (Adaptation)',
 'WRITING (Original Story)',
 'WRITING (Title Writing)',
 'WRITING',
 'WRITING (Screenplay)',
 'WRITING (Original Screenplay)',
 'WRITING (Original Motion Picture Story)',
 'WRITING (Motion Picture Story)',
 'WRITING (Story and Screenplay)',
 'WRITING (Screenplay--Adapted)',
 'WRITING (Screenplay--Original)',
 'WRITING (Screenplay--based on material from another medium)',
 'WRITING (Story and Screenplay--written directly for the screen)',
 'WRITING (Story and Screenplay--based on material not previously published or produced)',
 'WRITING (Story and Screenplay--based on factual material or material not previously published or produced)',
 'WRITING (Screenplay Adapted from Other Material)',
 'WRITING (Screenplay Written Directly for the Screen--based on factual material or on story material not previously published or produced)',
 'WRITING (Screenplay Based on Material from Another Medium)',
 'WRITING (Screenplay Written Directly for the Screen)',
 'WRITING (Scre

In [14]:
len(list1)

21

In [15]:
len(writing_categories)

21

In [16]:
for item in writing_categories:
    if item not in list1:
        print(item)

In [17]:
df_filter = (csv_df['Class'] == 'Writing') & (csv_df['Winner'] == True)
oscar_wins_df = csv_df[df_filter]

In [18]:
oscar_wins_df.head()

Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation
26,1,1927/28,Writing,WRITING (Adapted Screenplay),WRITING (Adaptation),7th Heaven,tt0018379,Benjamin Glazer,Benjamin Glazer,nm0322227,True,,,
28,1,1927/28,Writing,WRITING (Original Story),WRITING (Original Story),Underworld,tt0018526,Ben Hecht,Ben Hecht,nm0372942,True,,,
30,1,1927/28,Writing,WRITING (Title Writing),WRITING (Title Writing),,,Joseph Farnham,Joseph Farnham,nm0267868,True,,NOTE: This award was not associated with any s...,
69,2,1928/29,Writing,WRITING (Adapted Screenplay),WRITING,The Patriot,tt0019257,Hans Kraly,Hans Kraly,nm0473134,True,,NOTE: THIS IS NOT AN OFFICIAL NOMINATION. Ther...,
110,3,1929/30,Writing,WRITING (Adapted Screenplay),WRITING,The Big House,tt0020686,Frances Marion,Frances Marion,nm0547966,True,,,


In [19]:
oscar_wins_df[oscar_wins_df['FilmId'] == 'tt1285016']


Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation
10055,83,2010,Writing,WRITING (Adapted Screenplay),WRITING (Adapted Screenplay),The Social Network,tt1285016,Screenplay by Aaron Sorkin,Aaron Sorkin,nm0815070,True,,,


## Oscar noms / wins

### Oscar nominations

In [20]:
df_train['nominated'].unique()

array([0, 1])

In [21]:
df_filter = df_train['nominated'] == 1
df_train[df_filter].head()

Unnamed: 0,movie_name,imdb_id,title,year,summary,script,script_plain,script_clean,nominated,winner
10,A Soldier's Story_1984,tt0088146,A Soldier's Story,1984,"In 1944 during World War II, Vernon Waters, a ...",<script>\n <scene>\n <character>A SOLDIER'...,\n \n A SOLDIER'S STORY \n Screenpl...,A SOLDIER'S STORY\nScreenplay\nby Charles Full...,1,0
15,The Naked City_1948,tt0040636,The Naked City,1948,In the late hours of a hot New York summer nig...,<script>\n <scene>\n <stage_direction>EXT....,\n \n EXT. LONG SHOT OF LOWER MANHATTAN...,EXT. LONG SHOT OF LOWER MANHATTAN A MOONLIT - ...,1,0
21,Star Wars: Episode IV - A New Hope_1977,tt0076759,Star Wars: Episode IV - A New Hope,1977,"Amid a galactic civil war, Rebel Alliance spie...",<script>\n <scene>\n <scene_description>EP...,\n \n EPISODE IV FROM THE \n JOURNA...,EPISODE IV FROM THE\nJOURNAL OF THE WHILLS\nWr...,1,0
40,American Splendor_2003,tt0305206,American Splendor,2003,The film opens in the year 1950. It's Hallowee...,<script>\n <scene>\n <stage_direction>INT....,\n \n INT. HARVEY’S BEDROOM - NIGHT \n ...,INT. HARVEY'S BEDROOM - NIGHT\nA BEDROOM MIRRO...,1,0
41,The Pianist_2002,tt0253474,The Pianist,2002,"In September 1939, Władysław Szpilman, a Polis...",<script>\n <scene>\n <stage_direction>INT....,\n \n INT. WARSAW (ARCHIVE) - DAY \n ...,INT. WARSAW (ARCHIVE) - DAY\nBlack and white ....,1,1


In [22]:
for name, df in zip(df_names, dfs):
    df_filter = df['nominated'] == 1
    pos_fraction = len(df[df_filter]) / len(df)
    print(f'{name}: {pos_fraction*100:.2f}% nominated for Oscar in best screenplay')

train: 18.94% nominated for Oscar in best screenplay
val: 19.09% nominated for Oscar in best screenplay
test: 18.86% nominated for Oscar in best screenplay


### Oscar wins

In [23]:
df_train['winner'].unique()

array([0, 1])

In [24]:
df_val['winner'].unique()

array([0, 1])

In [25]:
df_test['winner'].unique()

array([0, 1])

In [26]:
df_filter = df_test['title'] == 'The Social Network'
df_test[df_filter]

Unnamed: 0,movie_name,imdb_id,title,year,summary,script,script_plain,script_clean,nominated,winner
41,The Social Network_2010,tt1285016,The Social Network,2010,"On October 28, 2003, 19-year-old Harvard Unive...",<script>\n <scene>\n <scene_description>FR...,\n \n FROM THE BLACK WE HEAR-- \n M...,FROM THE BLACK WE HEAR--\nMARK (V.O.)\nDid you...,1,1


In [27]:
for name, df in zip(df_names, dfs):
    df_filter = df['winner'] == 1
    pos_fraction = len(df[df_filter]) / len(df)
    print(f'{name}: {pos_fraction*100:.2f}% won Oscar for best screenplay')

train: 4.39% won Oscar for best screenplay
val: 4.09% won Oscar for best screenplay
test: 5.00% won Oscar for best screenplay


## Summary Lengths

In [28]:
text = (df_train.iloc[0]['summary'])
integers = tokenizer.encode(text=text)

In [29]:
len(integers)

511

In [30]:
n_tokens = [[],[],[]]

for idx, df in enumerate(dfs):
    for i in range(len(df)):
        text = (df.iloc[i]['summary'])
        integers = tokenizer.encode(text=text)
        n_tokens[idx].append(len(integers))

global_n_tokens = n_tokens[0] + n_tokens[1] + n_tokens[2]
global_n_tokens = np.array(global_n_tokens)

In [31]:
for name, n in zip(df_names, n_tokens):
    n_array = np.array(n)
    print(f'{name} dataset: mean={np.mean(n_array):.1f}, min={np.min(n_array)}, max={np.max(n_array)}')

train dataset: mean=789.8, min=14, max=2282
val dataset: mean=795.3, min=33, max=1691
test dataset: mean=789.8, min=27, max=1723


In [32]:
print(f'global dataset: mean={np.mean(global_n_tokens):.1f}, min={np.min(global_n_tokens)}, max={np.max(global_n_tokens)}')

global dataset: mean=790.9, min=14, max=2282


## Script Lengths

In [33]:
text = (df_train.iloc[0]['script_clean'])
integers = tokenizer.encode(text=text)
len(integers)

41987

In [34]:
n_tokens = [[],[],[]]

for idx, df in enumerate(dfs):
    for i in range(len(df)):
        text = (df.iloc[i]['script_clean'])
        integers = tokenizer.encode(text=text)
        n_tokens[idx].append(len(integers))

global_n_tokens = n_tokens[0] + n_tokens[1] + n_tokens[2]
global_n_tokens = np.array(global_n_tokens)

In [35]:
for name, n in zip(df_names, n_tokens):
    n_array = np.array(n)
    print(f'{name} dataset: mean={np.mean(n_array):.1f}, min={np.min(n_array)}, max={np.max(n_array)}')

train dataset: mean=37082.5, min=7008, max=106578
val dataset: mean=37251.7, min=13063, max=72759
test dataset: mean=36871.5, min=11097, max=94792


In [36]:
print(f'global dataset: mean={np.mean(global_n_tokens):.1f}, min={np.min(global_n_tokens)}, max={np.max(global_n_tokens)}')

global dataset: mean=37074.1, min=7008, max=106578
