# Exploratory Data Analysis

Questions

- How many Oscar nominations are there in this dataset? How many Oscar wins?
- What are the mean and standard deviation of the number of tokens per script?
- What are the mean and standard deviation of the number of tokens per summary?

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datasets import load_dataset

import scipy
import sklearn 
import statsmodels

import os

In [None]:
import tiktoken

In [None]:
# tiktoken's "gpt2" encoding; used in the cells below to count tokens in summaries and scripts.
tokenizer = tiktoken.get_encoding("gpt2")

## Load Data

In [None]:
# Data directories, expressed relative to the current working directory.
data_root = os.path.join('..', 'data')
raw_dir = os.path.join(data_root, 'raw')
processed_dir = os.path.join(data_root, 'processed')

# Read the cleaned train / val / test splits from their parquet files.
df_train, df_val, df_test = (
    pd.read_parquet(os.path.join(processed_dir, filename))
    for filename in (
        'train_clean.parquet',
        'val_clean.parquet',
        'test_clean.parquet',
    )
)

In [None]:
# Preview the first rows of the training split.
df_train.head()

In [None]:
# Labels for the three splits; zipped with the list of DataFrames when reporting per-split numbers.
df_names = [
    'train',
    'val',
    'test',
]

In [None]:
# The three splits, in the same order as the labels in `df_names` so the two lists can be zipped.
dfs = [df_train, df_val, df_test]

In [None]:
# Row counts of the train, val and test splits, in that order.
tuple(len(split) for split in (df_train, df_val, df_test))

In [None]:
# Load the raw Oscars awards table. The file is tab-separated despite its
# .csv extension; the delimiter is written as an explicit '\t' escape rather
# than an invisible literal tab, which editors can silently turn into spaces.
oscars_path = os.path.join(raw_dir, 'oscar_data', 'oscars.csv')
csv_df = pd.read_csv(oscars_path, sep='\t')
csv_df.head()

## `oscars.csv` 

In [None]:
# Distinct values of the 'Class' column in the Oscars table.
csv_df['Class'].unique()

In [None]:
# Restrict the Oscars table to the rows whose Class is 'Writing'.
df_filter = csv_df['Class'] == 'Writing'
filter_df = csv_df[df_filter]

# Distinct Category values within the Writing class, as a plain Python list.
# Computed once: a bare `.unique()` expression in the middle of a cell is
# neither displayed nor stored, so it only repeats the work.
list1 = list(filter_df['Category'].unique())

In [None]:
# Distinct values of the 'Category' column across every Class.
csv_df['Category'].unique()

In [None]:
# Alternative selection: every distinct Category whose name contains the
# upper-case substring 'WRITING'.
writing_categories = [
    category
    for category in csv_df['Category'].unique()
    if 'WRITING' in category
]
writing_categories

In [None]:
# Number of distinct categories found by filtering on Class == 'Writing'.
len(list1)

In [None]:
# Number of distinct categories whose name contains 'WRITING'.
len(writing_categories)

In [None]:
# Print each name-matched category that the Class == 'Writing' filter did not return.
unmatched = [category for category in writing_categories if category not in list1]
for category in unmatched:
    print(category)

In [None]:
# Build the mask in two named steps: Writing-class rows, and rows whose
# 'Winner' value compares equal to True (elementwise comparison).
is_writing = csv_df['Class'].eq('Writing')
is_winner = csv_df['Winner'].eq(True)
df_filter = is_writing & is_winner

# Writing-class awards that were actually won.
oscar_wins_df = csv_df.loc[df_filter]

In [None]:
# Preview the first rows of the Writing-class winners table.
oscar_wins_df.head()

In [None]:
# Writing-class winner rows for a single film, selected by its FilmId.
oscar_wins_df[oscar_wins_df['FilmId'] == 'tt1285016']


## Oscar noms / wins

### Oscar nominations

In [None]:
# Distinct values of the 'nominated' label in the training split.
df_train['nominated'].unique()

In [None]:
# Boolean mask over the training rows whose 'nominated' label equals 1.
df_filter = df_train['nominated'].eq(1)

# Preview the first rows selected by that mask.
df_train.loc[df_filter].head()

In [None]:
# Report, for each split, the percentage of rows with nominated == 1.
for name, df in zip(df_names, dfs):
    df_filter = df['nominated'].eq(1)
    n_positive = len(df.loc[df_filter])
    pos_fraction = n_positive / len(df)
    print(f'{name}: {pos_fraction*100:.2f}% nominated for Oscar in best screenplay')

### Oscar wins

In [None]:
# Distinct values of the 'winner' label in the training split.
df_train['winner'].unique()

In [None]:
# Distinct values of the 'winner' label in the validation split.
df_val['winner'].unique()

In [None]:
# Distinct values of the 'winner' label in the test split.
df_test['winner'].unique()

In [None]:
# Show the test-split row(s) whose title is exactly 'The Social Network'.
df_filter = df_test['title'].eq('The Social Network')
df_test.loc[df_filter]

In [None]:
# Report, for each split, the percentage of rows with winner == 1.
for name, df in zip(df_names, dfs):
    df_filter = df['winner'].eq(1)
    n_positive = len(df.loc[df_filter])
    pos_fraction = n_positive / len(df)
    print(f'{name}: {pos_fraction*100:.2f}% won Oscar for best screenplay')

## Summary Lengths

In [None]:
# Encode one sample, the first training summary, into token ids.
text = df_train['summary'].iloc[0]
integers = tokenizer.encode(text)

In [None]:
# Length of the most recently encoded token sequence.
len(integers)

In [None]:
# Token count of every summary, one inner list per split (same order as `dfs`).
# Iterating the column directly avoids building a whole row Series via
# `iloc[i]` for each element, and the outer list is sized by `dfs` itself
# instead of being hard-coded to three entries.
n_tokens = [
    [len(tokenizer.encode(text=summary)) for summary in df['summary']]
    for df in dfs
]

# All splits pooled into a single array for dataset-wide statistics.
global_n_tokens = np.array(
    [count for split_counts in n_tokens for count in split_counts]
)

In [None]:
# Per-split statistics of the token counts held in `n_tokens`: mean, standard
# deviation and range. The standard deviation answers the question posed at
# the top of the notebook; np.std is called with its default ddof=0
# (population standard deviation).
for name, n in zip(df_names, n_tokens):
    n_array = np.array(n)
    print(
        f'{name} dataset: mean={np.mean(n_array):.1f}, std={np.std(n_array):.1f}, '
        f'min={np.min(n_array)}, max={np.max(n_array)}'
    )

In [None]:
# Statistics of the token counts pooled over all splits. The standard
# deviation is reported alongside the mean, as the notebook's questions ask;
# np.std uses its default ddof=0 (population standard deviation).
print(
    f'global dataset: mean={np.mean(global_n_tokens):.1f}, std={np.std(global_n_tokens):.1f}, '
    f'min={np.min(global_n_tokens)}, max={np.max(global_n_tokens)}'
)

## Script Lengths

In [None]:
# Encode one sample, the first cleaned training script, and show its token count.
text = df_train['script_clean'].iloc[0]
integers = tokenizer.encode(text)
len(integers)

In [None]:
# Token count of every cleaned script, one inner list per split (same order
# as `dfs`). The column is iterated directly instead of materialising a row
# Series with `iloc[i]` for each element, and the outer list grows with
# `dfs` rather than being hard-coded to three entries.
n_tokens = []
for df in dfs:
    split_counts = []
    # `enumerate` keeps the row position `i` bound after the loop, which
    # later cells of this notebook may rely on.
    for i, script in enumerate(df['script_clean']):
        split_counts.append(len(tokenizer.encode(text=script)))
    n_tokens.append(split_counts)

# All splits pooled into a single array for dataset-wide statistics.
global_n_tokens = np.array(
    [count for split_counts in n_tokens for count in split_counts]
)

In [None]:
# Per-split statistics of the token counts held in `n_tokens`: mean, standard
# deviation and range. The standard deviation answers the question posed at
# the top of the notebook; np.std is called with its default ddof=0
# (population standard deviation).
for name, n in zip(df_names, n_tokens):
    n_array = np.array(n)
    print(
        f'{name} dataset: mean={np.mean(n_array):.1f}, std={np.std(n_array):.1f}, '
        f'min={np.min(n_array)}, max={np.max(n_array)}'
    )

In [None]:
# Statistics of the token counts pooled over all splits. The standard
# deviation is reported alongside the mean, as the notebook's questions ask;
# np.std uses its default ddof=0 (population standard deviation).
print(
    f'global dataset: mean={np.mean(global_n_tokens):.1f}, std={np.std(global_n_tokens):.1f}, '
    f'min={np.min(global_n_tokens)}, max={np.max(global_n_tokens)}'
)

In [None]:
# Peek at the first token ids of one cleaned training script.
# The row is chosen with an explicit position. Indexing with `i`, a loop
# variable left over from an earlier cell, made the selected row depend on
# cell execution order and on the length of the last split that was iterated.
text = df_train['script_clean'].iloc[0]
integers = tokenizer.encode(text=text)

print(integers[:10])