# Preprocessing

## Imports

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import spacy

# Load the spaCy 'en_core_web_sm' pipeline into `nlp`.
# NOTE(review): `nlp` is not referenced by any cell visible in this notebook — confirm it is still needed.
nlp = spacy.load('en_core_web_sm')

## Load Datasets

In [2]:
# Read the four raw CSVs in one pass. Every column is requested with
# dtype='unicode', so no type inference happens here; numeric and date
# columns are cast explicitly in the "Fix Types" section.
scripts_df, characters_df, episods_df, locations_df = (
    pd.read_csv(csv_path, dtype='unicode')
    for csv_path in (
        'Data/Raw/simpsons_script_lines.csv',
        'Data/Raw/simpsons_characters.csv',
        'Data/Raw/simpsons_episodes.csv',
        'Data/Raw/simpsons_locations.csv',
    )
)

# Preview the first rows of the script lines.
scripts_df.head()

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464,3,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9,3,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464,3,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9,3,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40,3,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33


## Cleanup

### Drop Irrelevant Columns

In [3]:
# Each frame is rebound to a copy without the columns treated as irrelevant.

# locations_df
locations_df = locations_df.drop(columns=['name'])

# episods_df
episods_df = episods_df.drop(
    columns=['image_url', 'original_air_year', 'video_url', 'production_code', 'views']
)

# characters_df
characters_df = characters_df.drop(columns=['normalized_name', 'gender'])

# scripts_df
scripts_df = scripts_df.drop(
    columns=[
        'word_count',
        'character_id',
        'location_id',
        'timestamp_in_ms',
        'normalized_text',
        'raw_text',
        'number',
        'id',
    ]
)

### Fix Types

In [4]:
# locations_df: the CSVs were loaded as text, so the ID needs an explicit integer cast
locations_df = locations_df.astype({'id': int})

# episods_df: cast the numeric columns, then parse the air date as YYYY-MM-DD
episodes_types = dict(
    id=int,
    imdb_rating=float,
    imdb_votes=float,
    number_in_season=int,
    number_in_series=int,
    season=int,
    us_viewers_in_millions=float,
)
episods_df = episods_df.astype(episodes_types).assign(
    original_air_date=lambda frame: pd.to_datetime(frame['original_air_date'], format='%Y-%m-%d')
)

# characters_df: integer ID, same as for locations
characters_df = characters_df.astype({'id': int})

# scripts_df
# Target dtypes for the ID columns of the script lines. Some of these columns
# are removed by the "Drop Irrelevant Columns" step, so only the ones still
# present in scripts_df are cast (casting a missing column would raise KeyError).
scripts_types = {
    'id': int,
    'episode_id': int,
    'character_id': int,
    'location_id': int,
}
# Fix: this line used to re-apply `episodes_types` to `episods_df` (a no-op
# repeat of the episodes cast), so `scripts_types` was never used and the
# scripts_df IDs stayed text-typed.
# Note: astype(int) raises if a remaining ID column holds missing or
# non-numeric values.
scripts_df = scripts_df.astype(
    {column: dtype for column, dtype in scripts_types.items() if column in scripts_df.columns}
)

### More Filtering

In [5]:
# Rename columns of episods_df to shorter names
episods_df = episods_df.rename(columns={
    'imdb_rating': 'imdb',
    'original_air_date': 'datetime',
    'us_viewers_in_millions': 'us_viewers',
})

# Convert 'us_viewers' from millions to absolute viewer counts.
# Vectorized multiply instead of a per-row lambda; missing values stay NaN.
# NOTE: re-running this cell multiplies the column again — run it once per kernel.
episods_df['us_viewers'] = episods_df['us_viewers'] * 10 ** 6

# Keep only the spoken lines of scripts_df, then drop the flag and rename the
# remaining text columns.
# - The flag is compared case-insensitively so that both 'true' and 'True'
#   are kept; the previous exact match on 'true' would silently drop every
#   row if the raw values are capitalised.
# - The filter, drop and rename are chained on a new frame; the previous
#   in-place drop/rename on a filtered slice triggered SettingWithCopyWarning.
scripts_df = (
    scripts_df
    .loc[lambda frame: frame['speaking_line'].astype(str).str.lower() == 'true']
    .drop(columns=['speaking_line'])
    .rename(columns={
        'spoken_words': 'raw_text',
        'raw_character_text': 'character',
        'raw_location_text': 'location',
    })
)

## Save Datasets

In [6]:
# Write the cleaned script lines and episode metadata as CSV (without the index).
# The output directory is created first so that a fresh checkout without
# Data/Processed does not fail on the very last cell.
# NOTE(review): characters_df and locations_df are cleaned above but not
# saved here — confirm that is intentional.
Path('Data/Processed').mkdir(parents=True, exist_ok=True)

scripts_df.to_csv('Data/Processed/simpsons_cleaned_script_lines.csv', index=False)
episods_df.to_csv('Data/Processed/simpsons_cleaned_episodes.csv', index=False)