# Preprocessing

## Imports

In [67]:
import pandas as pd
import numpy as np
import spacy

# Load the spaCy pipeline named 'en_core_web_sm' once, up front, so every later
# cell can share the same `nlp` object. The model package has to be installed
# in the kernel's environment for this call to succeed.
nlp = spacy.load('en_core_web_sm')

## Load Datasets

In [68]:
# Read the four raw CSV exports. Every column is loaded as text
# (dtype = 'unicode'); numeric and date columns are cast in a later cell.
# The tuple order below must match the order of the names being assigned.
scripts_df, characters_df, episods_df, locations_df = (
    pd.read_csv(csv_path, dtype = 'unicode')
    for csv_path in (
        'Data/Raw/simpsons_script_lines.csv',
        'Data/Raw/simpsons_characters.csv',
        'Data/Raw/simpsons_episodes.csv',
        'Data/Raw/simpsons_locations.csv',
    )
)

## Preprocessing

### Drop Irrelevant Columns

In [69]:
# Remove the columns that are not kept for the rest of the notebook.
# Each frame is rebound to the trimmed copy returned by `drop`.

# locations_df
locations_df = locations_df.drop(columns = ['name'])

# episodes (the variable is spelled `episods_df` throughout the notebook)
episods_df = episods_df.drop(
    columns = [
        'image_url',
        'original_air_year',
        'video_url',
        'production_code',
        'views',
    ]
)

# characters_df
characters_df = characters_df.drop(columns = ['normalized_name', 'gender'])

# scripts_df
scripts_df = scripts_df.drop(
    columns = [
        'word_count',
        'raw_character_text',
        'raw_location_text',
        'timestamp_in_ms',
        'normalized_text',
        'raw_text',
        'number',
    ]
)

### Fix Types

In [70]:
# All CSV columns were loaded as text, so numeric and date columns are cast
# explicitly here, one frame at a time.

# locations_df
locations_df['id'] = locations_df['id'].astype(int)

# episodes (the variable is spelled `episods_df` throughout the notebook)
episodes_types = {
    'id': int,
    'imdb_rating': float,
    'imdb_votes': float,
    'number_in_season': int,
    'number_in_series': int,
    'season': int,
    'us_viewers_in_millions': float,
}
episods_df = episods_df.astype(episodes_types)
episods_df['original_air_date'] = pd.to_datetime(episods_df['original_air_date'], format = '%Y-%m-%d')

# characters_df
characters_df['id'] = characters_df['id'].astype(int)

# scripts_df
# The mapping used to be declared but never applied, which left the script id
# columns as text while the ids of the other frames were integers.
# Nullable 'Int64' is used instead of plain int because a plain int cast raises
# as soon as a column contains a missing value.
# NOTE(review): character_id / location_id presumably have gaps for some rows -
# confirm against the raw data.
scripts_types = {
    'id': 'Int64',
    'episode_id': 'Int64',
    'character_id': 'Int64',
    'location_id': 'Int64',
}
# `to_numeric` parses the text first; missing or unparsable entries end up as
# <NA> after the cast rather than aborting the whole cell.
scripts_df = scripts_df.assign(**{
    column: pd.to_numeric(scripts_df[column], errors = 'coerce').astype(dtype)
    for column, dtype in scripts_types.items()
})

### Rename Columns, Fix Formatting, Drop Rows

In [71]:
# Shorter column names for the episode frame.
episods_df = episods_df.rename(
    columns = {
        'imdb_rating': 'imdb',
        'original_air_date': 'datetime',
        'us_viewers_in_millions': 'us_viewers',
    }
)

# The source column is expressed in millions; scale it to an absolute count.
episods_df['us_viewers'] = episods_df['us_viewers'].mul(10 ** 6)

In [72]:
# Preview the first five script rows after the clean-up steps above.
scripts_df.head(n = 5)

Unnamed: 0,id,episode_id,speaking_line,character_id,location_id,spoken_words
0,9549,32,True,464,3,"No, actually, it was a little of both. Sometim..."
1,9550,32,True,9,3,Where's Mr. Bergstrom?
2,9551,32,True,464,3,I don't know. Although I'd sure like to talk t...
3,9552,32,True,9,3,That life is worth living.
4,9553,32,True,40,3,The polls will be open from now until the end ...


In [78]:
# Shape (rows, columns) of the subset whose `speaking_line` flag equals the
# string 'false'; the flag is compared as text, not as a boolean.
scripts_df.loc[scripts_df['speaking_line'].eq('false')].shape

(26158, 6)