# Preprocessing

## Imports

In [1]:
import pandas as pd
import numpy as np
import re
import spacy

# Load the spaCy 'en_core_web_md' pipeline once; normalize_text() below reuses it for every line.
nlp = spacy.load('en_core_web_md')

## Load Datasets

In [2]:
# Read every column as text; the columns that need real types are cast in the "Fix Types" step.
episods_df = pd.read_csv('Data/Raw/simpsons_episodes.csv', dtype='unicode')
scripts_df = pd.read_csv('Data/Raw/simpsons_script_lines.csv', dtype='unicode')

# Preview the first rows of the script lines
scripts_df.head(5)

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464,3,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9,3,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464,3,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9,3,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40,3,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33


## Cleanup

### Drop Irrelevant Columns

In [3]:
# episods_df: remove the media/production columns
episods_df = episods_df.drop(
    columns=[
        'image_url',
        'original_air_year',
        'video_url',
        'production_code',
        'views',
    ]
)

# scripts_df: remove the id/timestamp columns and the pre-computed text columns
scripts_df = scripts_df.drop(
    columns=[
        'character_id',
        'location_id',
        'timestamp_in_ms',
        'normalized_text',
        'raw_text',
        'number',
        'id',
    ]
)

### Fix Types

In [4]:
# episods_df: every column was read as text, so cast the numeric ones explicitly
episodes_types = {
    'id': int,
    'imdb_rating': float,
    'imdb_votes': float,
    'number_in_season': int,
    'number_in_series': int,
    'season': int,
    'us_viewers_in_millions': float,
}
episods_df = episods_df.astype(episodes_types)
episods_df['original_air_date'] = pd.to_datetime(episods_df['original_air_date'], format = '%Y-%m-%d')

# scripts_df
scripts_types = {
    'id': int,
    'episode_id': int,
    'character_id': int,
    'location_id': int,
}
# Apply the mapping to scripts_df (it was previously applied to episods_df a second time
# by mistake). Some of the listed columns may already have been dropped, and astype()
# raises on unknown keys, so only the columns still present are cast.
scripts_df = scripts_df.astype({
    column: dtype
    for column, dtype in scripts_types.items()
    if column in scripts_df.columns
})

### More Filtering

In [5]:
# Drop the 'id' column of episods_df and shorten the remaining column names
episods_df = (
    episods_df
    .drop(columns = ['id'])
    .rename(columns = {'imdb_rating': 'imdb', 'original_air_date': 'datetime', 'us_viewers_in_millions': 'us_viewers'})
)

# Convert 'us_viewers' from millions to the absolute number of viewers (vectorised)
episods_df['us_viewers'] = episods_df['us_viewers'] * (10 ** 6)

# Keep only the spoken lines of scripts_df. Everything was loaded as text, so the flag is
# compared against the string 'true'. The drop/rename are chained on the filtered result
# instead of mutating a slice in place, which avoids pandas' SettingWithCopyWarning.
scripts_df = (
    scripts_df[scripts_df['speaking_line'] == 'true']
    .drop(columns = ['speaking_line'])
    .rename(columns = {'spoken_words': 'raw_text', 'raw_character_text': 'character', 'raw_location_text': 'location'})
)

## Text Processing

In [30]:
def normalize_text(text):
    """Return the lemmas of `text` joined by single spaces.

    The text is lower-cased and run through the module-level spaCy pipeline `nlp`;
    stop words and punctuation tokens are discarded.

    Parameters
    ----------
    text : str
        Raw spoken line. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) is treated as an empty line.

    Returns
    -------
    str
        Space-separated lemmas; '' when nothing remains or `text` is not a string.
    """
    # Missing cells arrive as float NaN, which has no .lower(); treat them as empty text.
    if not isinstance(text, str):
        return ''
    return ' '.join(
        token.lemma_
        for token in nlp(text.lower())
        if not (token.is_stop or token.is_punct)
    )

scripts_df['normalized_text'] = scripts_df['raw_text'].apply(normalize_text)

In [31]:
# Display scripts_df to inspect the result of the text normalization step
scripts_df

Unnamed: 0,episode_id,character,location,raw_text,word_count,normalized_text
0,32,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",31,actually little disease magazine news show nat...
1,32,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,3,mr bergstrom
2,32,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,22,know sure like talk touch lesson plan teach
3,32,Lisa Simpson,Springfield Elementary School,That life is worth living.,5,life worth live
4,32,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,33,poll open end recess case decide thought final...
...,...,...,...,...,...,...
158266,32,Miss Hoover,Springfield Elementary School,I'm back.,2,
158267,32,Miss Hoover,Springfield Elementary School,"You see, class, my Lyme disease turned out to ...",10,class lyme disease turn
158268,32,Miss Hoover,Springfield Elementary School,Psy-cho-so-ma-tic.,1,psy cho ma tic
158269,32,Ralph Wiggum,Springfield Elementary School,Does that mean you were crazy?,6,mean crazy


## Save Datasets

In [37]:
# Write both processed frames to disk without the index column
for frame, destination in (
    (scripts_df, 'Data/Processed/scripts.csv'),
    (episods_df, 'Data/Processed/episodes.csv'),
):
    frame.to_csv(destination, index=False)