## Info
In this notebook I've provided the primary code I used to collect data for this project and build my dataset. Most of it was scraped from a handful of sites, each noted below. My workflow was not linear and so I've pieced this together as best I can, and hopefully in a way that can be reasonably understood by anyone reading it.

## Imports

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import requests
import re
from bs4 import BeautifulSoup
from datetime import date
from fake_useragent import UserAgent
import pickle

# I built and ran the functions used below directly in my scratch notebooks, and they cannot be imported correctly in this context.
# All referenced functions are viewable in the included simpsons_data_functions.py file

import simpsons_data_functions

## Scraping and cleaning

In [2]:
# Build the request headers used by the scraping cells below: a single
# 'User-agent' entry whose value is drawn from fake_useragent's `random` attribute.
ua = UserAgent()
user_agent = {'User-agent': ua.random}

In [None]:
# Build the main dataframe by passing the integers 1..30 (presumably season
# numbers) to the scraping helper.
# NOTE(review): `get_simpsons_data` is not defined in this notebook and only the
# `simpsons_data_functions` module itself is imported above, so on a fresh kernel
# this name must first be brought into scope -- confirm how.
season_numbers = list(range(1, 31))
everything = get_simpsons_data(season_numbers)

In [None]:
# Quick look at season mean
# NOTE(review): `for_modeling` is not created anywhere in this notebook -- it must
# already exist in the kernel for this cell to run.

sns.set_theme(style="white", context="talk")
enmax_palette = ["#FFD99F", "#82CBEC", "#424F46"]
color_codes_wanted = ['simpsons_yellow', 'simpsons_blue', 'simpsons_black']
c = lambda x: enmax_palette[color_codes_wanted.index(x)]

# Shared text styling
axis_label_font = {"fontsize": 14, "color": "black", "weight": "black"}
bar_label_font = {"fontsize": 10, "color": "black", "weight": "black"}

plt.figure(figsize=(10, 4), dpi=200, frameon=False, edgecolor="white")
sns.barplot(x="Season", y="Rating", data=for_modeling, color="coral", ci=None)
plt.yticks(ticks=[5, 5.5, 6, 6.5, 7, 7.5, 8, 8.5, 9, 9.5, 10], size=9, color="black")
plt.ylim(5, 9.5)
plt.ylabel("", fontdict=axis_label_font)
plt.xticks(size=9, color="black")
plt.xlabel("", fontdict=axis_label_font, y=1.5)

# Hand-placed value labels, one per bar: (x position, y position, text).
# These are hard-coded, so they must be edited by hand if the data changes.
season_mean_labels = [
    (-.32, 7.8, "7.7"),
    (.67, 8.1, "8.1"),
    (1.65, 8.25, "8.2"),
    (2.66, 8.36, "8.3"),
    (3.66, 8.44, "8.4"),
    (4.66, 8.43, "8.4"),
    (5.66, 8.42, "8.4"),
    (6.66, 8.31, "8.3"),
    (7.66, 7.86, "7.8"),
    (8.65, 7.64, "7.6"),
    (9.65, 7.31, "7.3"),
    (10.64, 7.36, "7.3"),
    (11.64, 7.11, "7.1"),
    (12.64, 7.04, "7.0"),
    (13.64, 6.95, "6.9"),
    (14.64, 6.93, "6.9"),
    (15.64, 6.82, "6.8"),
    (16.64, 6.86, "6.8"),
    (17.64, 6.86, "6.8"),
    (18.63, 6.82, "6.8"),
    (19.63, 6.74, "6.7"),
    (20.63, 6.65, "6.6"),
    (21.63, 6.815, "6.8"),
    (22.63, 6.62, "6.6"),
    (23.63, 6.615, "6.6"),
    (24.63, 6.58, "6.6"),
]
for label_x, label_y, label_text in season_mean_labels:
    plt.text(label_x, label_y, label_text, fontdict=bar_label_font)

In [None]:
# Quick look at ratings in a countplot: one bar per distinct "Rating" value

fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x="Rating", data=for_modeling, ax=ax);

In [None]:
# Cleaning character names and viewing appearances

# Flatten the per-episode character lists into one long list of appearances.
all_character_appearances = []
for ep in everything["Characters"]:
    all_character_appearances.extend(ep)
character_appearances_series = pd.Series(all_character_appearances, name="Character appearances")

# Ordered clean-up steps: (pattern, canonical name, is_regex).
# Regex steps substitute the matched part of a name; plain steps only replace
# values that are exactly equal to the pattern. The order matters because some
# later plain steps tidy up names produced by earlier regex steps
# (e.g. "Charles Mr. Burns", "Abraham 'Grampa Simpson").
name_fixes = [
    ("(Homer)(.)*", "Homer Simpson", True),
    ("(Grampa)(.)*", "Grampa Simpson", True),
    ("(Abraham Simpson)(.)*", "Grampa Simpson", True),
    ("Abe Simpson", "Grampa Simpson", False),
    ("Barney", "Barney Gumble", False),
    ("(Mont(.*))(.)*(Burns)", "Mr. Burns", True),
    ("Carl Carlson", "Carl", False),
    ("Lenny Leonard", "Lenny", False),
    ("Otto Mann", "Otto", False),
    ("(^Principal Seymour Skinner|^Skinner)", "Principal Skinner", True),
    ("Seymour Skinner", "Principal Skinner", False),
    ("Krusty", "Krusty the Klown", False),
    ("Apu Nahasapeemapetilon", "Apu", False),
    ("Waylon Smithers", "Smithers", False),
    ("Gil Gunderson", "Gil", False),
    ("Captain McCallister", "Sea Captain", False),
    ("The Sea Captain", "Sea Captain", False),
    ("Captain Horatio McCallister", "Sea Captain", False),
    ("Captain McAllister", "Sea Captain", False),
    ("Kearney Zzyzwicz", "Kearney", False),
    ("Moe", "Moe Szyslak", False),
    ("(^Dr. Julius Hibbert|^Hibbert)", "Dr. Hibbert", True),
    ("(^Prof. Frink|^Frink)", "Professor Frink", True),
    ("Ralph", "Ralph Wiggum", False),
    ("Quimby", "Mayor Quimby", False),
    ("Willie", "Groundskeeper Willie", False),
    ("Groundskeeper WIllie", "Groundskeeper Willie", False),
    ("Superintendant Chalmers", "Superintendent Chalmers", False),
    # Previously mapped to the misspelled "Superintendant Chalmers", which split
    # this character's appearances across two different spellings.
    ("Chalmers", "Superintendent Chalmers", False),
    ("Charles Mr. Burns", "Mr. Burns", False),
    ("Abraham 'Grampa Simpson", "Grampa Simpson", False),
    ("Hans Moleman", "Moleman", False),
    ("Maggie", "Maggie Simpson", False),
    ("Chief Clancy Wiggum", "Chief Wiggum", False),
    ("Ned", "Ned Flanders", False),
    ("Officer Lou", "Lou", False),
]
for pattern, canonical_name, is_regex in name_fixes:
    character_appearances_series = character_appearances_series.replace(
        to_replace={pattern: canonical_name}, regex=is_regex
    )

# Appearance counts per character. Naming the index and the count column
# explicitly yields the columns "Character" / "Character appearances" regardless
# of how the installed pandas version labels value_counts() output.
character_appearances_df = (
    character_appearances_series
    .value_counts()
    .rename_axis("Character")
    .reset_index(name="Character appearances")
)

In [None]:
# Converting "Airdate" column to datetime and new column "Year"

# The .dt accessor only works on datetime-like columns, so convert first.
# This leaves the column unchanged if it is already datetime.
everything["Airdate"] = pd.to_datetime(everything["Airdate"])
everything["Year"] = everything["Airdate"].dt.year

In [None]:
# Renaming writers column: "Writers/directors" becomes "Writing, etc., credits"

credit_column_names = {"Writers/directors": "Writing, etc., credits"}
everything.rename(columns=credit_column_names, inplace=True)

In [None]:
# Getting episode writers as a single string
# NOTE(review): `writing_credits_list` is not defined in this notebook; it is
# used here as a collection of substrings that mark a credit as a writing credit.

new_writers_column = []
for episode in everything["Writing, etc., credits"]:
    # Keep each credited person whose credit text contains one of the writing
    # credit markers; the name is whatever precedes the first "(".
    episode_writers = [
        person.split("(")[0].strip()
        for person in episode
        if any(credit in person for credit in writing_credits_list)
    ]
    # dict.fromkeys removes duplicate names while keeping first-seen order.
    # Joining a set() gave a different name order from one kernel run to the next.
    new_writers_column.append(", ".join(dict.fromkeys(episode_writers)))

everything["Written by"] = new_writers_column

In [None]:
# The script-lines CSV was downloaded manually from Kaggle; it is read here from
# the notebook's working directory.

script_lines_path = "simpsons_script_lines.csv"
dialogue_lines = pd.read_csv(script_lines_path)


In [None]:
# Identifying missing seasons: list the distinct episode ids in ascending order
# so gaps in the dialogue data are easy to spot

dialogue_lines["episode_id"].sort_values().unique()

In [None]:
# Removing non-spoken lines, etc., and renaming columns

# NOTE(review): "spoken_words" is dropped here, so the former rename entry
# "spoken_words" -> "Lines" could never take effect and has been removed.
# Confirm whether the column was meant to be kept and renamed instead.
dialogue_lines = dialogue_lines.drop(
    columns=["raw_text", "spoken_words", "timestamp_in_ms", "id", "number", "location_id", "raw_location_text"]
)
dialogue_lines = dialogue_lines.rename(
    columns={
        "speaking_line": "Speaking line",
        "episode_id": "Episode number",
        "character_id": "Character ID",
        "raw_character_text": "Character",
        "normalized_text": "Raw text",
        "word_count": "Word count",
    }
)

# Normalise the flag to real booleans by assigning the result back. The chained
# in-place replace on a column selection is not guaranteed to update the frame.
dialogue_lines["Speaking line"] = dialogue_lines["Speaking line"].replace({"true": True, "false": False})

# Keep only spoken lines. The column is "Speaking line" (lower-case "l");
# the previous "Speaking Line" lookup raised a KeyError.
dialogue_lines = dialogue_lines[dialogue_lines["Speaking line"] == True]

In [None]:
# Looking at what dialogue looks like for a single episode

# Columns were renamed in the clean-up cell above, so filter on "Episode number"
# and use "Word count". The filtered frame is kept (as an independent copy)
# under the name the following lines expect.
homer_at_the_bat = dialogue_lines[dialogue_lines["Episode number"] == 52].copy()
homer_at_the_bat["Word count"] = homer_at_the_bat["Word count"].astype(int)
total_words = homer_at_the_bat["Word count"].sum()

# Each line's share of the episode's total word count, in percent
homer_at_the_bat["percentage_of_words"] = homer_at_the_bat["Word count"] * 100 / total_words
homer_at_the_bat.head()


In [None]:
# Adding a column detailing whether an episode was a "Treehouse of Horror" episode
# Note: I understand now that this should be done after splitting into training and testing sets

# 1 when the row label (the index) contains "Treehouse of Horror", else 0.
# Converting the boolean mask straight to int replaces the bool -> str -> int
# round trip and the chained in-place replace, which is not guaranteed to
# update the dataframe.
everything["Treehouse of Horror?"] = everything.index.str.contains("Treehouse of Horror", na=False).astype(int)

In [18]:
# Adding a column detailing whether an episode was a season premiere or not
# Note: I understand now that this should be done after splitting into training and testing sets

# 1 when "Episode number" equals 1, else 0. Assigned directly rather than via a
# chained in-place replace, which is not guaranteed to update the dataframe.
everything["Season premier"] = (everything["Episode number"] == 1).astype(int)

In [None]:
# Adding columns detailing whether an episode's description mentions one of the
# four main characters (Homer, Marge, Bart, and Lisa) or Milhouse
# Note: I understand now that this should be done after splitting into training and testing sets

# One 0/1 column per character. "Marge story" previously searched for "Bart"
# (copy-paste slip); every column now searches for its own character's name.
# na=False makes a missing description count as "not mentioned".
for character in ["Homer", "Bart", "Lisa", "Marge", "Milhouse"]:
    everything[character + " story"] = (
        everything["Description"].str.contains(character, na=False).astype(int)
    )

In [None]:
# Adding a column with the number of writers credited for each episode
# NOTE(review): `writing_credits_list` is not defined in this notebook; it is
# used here as a collection of substrings that mark a credit as a writing credit.

writer_count_column = []
for episode in everything["Writing, etc., credits"]:
    # Distinct names only, so a person listed under more than one writing credit
    # is counted once -- consistent with the de-duplicated "Written by" column.
    episode_writers = {
        person.split("(")[0].strip()
        for person in episode
        if any(credit in person for credit in writing_credits_list)
    }
    writer_count_column.append(len(episode_writers))
everything["Number of writers"] = writer_count_column

In [None]:
# Adding a column detailing whether an episode was written by John Swartzwelder

# 1 when the "Written by" string contains his name, else 0. Assigned directly
# rather than via a chained in-place replace, which is not guaranteed to update
# the dataframe.
everything["Written by John Swartzwelder"] = (
    everything["Written by"].str.contains("John Swartzwelder", na=False).astype(int)
)

In [None]:
# Pulling percentage of episode dialogue for each character in each episode
# NOTE(review): none of these *_series helpers is defined in this notebook and
# they take no arguments, so they presumably read the dialogue data from global
# state -- confirm what must already be in scope before running this cell.
# Each result is later iterated once per episode and divided by 100.

homer_percentage = homer_series()
bart_percentage = bart_series()
lisa_percentage = lisa_series()
marge_percentage = marge_series()
moe_percentage = moe_series()
milhouse_percentage = milhouse_series()
mrburns_percentage = mrburns_series()
grampa_percentage = grampa_series()
flanders_percentage = flanders_series()
skinner_percentage = skinner_series()

In [None]:
# Copying the first 568 rows of the main df into a new, independent dataframe
# NOTE(review): 568 is presumably the number of episodes covered by the dialogue
# data -- confirm.

# .copy() matters: the following cells add and drop columns/rows on
# `new_everything`, and doing that on a bare slice of `everything` triggers
# SettingWithCopyWarning and is not guaranteed to behave consistently.
new_everything = everything.iloc[:568].copy()

In [None]:
# Creating columns for each character's dialogue contribution as a fraction of the total.
# In the case of the secondary characters, the "v2" column measures contribution as a
# fraction of the words not spoken by any of the four main characters.

# Main family: percentage -> fraction
main_family = [
    ("Homer", homer_percentage),
    ("Bart", bart_percentage),
    ("Lisa", lisa_percentage),
    ("Marge", marge_percentage),
]
for character, percentages in main_family:
    new_everything[character + " %"] = [value / 100 for value in percentages]

new_everything["Family total"] = (
    new_everything["Homer %"]
    + new_everything["Marge %"]
    + new_everything["Bart %"]
    + new_everything["Lisa %"]
)

# Secondary characters: the raw fraction, plus the fraction re-based on the
# share of dialogue not taken by the four family members
non_family_share = 1 - new_everything["Family total"]
secondary_characters = [
    ("Moe", moe_percentage),
    ("Milhouse", milhouse_percentage),
    ("Mr. Burns", mrburns_percentage),
    ("Grampa", grampa_percentage),
    ("Flanders", flanders_percentage),
    ("Skinner", skinner_percentage),
]
for character, percentages in secondary_characters:
    new_everything[character + " %"] = [value / 100 for value in percentages]
    new_everything[character + " % v2"] = [value / 100 for value in percentages] / non_family_share

In [None]:
# Creating new column that is distance from each episode's season mid-point

# Half the number of episodes in the row's own season. transform("count")
# returns one value per row, aligned on the index, so this no longer relies on
# the rows being sorted by season (the previous positional list did).
episodes_in_season = new_everything.groupby("Season")["Episode number"].transform("count")
new_everything["Season mid_point"] = episodes_in_season / 2

# Relative distance from the mid-point: 0 at the middle of the season,
# approaching 1 towards either end.
new_everything["When in season"] = np.abs(
    (new_everything["Episode number"] - new_everything["Season mid_point"]) / new_everything["Season mid_point"]
)

In [None]:
# Getting guest stars from TVDB
# NOTE(review): the result is bound to the same name as the helper it calls, so
# re-running this cell calls the result instead of the helper -- confirm whether
# one of the two should be renamed.
# NOTE(review): the argument runs 0..26 while the other scraping cells start at
# 1 -- confirm what the helper expects.

season_indices = list(range(27))
guest_stars = guest_stars(season_indices)

In [None]:
# Cleaning out "guest stars" that have appeared many times: Pamela Hayden, who does
# Milhouse, Jimbo and Rod Flanders, has done > 600 episodes. Tress MacNeille, who does
# Agnes and others, has appeared in > 550 episodes. Maggie Roswell, who does Helen
# Lovejoy, Maude Flanders and Miss Hoover, has done > 200 episodes.
# Empty-list placeholders are removed as well.

not_real_guests = ([], 'Pamela Hayden', 'Tress MacNeille', 'Maggie Roswell')

for item in guest_stars:
    for regular in not_real_guests:
        # list.remove drops only the first matching entry, and only if present
        if regular in item:
            item.remove(regular)

In [None]:
# Creating "Number of guest stars" column: the length of each row's "Guest stars" entry
# NOTE(review): no visible cell assigns a "Guest stars" column to `new_everything`
# -- confirm where it comes from.

new_everything["Number of guest stars"] = new_everything["Guest stars"].apply(len)

In [None]:
# Scraping ratings/rater data from IMDB and creating columns for each segment
# NOTE(review): `get_imdb_ratings_info` and `make_rater_columns` are not defined
# in this notebook; the structure described below is what this cell relies on.

ratings_data = get_imdb_ratings_info(list(range(1, 27)))

# Flatten the nested {outer key: {inner key: ratings}} mapping into one entry
# per inner item: a list of [key, demos] pairs taken from ratings[1].
rating_demos = [
    [[key, demos] for key, demos in ratings[1].items()]
    for data in ratings_data.values()
    for ratings in data.values()
]

# Position of each rater group inside an episode's list of pairs, with the
# prefix used for its column names.
rater_groups = [(0, "All"), (1, "Male"), (2, "Female")]

# Column-name suffix for each of the five lists returned by make_rater_columns,
# in the order they are returned.
age_brackets = ["", " <18", " 18-29", " 30-44", " 45+"]

for position, group in rater_groups:
    group_demos = [episode[position][1] for episode in rating_demos]

    # Unpack explicitly so an unexpected number of returned lists fails loudly.
    overall, minors, age_18_29, age_30_44, age_45_up = make_rater_columns(group_demos)

    for bracket, values in zip(age_brackets, [overall, minors, age_18_29, age_30_44, age_45_up]):
        values = values[:568]  # same row cut-off as `new_everything`
        # Each entry is indexed as [0] -> rating, [1] -> number of ratings.
        # The final columns are written directly instead of going through
        # temporary columns that were parsed and then dropped.
        new_everything[group + bracket + ": Number of ratings"] = [entry[1] for entry in values]
        new_everything[group + bracket + ": Rating"] = [entry[0] for entry in values]

In [None]:
# Scraping TV viewership data first for seasons 1-20...
# Source: https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)#Episodes


def scrape_episode_viewers(url, headers=None, timeout=30):
    """Scrape episode names and viewer figures from a Wikipedia episode-list page.

    Parameters
    ----------
    url : str
        Page to fetch.
    headers : dict, optional
        HTTP headers sent with the request.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    (list of str, list of float)
        Episode names and viewer figures, in page order. A figure of "N/A" or
        "TBD" is recorded as NaN.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If a viewer figure is neither "N/A"/"TBD" nor parseable as a float.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(response.text, "html.parser")

    names = []
    viewers = []
    for season in soup.find_all("table", class_="wikitable plainrowheaders wikiepisodetable"):
        # Skip tables whose first header cell after the first row is "Title"
        header_test = season.find("tr")
        if header_test.findNext("th").text == "Title":
            continue
        for ep in season.find_all("tr", class_="vevent"):
            cells = ep.find_all("td")
            names.append(cells[1].text)
            # Drop any "[n]" footnote marker and surrounding whitespace
            rating = cells[6].text.split("[")[0].strip()
            if rating == "N/A" or rating == "TBD":
                viewers.append(np.nan)
            else:
                viewers.append(float(rating))
    return names, viewers


episode_names, viewers_per_episode = scrape_episode_viewers(
    "https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)#Episodes",
    headers=user_agent,
)

# ...and then seasons 21-27
# Source: https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes

episode_names21_on, viewers_per_episode21_on = scrape_episode_viewers(
    "https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes",
    headers=user_agent,
)

viewers_per_episode.extend(viewers_per_episode21_on)
new_everything["TV viewers"] = viewers_per_episode[:568]

### Dropping NaNs

In [16]:
# Remove, in place, every row that has a missing value in any column
new_everything.dropna(axis=0, how="any", inplace=True)

### Dropping outlier "Lisa Goes Gaga" (3 std from mean with 3.9 rating)


In [None]:
# Remove the row labelled "Lisa Goes Gaga" in place (raises KeyError if the
# label is no longer present, e.g. when this cell is run twice)
new_everything.drop(index="Lisa Goes Gaga", inplace=True)

## Pickling dataset

In [None]:
# Serialise the cleaned dataframe to "simpsons_full.pickle" in the working directory
with open("simpsons_full.pickle", mode="wb") as pickle_file:
    pickle.dump(new_everything, pickle_file)

## Feature engineering

In [None]:
# Creating "Director mean" column
# NOTE(review): the means are computed over every row of `new_everything`,
# including each episode's own rating -- consider computing them on the training
# split only, as already noted for the other engineered columns.

ratings_mean = new_everything["Rating"].mean()
ratings_std = new_everything["Rating"].std()

# Mean episode rating per director, as a {director: mean rating} lookup
by_mean_rating = new_everything.groupby("Director")["Rating"].mean().reset_index()
director_dict = dict(zip(by_mean_rating["Director"], by_mean_rating["Rating"]))
director_by_mean = pd.DataFrame.from_dict(director_dict, orient='index')

# Standardised per-director means. The mean and std are taken as scalars from
# column 0: subtracting the one-row Series returned by DataFrame.mean() aligned
# on mismatched index labels and produced NaN for every director.
director_mean = director_by_mean[0].mean()
director_std = director_by_mean[0].std()
director_scores = (director_by_mean[0] - director_mean) / director_std

# Look up each episode's director mean (the original line was missing the "["
# before "Director" and did not parse).
new_everything["Director mean"] = new_everything["Director"].map(director_dict)

In [None]:
# Creating "Season score" column: each season's mean rating expressed as a
# z-score across seasons, then attached to every episode of that season

season_means = new_everything.groupby("Season")["Rating"].mean().rename("mean")
mean_of_season_means = season_means.mean()
std_of_season_means = season_means.std()
season_scores = (season_means - mean_of_season_means) / std_of_season_means
score_dict = season_scores.to_dict()
new_everything["Season score"] = new_everything["Season"].replace(to_replace=score_dict)

In [None]:
# Interacting secondary character dialogue percentages with "Season score":
# one "<character> multiplied" column per secondary character

for character in ["Milhouse", "Moe", "Mr. Burns", "Grampa", "Flanders", "Skinner"]:
    new_everything[character + " multiplied"] = (
        new_everything[character + " % v2"] * new_everything["Season score"]
    )

In [None]:
# Interacting "Written by John Swartzwelder" with "Season score"
# (element-wise product of the two columns)

new_everything["John Swartzwelder multiplied"] = new_everything["Written by John Swartzwelder"].mul(
    new_everything["Season score"]
)

In [None]:
# Interacting "Number of guest stars" with "Season score"
# (element-wise product of the two columns)

new_everything["Number of guest stars multiplied"] = new_everything["Number of guest stars"].mul(
    new_everything["Season score"]
)