This notebook pulls episode data for The Simpsons from Wikipedia, including the number of viewers each episode had on its original airing (within the US).

In [1]:
# First 20 seasons
# https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)

# Seasons 21 - current
# https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes

from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define Functions

In [2]:
def scrape_ep_tables(ep_url, timeout=30):
    """Get the episode tables (one per season) from a Wikipedia page.

    Parameters
    ----------
    ep_url : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up. Defaults to 30 so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    list-like of bs4 Tag
        Every ``<table>`` element on the page whose CSS class is
        ``wikiepisodetable``, in document order. Each entry is a separate
        table; note that not every table is necessarily a season.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status (4xx/5xx).
    """
    # Request the content of the web page from the server.
    response = get(ep_url, timeout=timeout)

    # Fail loudly on an HTTP error instead of parsing an error page and
    # silently returning an empty list of tables.
    response.raise_for_status()

    # Parse the content of the response with BeautifulSoup.
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Extract each episode table; each index is a separate table.
    return page_html.find_all('table', class_='wikiepisodetable')

In [7]:
def extract_ep_details(rows, season=None):
    """Extract the details of every episode from the rows of one season table.

    Parameters
    ----------
    rows : iterable
        Table rows exposing ``find('th')`` and ``find_all('td')`` (e.g. the
        ``<tr>`` tags of a BeautifulSoup table). Rows without a ``<th>`` cell
        are ignored. The first row that does have a ``<th>`` cell is treated
        as the column header and dropped.
    season : optional
        Value stored in the ``season`` column of every returned row. When
        omitted, the module-level variable ``season`` is used, which keeps
        existing calls of the form ``extract_ep_details(rows)`` working.

    Returns
    -------
    pandas.DataFrame
        One row per episode with the columns ``episode``, ``ep_title``,
        ``director``, ``writer``, ``air_date``, ``prod_code``, ``viewers``,
        ``ep_overall`` and ``season``. Empty (but with those columns) when
        ``rows`` contains no usable row.

    Raises
    ------
    NameError
        If ``season`` is not passed and no module-level ``season`` exists.
    ValueError
        Raised by pandas if an episode row does not have exactly seven
        ``<td>`` cells.
    """
    if season is None:
        # Backward compatibility: fall back to the notebook-level counter.
        try:
            season = globals()['season']
        except KeyError:
            raise NameError("name 'season' is not defined") from None

    season_list = []
    for row in rows:
        header_cell = row.find('th')
        if header_cell is None:
            # No <th> means no overall episode number: not an episode row.
            continue
        overall_ep = header_cell.text.strip()

        cols = [cell.text.strip() for cell in row.find_all('td')]
        cols.append(overall_ep)  # Append the overall episode number
        cols.append(season)      # Append the season number
        season_list.append(cols)

    # Drop the header row. Slicing (unlike ``del season_list[0]``) is safe
    # when no row was collected at all.
    episode_rows = season_list[1:]

    return pd.DataFrame(
        episode_rows,
        columns=['episode', 'ep_title', 'director', 'writer',
                 'air_date', 'prod_code', 'viewers', 'ep_overall', 'season'],
    )

In [11]:
def identify_exceptions(rows):
    """Identify tables which were found but do not represent a season of episodes (e.g. movie).

    Only the ``<th>`` cells of the first row are inspected.

    Parameters
    ----------
    rows : sequence
        Rows of one table; ``rows[0]`` must expose ``find_all('th')``.

    Returns
    -------
    bool
        ``True`` when the table should be skipped: it has no rows at all, one
        of its first-row headers is exactly ``'Screenplay by'``, or one of
        them contains ``'Household'``. ``False`` otherwise.
    """
    # A table without any rows cannot hold a season; skip it rather than
    # failing with IndexError on rows[0].
    if len(rows) == 0:
        return True

    for header in rows[0].find_all('th'):
        header_text = header.text.strip()
        if header_text == 'Screenplay by' or 'Household' in header_text:
            return True

    return False

In [32]:
def remove_reference_tags(ep_list):
    """Strip bracketed reference tags (such as ``[1]``) from the viewers column.

    The ``viewers`` column of ``ep_list`` is overwritten in place and the same
    DataFrame object is returned.
    """
    reference_tag = re.compile(r'\[[^\]]*\]')

    def strip_tags(value):
        # Remove every "[...]" group found in the value.
        return reference_tag.sub('', value)

    ep_list['viewers'] = ep_list['viewers'].apply(strip_tags)
    return ep_list

# Build Dataset

In [5]:
# Source pages: url1 lists seasons 1-20, url2 is the main episode list.
url1 = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)'
url2 = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes'

# Scrape both pages in order, keeping the per-page results available.
ep_tables1, ep_tables2 = (scrape_ep_tables(page_url) for page_url in (url1, url2))

# Combined sequence of tables: those from url1 first, then those from url2.
ep_tables = ep_tables1 + ep_tables2

In [10]:
# Start from an empty frame so ep_list has the expected columns even when no
# season table is found.
ep_list = pd.DataFrame(columns=['episode', 'ep_title', 'director', 'writer',
                                'air_date', 'prod_code', 'viewers', 'ep_overall', 'season'])
season = 1       # Season counter; extract_ep_details is called without it and reads it from module scope
max_season = 31  # Last season to collect

# Collect one DataFrame per season and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all previous rows on every iteration.
season_frames = []
for season_table in ep_tables:
    if season > max_season:
        break  # Every remaining table would be skipped too

    season_rows = season_table.find_all('tr')

    if identify_exceptions(season_rows):  # Skip table if not a season of episodes
        continue

    simpsons_episodes = extract_ep_details(season_rows)
    season_frames.append(simpsons_episodes)
    season += 1  # Increment season if table not skipped

if season_frames:
    # Index values are kept as produced per season (not reset), matching what
    # repeated DataFrame.append calls used to give.
    ep_list = pd.concat(season_frames)

In [31]:
# Strip bracketed reference tags (e.g. "[1]") from the viewers column.
ep_list = remove_reference_tags(ep_list)

In [30]:
# Preview the first rows only instead of rendering the whole table.
ep_list.head(10)

Unnamed: 0,episode,ep_title,director,writer,air_date,prod_code,viewers,ep_overall,season
0,1,"""Simpsons Roasting on an Open Fire""",David Silverman,Mimi Pond,"December 17, 1989 (1989-12-17)",7G08,26.7,1,1
1,2,"""Bart the Genius""",David Silverman,Jon Vitti,"January 14, 1990 (1990-01-14)",7G02,24.5,2,1
2,3,"""Homer's Odyssey""",Wesley Archer,Jay Kogen & Wallace Wolodarsky,"January 21, 1990 (1990-01-21)",7G03,27.5,3,1
3,4,"""There's No Disgrace Like Home""",Gregg Vanzo & Kent Butterworth,Al Jean & Mike Reiss,"January 28, 1990 (1990-01-28)",7G04,20.2,4,1
4,5,"""Bart the General""",David Silverman,John Swartzwelder,"February 4, 1990 (1990-02-04)",7G05,27.1,5,1
5,6,"""Moaning Lisa""",Wesley Archer,Al Jean & Mike Reiss,"February 11, 1990 (1990-02-11)",7G06,27.4,6,1
6,7,"""The Call of the Simpsons""",Wesley Archer,John Swartzwelder,"February 18, 1990 (1990-02-18)",7G09,27.6,7,1
7,8,"""The Telltale Head""",Rich Moore,"Al Jean, Mike Reiss, Sam Simon & Matt Groening","February 25, 1990 (1990-02-25)",7G07,28,8,1
8,9,"""Life on the Fast Lane""",David Silverman,John Swartzwelder,"March 18, 1990 (1990-03-18)",7G11,33.5,9,1
9,10,"""Homer's Night Out""",Rich Moore,Jon Vitti,"March 25, 1990 (1990-03-25)",7G10,30.3,10,1


# Save Data

In [33]:
# Write the episode table to a pickle file; the path is relative to the
# current working directory.
ep_list.to_pickle("./simpsons_wiki_episodes.pkl")