This notebook scrapes the list of The Simpsons episodes from Wikipedia, including the number of viewers each episode had on its original airing (within the US).

In [78]:
# First 20 seasons
# https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)

# Seasons 21 - current
# https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(season_21%E2%80%93present)

from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import re

# Define Functions

In [79]:
def scrape_ep_tables(ep_url, timeout=30):
    """Get episode tables (for each season) from wikipedia.

    Parameters
    ----------
    ep_url : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait on the server before giving up (default 30).
        Without a timeout a stalled connection would block the notebook
        indefinitely.

    Returns
    -------
    The result of ``find_all`` for every ``<table>`` element carrying the
    ``wikiepisodetable`` class. Each index is a separate season table.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # Request the content of the web page from the server
    response = get(ep_url, timeout=timeout)

    # Fail loudly on an error status: otherwise the error page would be parsed
    # and the function would silently return no tables at all.
    response.raise_for_status()

    # Parse the content of the response with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Extract each season table; each index is a separate season
    return page_html.find_all('table', class_='wikiepisodetable')

In [80]:
def extract_row_cell(cells, row_span, row_cont):
    """Read one table row into a flat list of seven column values.

    A cell with a ``rowspan`` attribute also covers the same column in the
    following rows, so those rows contain fewer ``<td>`` elements. ``row_span``
    tracks, per column, how many upcoming rows are still covered; a covered
    column is emitted as an empty string and no cell is consumed for it.

    Parameters
    ----------
    cells : sequence
        The ``<td>`` elements of the current row, in order.
    row_span : dict
        Column name -> number of further rows still covered by an earlier cell.
        Updated in place.
    row_cont : dict
        Column name -> text of the cell currently spanning that column
        ('' when nothing spans it). Updated in place.

    Returns
    -------
    tuple
        ``(values, row_span, row_cont)`` where ``values`` is the list of the
        seven column strings and the two dicts are the updated inputs.
    """
    fields = ('ep_no', 'title', 'directed', 'written', 'airdate', 'prod_code', 'viewers')
    values = []
    cursor = 0  # position of the next unread <td> in this row

    for field in fields:
        remaining = row_span[field]
        carried = row_cont[field]

        if remaining > 0:
            # Column is covered by a cell from an earlier row: emit a blank
            text = ''
            remaining -= 1
        else:
            cell = cells[cursor]
            cursor += 1
            text = cell.text.strip()
            if cell.has_attr("rowspan"):
                # This cell also covers the next (rowspan - 1) rows
                remaining = int(cell["rowspan"]) - 1
                carried = text

        if remaining == 0:
            # Nothing spans this column any more, so forget the carried text
            carried = ''

        values.append(text)
        row_span[field] = remaining
        row_cont[field] = carried

    return values, row_span, row_cont

In [88]:
def extract_ep_details(rows, season):
    """Extract the details of every episode in one season table.

    Parameters
    ----------
    rows : iterable
        The ``<tr>`` elements of a season table.
    season : int
        Season number stored in the ``season`` column of every row produced.

    Returns
    -------
    pandas.DataFrame
        One row per table row that contains at least one ``<td>``, with the
        columns ``episode, ep_title, director, writer, air_date, prod_code,
        viewers, ep_overall, season``.

    Notes
    -----
    The overall episode number is read from the row's ``<th>``. A row without
    a ``<th>`` reuses the most recent one; if no ``<th>`` has been seen yet the
    value is an empty string (previously this raised ``UnboundLocalError``).
    """
    fields = ('ep_no', 'title', 'directed', 'written', 'airdate',
              'prod_code', 'viewers', 'overall_ep')
    # Rowspan state carried from one row to the next by extract_row_cell
    rowspan = dict.fromkeys(fields, 0)
    row_content = dict.fromkeys(fields, '')

    season_list = []
    overall_ep = ''  # text of the most recent <th>; defined up front so it is never unbound

    for row in rows:
        ep_header = row.find('th')
        if ep_header is not None:
            # Update overall ep no if th present
            overall_ep = ep_header.text.strip()

        cols = row.find_all('td')
        if len(cols) > 0:  # Account for seemingly empty rows (e.g. header-only rows)
            line, rowspan, row_content = extract_row_cell(cols, rowspan, row_content)
            line.append(overall_ep)  # Append the overall episode number
            line.append(season)  # Append Season No
            season_list.append(line)

    return pd.DataFrame(season_list, columns=['episode', 'ep_title', 'director', 'writer',
                                              'air_date', 'prod_code', 'viewers', 'ep_overall', 'season'])

In [89]:
def identify_exceptions(rows):
    """Tell whether a found table should be skipped because it is not a season of episodes (e.g. movie).

    Only the header cells (``<th>``) of the first row are inspected; the table
    is flagged when any of them contains the text 'Release date'.

    Returns True to skip the table, False otherwise.
    """
    header_cells = rows[0].find_all('th')
    # Build the full list first so every header is read, as a plain loop would
    matches = ['Release date' in cell.text.strip() for cell in header_cells]
    return any(matches)

In [90]:
def remove_reference_tags(ep_list):
    """Strip bracketed reference tags (such as ``[12]``) from the ``viewers`` column.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame is returned.
    """
    bracketed = r'\[[^\]]*\]'

    def strip_tags(text):
        # Delete every "[...]" group, leaving the surrounding text untouched
        return re.sub(bracketed, '', text)

    ep_list['viewers'] = ep_list['viewers'].apply(strip_tags)
    return ep_list

In [91]:
def remove_quotes(ep_list):
    """Delete every double-quote character from the episode titles.

    The ``ep_title`` column is overwritten on the DataFrame that was passed in,
    and that same DataFrame is returned.
    """
    def drop_quotes(title):
        return re.sub('"', '', title)

    cleaned_titles = ep_list['ep_title'].apply(drop_quotes)
    ep_list['ep_title'] = cleaned_titles
    return ep_list

In [97]:
def generate_episode_dataframe(ep_tables, max_season = 33):
    """Combine the scraped season tables into one cleaned episode DataFrame.

    Parameters
    ----------
    ep_tables : sequence
        Scraped table elements, in season order. Tables flagged by
        ``identify_exceptions`` (e.g. the movie) are skipped and do not
        consume a season number.
    max_season : int, optional
        Last season to include (default 33). Tables after it are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per episode with the columns ``episode, ep_title, director,
        writer, air_date, prod_code, viewers, ep_overall, season``, reference
        tags removed from ``viewers``, quotes removed from ``ep_title`` and a
        fresh 0..n-1 index. Empty (but with the same columns) when no season
        table was found.
    """
    columns = ['episode', 'ep_title', 'director', 'writer',
               'air_date', 'prod_code', 'viewers', 'ep_overall', 'season']
    season_frames = []
    season = 1

    for season_table in ep_tables:
        if season >= max_season+1:
            # Every requested season is collected; nothing later can qualify
            break

        season_rows = season_table.find_all('tr')

        if identify_exceptions(season_rows): # Skip table if not a season of episodes
            continue

        season_frames.append(extract_ep_details(season_rows, season))
        season += 1 # Increment season if table not skipped

    # Concatenate once at the end: avoids re-copying the growing frame on every
    # iteration and avoids concatenating onto an empty seed frame.
    if season_frames:
        ep_list = pd.concat(season_frames)
    else:
        ep_list = pd.DataFrame(columns=columns)

    ep_list = remove_reference_tags(ep_list)
    ep_list = remove_quotes(ep_list)

    # Per-season indices all start at 0, so renumber the combined frame
    return ep_list.reset_index(drop=True)

# Build Dataset

In [98]:
# The episode list is split across two Wikipedia pages: seasons 1-20 and season 21 onwards
url1 = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(seasons_1%E2%80%9320)'
url2 = 'https://en.wikipedia.org/wiki/List_of_The_Simpsons_episodes_(season_21%E2%80%93present)'

# Download each page and collect its episode tables
ep_tables1 = scrape_ep_tables(url1)
ep_tables2 = scrape_ep_tables(url2)

# Join both sets of tables, earlier seasons first
ep_tables = ep_tables1 + ep_tables2

In [99]:
# Build the episode DataFrame from the scraped tables (max_season left at its default)
ep_list = generate_episode_dataframe(ep_tables)

In [100]:
# Preview the first 30 rows to check the parsed columns
ep_list.head(30)

Unnamed: 0,episode,ep_title,director,writer,air_date,prod_code,viewers,ep_overall,season
0,1.0,Simpsons Roasting on an Open Fire,David Silverman,Mimi Pond,"December 17, 1989 (1989-12-17)",7G08,26.7,1,1
1,2.0,Bart the Genius,David Silverman,Jon Vitti,"January 14, 1990 (1990-01-14)",7G02,24.5,2,1
2,3.0,Homer's Odyssey,Wes Archer,Jay Kogen & Wallace Wolodarsky,"January 21, 1990 (1990-01-21)",7G03,27.5,3,1
3,4.0,There's No Disgrace Like Home,Gregg Vanzo & Kent Butterworth,Al Jean & Mike Reiss,"January 28, 1990 (1990-01-28)",7G04,20.2,4,1
4,5.0,Bart the General,David Silverman,John Swartzwelder,"February 4, 1990 (1990-02-04)",7G05,27.1,5,1
5,6.0,Moaning Lisa,Wes Archer,Al Jean & Mike Reiss,"February 11, 1990 (1990-02-11)",7G06,27.4,6,1
6,7.0,The Call of the Simpsons,Wes Archer,John Swartzwelder,"February 18, 1990 (1990-02-18)",7G09,27.6,7,1
7,8.0,The Telltale Head,Rich Moore,"Al Jean, Mike Reiss, Sam Simon & Matt Groening","February 25, 1990 (1990-02-25)",7G07,28.0,8,1
8,9.0,Life on the Fast Lane,David Silverman,John Swartzwelder,"March 18, 1990 (1990-03-18)",7G11,33.5,9,1
9,10.0,Homer's Night Out,Rich Moore,Jon Vitti,"March 25, 1990 (1990-03-25)",7G10,30.3,10,1


In [102]:
# (rows, columns) of the assembled dataset
ep_list.shape

(763, 9)

# Save Data

In [103]:
# Save the DataFrame as a pickle file.
# NOTE(review): assumes the ./data directory already exists — confirm before running on a fresh checkout
ep_list.to_pickle("./data/simpsons_wiki_episodes.pkl")