This notebook loops through each season of The Simpsons on IMDb and pulls the basic episode information shown on each season page.

To keep the number of requests made to IMDb reasonable, the loop only pulls data from the season tables and does not drill down into each individual episode page (even though those pages offer more data).

In [1]:
# Simpsons IMDb link
# https://www.imdb.com/title/tt0096697/episodes?season=1

from requests import get
from bs4 import BeautifulSoup
import pandas as pd

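The fields found on the webpage correspond to the descriptions given below. Note that the selectors listed in the next cell are marked as outdated and are kept for reference only.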

In [1]:
# The selectors below are outdated and are kept for reference only

#episode_containers[0].a['title'] # Title
#episode_containers[0].meta['content'] # Episode Number
#episode_containers[0].find('div', class_='airdate').text.strip() # Air Date
#episode_containers[0].find('span', class_='ipl-rating-star__rating').text # Episode Rating
#episode_containers[0].find('span', class_='ipl-rating-star__total-votes').text # Total No. Of Votes
#episode_containers[0].find('div', class_='item_description').text.strip() # Episode Description

# Defining Functions

In [3]:
def get_season_url(season):
    """Build the IMDb episode-list URL for one season of The Simpsons.

    Args:
        season: Season identifier; it is converted with ``str()`` and
            appended to the ``season=`` query parameter.

    Returns:
        The season URL as a string.
    """
    base_url = 'https://www.imdb.com/title/tt0096697/episodes?season='
    return f'{base_url}{season!s}'

In [34]:
def extract_season_data(season_url):
    """Download a season page and return one 'episode container' per episode row.

    Args:
        season_url: Full URL of the season's episode list.

    Returns:
        The ``find_all`` result holding every ``<article>`` element that
        carries the ``episode-item-wrapper`` class.

    Raises:
        requests.exceptions.RequestException: if the request times out, the
            connection fails, or the server answers with a 4xx/5xx status.
    """
    # Request the page with a browser-like User-Agent. The timeout stops a
    # stalled connection from hanging the whole scrape indefinitely.
    response = get(season_url, headers={'User-Agent':'Mozilla/5.0'}, timeout=30)

    # Fail loudly on HTTP errors instead of parsing an error page, which would
    # otherwise look like a season with zero episodes.
    response.raise_for_status()

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the episode containers from the season's page
    episode_containers = page_html.find_all('article', class_='episode-item-wrapper')

    return episode_containers

In [133]:
def extract_ep_data_from_container(episode_container, season):
    """Extract the individual field data from the 'episode container' of a single episode.

    Every field is best-effort: when the element is missing, or its text does
    not have the expected shape, the field is returned as an empty string.

    Args:
        episode_container: Object exposing ``find(tag, class_=...)`` whose
            results expose ``.text`` (the items returned by
            ``extract_season_data``).
        season: Season identifier, copied unchanged into the result.

    Returns:
        ``[season, episode_number, title, airdate, rating, total_votes, desc]``
        where everything except ``season`` is a whitespace-stripped string.
    """

    def _text_of(tag, css_class):
        """Return the stripped text of the first matching element, or '' if absent."""
        try:
            return episode_container.find(tag, class_=css_class).text.strip()
        except AttributeError:
            # find() returned None: the element is not present in this row.
            return ''

    # The parsing assumes a heading shaped like "S1.E1 ∙ Title": the part before
    # '∙' holds the season/episode code and the part after it holds the title.
    heading = _text_of('div', 'ipc-title__text')
    code, divider, raw_title = heading.partition('∙')
    _, dot, after_dot = code.partition('.')
    if divider and dot:
        # Drop the single character following the '.' (the 'E' in the assumed
        # format), then strip the whitespace that surrounds the divider.
        episode_number = after_dot[1:].strip()
        title = raw_title.strip()
    else:
        episode_number = ''
        title = ''

    airdate = _text_of('span', 'jAfkDE')

    # Keep only what precedes the '/' (e.g. "8.1" from "8.1/10 ...").
    rating, slash, _ = _text_of('span', 'ipc-rating-star').partition('/')
    if not slash:
        rating = ''

    # The vote count is wrapped in parentheses on the page; remove them.
    total_votes = _text_of('span', 'ipc-rating-star--voteCount')
    total_votes = total_votes.replace('(', '').replace(')', '').strip()

    desc = _text_of('div', 'ipc-html-content-inner-div')

    # Compiling the episode info
    return [season, episode_number, title, airdate, rating, total_votes, desc]

In [113]:
def convert_ep_array_to_dataframe(episodes):
    """Turn the list of per-episode rows into a pandas DataFrame.

    Args:
        episodes: Sequence of seven-item rows, ordered as the column names
            below.

    Returns:
        A DataFrame with the columns season, episode_no, title, airdate,
        rating, total_votes and desc.
    """
    column_names = [
        'season',
        'episode_no',
        'title',
        'airdate',
        'rating',
        'total_votes',
        'desc',
    ]
    return pd.DataFrame(episodes, columns=column_names)

In [134]:
def multiply_votes(vote_str):
    """Convert the total_votes extracted from IMDB into full numbers.

    Expands the abbreviated suffixes ('K' for thousands, 'M' for millions) and
    tolerates surrounding whitespace and thousands separators.

    Args:
        vote_str: Vote count text such as '613', '8.4K' or '1.2M'. ``None`` or
            an empty/blank string means no vote count was found.

    Returns:
        The vote count as an int; 0 when no vote count was found.

    Raises:
        ValueError: if the text is non-empty but is not a number.
    """
    text = (vote_str or '').strip().upper().replace(',', '')

    # The extractor falls back to '' when a row has no vote count; treat that
    # as zero votes rather than crashing on float('').
    if not text:
        return 0

    multipliers = {'K': 1_000, 'M': 1_000_000}
    factor = multipliers.get(text[-1], 1)
    if factor != 1:
        text = text[:-1]

    # round() guards against binary floating-point products that land just
    # below the intended integer, which int() alone would truncate.
    return int(round(float(text) * factor))

In [142]:
def gather_imdb_data(max_season = 33):
    """Scrape IMDb for episodes of The Simpsons and return them as one DataFrame.

    Args:
        max_season: Last season to fetch; seasons 1 through ``max_season``
            (inclusive) are requested, one page per season.

    Returns:
        The DataFrame built by ``convert_ep_array_to_dataframe`` with its
        'total_votes' column passed through ``multiply_votes``.
    """
    # One row per episode: seasons are visited in order, and within each
    # season the containers are processed in page order.
    episode_rows = [
        extract_ep_data_from_container(container, season_no)
        for season_no in range(1, max_season + 1)
        for container in extract_season_data(get_season_url(season_no))
    ]

    episodes_frame = convert_ep_array_to_dataframe(episode_rows)

    # Expand abbreviated vote counts into plain integers.
    expanded_votes = episodes_frame['total_votes'].apply(multiply_votes)
    episodes_frame['total_votes'] = expanded_votes

    return episodes_frame

# Gather Data

In [137]:
# Run the full scrape with the default max_season and keep the resulting frame
simpsons_episodes = gather_imdb_data()

In [138]:
# Preview the first rows of the scraped data
simpsons_episodes.head()

Unnamed: 0,season,episode_no,title,airdate,rating,total_votes,desc
0,1,1,Simpsons Roasting on an Open Fire,"Sun, Sep 2, 1990",8.1,8400,Homer is forced to become a department store S...
1,1,2,Bart the Genius,"Sun, Jan 14, 1990",7.6,5700,Bart ends up at a school for gifted children a...
2,1,3,Homer's Odyssey,"Sun, Jan 21, 1990",7.3,5000,"After losing his job, Homer contemplates endin..."
3,1,4,There's No Disgrace Like Home,"Sun, Jan 28, 1990",7.6,4900,After being embarrassed by the rest of the fam...
4,1,5,Bart the General,"Sun, Feb 4, 1990",7.9,5300,After being beaten up by Nelson Muntz one too ...


In [139]:
# Persist the scraped frame as a pickle file at the relative path below
simpsons_episodes.to_pickle("./data/simpsons_episodes.pkl")

In [140]:
# Report how many episode rows were scraped
print('Data was extracted for {} episodes.'.format(len(simpsons_episodes)))

Data was extracted for 727 episodes.


In [141]:
# Display the frame; the rendered output elides the middle rows
simpsons_episodes

Unnamed: 0,season,episode_no,title,airdate,rating,total_votes,desc
0,1,1,Simpsons Roasting on an Open Fire,"Sun, Sep 2, 1990",8.1,8400,Homer is forced to become a department store S...
1,1,2,Bart the Genius,"Sun, Jan 14, 1990",7.6,5700,Bart ends up at a school for gifted children a...
2,1,3,Homer's Odyssey,"Sun, Jan 21, 1990",7.3,5000,"After losing his job, Homer contemplates endin..."
3,1,4,There's No Disgrace Like Home,"Sun, Jan 28, 1990",7.6,4900,After being embarrassed by the rest of the fam...
4,1,5,Bart the General,"Sun, Feb 4, 1990",7.9,5300,After being beaten up by Nelson Muntz one too ...
...,...,...,...,...,...,...,...
722,33,18,My Octopus and a Teacher,"Sun, May 8, 2022",6.6,613,Bart has romantic feelings for his new teacher...
723,33,19,Girls Just Shauna Have Fun,"Sun, May 15, 2022",6.6,592,Lisa finds an unlikely mentor in Shauna Chalme...
724,33,20,Marge the Meanie,"Fri, May 27, 2022",6.7,564,Marge bonds with Bart when she discovers a sec...
725,33,21,Meat Is Murder,"Fri, Jun 3, 2022",6.0,616,Grampa reconnects with an old associate in the...
