This notebook loops through each season of The Simpsons on IMDb and pulls the basic episode information shown in each season's episode table.

To keep the number of requests to IMDb reasonable, the loop only pulls data from the season tables and does not drill down into the individual episode pages (even though those offer more data).

In [1]:
# Simpsons IMDb link
# https://www.imdb.com/title/tt0096697/episodes?season=1

from requests import get
from bs4 import BeautifulSoup
import pandas as pd

The fields scraped from each season page, and the expression used to access each one, are listed below.

In [2]:
#episode_containers[0].a['title'] # Title
#episode_containers[0].meta['content'] # Episode Number
#episode_containers[0].find('div', class_='airdate').text.strip() # Air Date
#episode_containers[0].find('span', class_='ipl-rating-star__rating').text # Episode Rating
#episode_containers[0].find('span', class_='ipl-rating-star__total-votes').text # Total No. Of Votes
#episode_containers[0].find('div', class_='item_description').text.strip() # Episode Description

# Defining Functions

In [3]:
def get_season_url(season):
    """Build the IMDb episode-list URL for one season of The Simpsons.

    `season` is converted with `str()` and appended to the show's
    `episodes?season=` URL, so an int or a numeric string both work.
    """
    return ''.join(('https://www.imdb.com/title/tt0096697/episodes?season=', str(season)))

In [4]:
def extract_season_data(season_url):
    """Download a season page and return its 'episode containers'.

    Args:
        season_url: URL of the season page to fetch.

    Returns:
        The result of `find_all('div', class_='info')` on the parsed page.
        The rest of the notebook treats each element as one episode's row
        in the season table.

    Raises:
        requests.exceptions.HTTPError: the server answered with a 4xx/5xx status.
        requests.exceptions.Timeout: the server did not respond within the timeout.
    """
    # requests has no default timeout; without one a stalled connection
    # would block the whole scrape indefinitely.
    response = get(season_url, timeout=30)

    # Fail loudly on an error response instead of parsing the error page
    # and silently reporting zero episodes for the season.
    response.raise_for_status()

    # Parse the content of the response with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the episode containers from the season's page
    return page_html.find_all('div', class_='info')

In [5]:
def extract_ep_data_from_container(episode_container, season):
    """Extract the individual field data from the 'episode container' of a single episode.

    Each field is looked up independently; a field that cannot be found in
    the container is recorded as an empty string so one missing value does
    not discard the rest of the episode.

    Args:
        episode_container: object exposing `.meta`, `.a` and `.find(...)` for
            one episode (as produced by `extract_season_data`).
        season: season identifier, copied unchanged into the result.

    Returns:
        list: `[season, episode_number, title, airdate, rating, total_votes, desc]`.
    """
    # Errors raised when an element or attribute is absent:
    #   AttributeError - `.text` on the None returned by a failed `find`
    #   TypeError      - subscripting a missing tag (None['title'])
    #   KeyError       - tag present but the attribute is not
    # Anything else (e.g. KeyboardInterrupt) is deliberately NOT swallowed.
    missing_field_errors = (AttributeError, KeyError, TypeError)

    def field_or_blank(getter):
        """Return getter(), or '' when the field is missing from the container."""
        try:
            return getter()
        except missing_field_errors:
            return ''

    episode_number = field_or_blank(lambda: episode_container.meta['content'])
    title = field_or_blank(lambda: episode_container.a['title'])
    airdate = field_or_blank(
        lambda: episode_container.find('div', class_='airdate').text.strip())
    rating = field_or_blank(
        lambda: episode_container.find('span', class_='ipl-rating-star__rating').text)
    # The vote count is wrapped in parentheses on the page; drop them.
    total_votes = field_or_blank(
        lambda: episode_container.find('span', class_='ipl-rating-star__total-votes').text.strip("()"))
    desc = field_or_blank(
        lambda: episode_container.find('div', class_='item_description').text.strip())

    # Compiling the episode info
    return [season, episode_number, title, airdate, rating, total_votes, desc]

In [6]:
def convert_ep_array_to_dataframe(episodes):
    """Build a pandas DataFrame from the list of per-episode rows.

    Each row is expected to hold seven values, in the order of the column
    names below.
    """
    column_names = ('season', 'episode_no', 'title',
                    'airdate', 'rating', 'total_votes', 'desc')
    return pd.DataFrame(episodes, columns=list(column_names))

In [7]:
def gather_imdb_data(max_season=31):
    """Gather data from IMDb for episodes of the simpsons.

    One season page is requested per season, from season 1 up to and
    including `max_season`.

    Args:
        max_season: last season number to scrape (inclusive). Defaults to 31,
            the value this notebook was originally run with; lower it for a
            quick partial run or raise it as new seasons are listed.

    Returns:
        pandas.DataFrame: one row per episode, built by
        `convert_ep_array_to_dataframe`.
    """
    # Plain list of per-episode rows; converted to a DataFrame once at the
    # end rather than growing a DataFrame inside the loop.
    ep_array = []

    # For every season in the series -- the range depends on the show
    for sn in range(1, max_season + 1):
        episode_containers = extract_season_data(get_season_url(sn))

        # For each episode in the season, append its row to the dataset
        ep_array.extend(
            extract_ep_data_from_container(episode, sn)
            for episode in episode_containers
        )

    return convert_ep_array_to_dataframe(ep_array)

# Gather Data

In [8]:
# Run the scrape. gather_imdb_data() requests one IMDb season page per season,
# so this cell needs network access.
simpsons_episodes = gather_imdb_data()

In [9]:
# Preview the first rows of the scraped DataFrame.
simpsons_episodes.head()

Unnamed: 0,season,episode_no,title,airdate,rating,total_votes,desc
0,1,1,Simpsons Roasting on an Open Fire,2 Sep. 1990,8.2,5966,The family is forced to spend all of their sav...
1,1,2,Bart the Genius,14 Jan. 1990,7.7,3834,Bart ends up at a school for gifted children a...
2,1,3,Homer's Odyssey,21 Jan. 1990,7.4,3371,"After losing his job, Homer contemplates endin..."
3,1,4,There's No Disgrace Like Home,28 Jan. 1990,7.7,3301,After being embarrassed by the rest of the fam...
4,1,5,Bart the General,4 Feb. 1990,8.0,3353,After being beaten up by Nelson Muntz one too ...


In [10]:
# Save the scraped DataFrame to a pickle file in the current working directory.
# NOTE: pickle files can execute code when loaded; only read back files you trust.
simpsons_episodes.to_pickle("./simpsons_episodes.pkl")

In [14]:
# Report how many episode rows were scraped.
print('Data was extracted for {} episodes.'.format(len(simpsons_episodes)))

Data was extracted for 683 episodes.


In [12]:
# Display the scraped episode table.
simpsons_episodes

Unnamed: 0,season,episode_no,title,airdate,rating,total_votes,desc
0,1,1,Simpsons Roasting on an Open Fire,2 Sep. 1990,8.2,5966,The family is forced to spend all of their sav...
1,1,2,Bart the Genius,14 Jan. 1990,7.7,3834,Bart ends up at a school for gifted children a...
2,1,3,Homer's Odyssey,21 Jan. 1990,7.4,3371,"After losing his job, Homer contemplates endin..."
3,1,4,There's No Disgrace Like Home,28 Jan. 1990,7.7,3301,After being embarrassed by the rest of the fam...
4,1,5,Bart the General,4 Feb. 1990,8.0,3353,After being beaten up by Nelson Muntz one too ...
5,1,6,Moaning Lisa,11 Feb. 1990,7.6,3246,A depressed Lisa's spirit is lifted when she m...
6,1,7,The Call of the Simpsons,2 Sep. 1990,7.8,3123,"Homer takes the family camping, but it soon be..."
7,1,8,The Telltale Head,25 Feb. 1990,7.7,3023,Bart gets more than he bargained for when he s...
8,1,9,Life on the Fast Lane,18 Mar. 1990,7.5,3014,Marge contemplates an affair with a handsome b...
9,1,10,Homer's Night Out,25 Mar. 1990,7.4,2904,After a photograph of Homer canoodling with an...
