# Scrape IMDb Ratings for The Walking Dead

In [1]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup

## Extract the unique seasons from the dropdown
Find out how many seasons there are by reading the season numbers from the season dropdown on the episodes page.

In [2]:
# Set to True to write the scraped ratings to a CSV file in the last cell.
SAVE = False
# IMDb episodes page for The Walking Dead (title id tt1520211); requested as-is
# for the season list and with a season query for each season's episodes.
URL = 'https://www.imdb.com/title/tt1520211/episodes'

In [3]:
# Fetch the episodes landing page. The timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops us parsing an HTTP error page.
response = requests.get(URL, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# The season picker is the <select id="bySeason"> element; each <option> label
# is expected to be a season number.
season_select = soup.find("select", {"id": "bySeason"})
if season_select is None:
    raise RuntimeError(f'Season dropdown (select#bySeason) not found at {URL}')

# Keep only numeric labels: int() would raise on any non-numeric option.
option_labels = (option.text.strip() for option in season_select.find_all('option'))
seasons = [int(label) for label in option_labels if label.isdecimal()]
print(seasons)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]


## Get ratings for each episode in each season

In [4]:
# Captures the text that follows the opening parenthesis of a votes label such
# as "(24,869)". Raw string so that "\(" is a regex escape, not a string escape.
_VOTES_PATTERN = re.compile(r'\(([^)]+)')


def extract_episode_info(episode):
    """Extract episode number, air date, title, rating and vote count.

    Args:
        episode: Parsed HTML element for one episode list item. It must offer
            a BeautifulSoup-style ``find``; the fields are read from its
            ``div.info`` child.

    Returns:
        A dict with the keys ``episode`` (int), ``airdate`` (str, ``'TBA'``
        when the air-date element is absent), ``title`` (str), ``rating``
        (float, or ``None`` when the rating element is absent) and ``sample``
        (int number of votes, or ``None`` when the votes element is absent or
        its text has no parenthesised count).

    Raises:
        AttributeError: If the info container, episode number or title
            element is missing from ``episode``.
    """
    epinfo = episode.find('div', class_='info')
    number = int(epinfo.find(itemprop="episodeNumber").get('content'))
    title = epinfo.find('a', itemprop='name').text

    airdate_tag = epinfo.find('div', class_='airdate')
    airdate = airdate_tag.text.strip() if airdate_tag is not None else 'TBA'

    # The rating elements may be missing (presumably for episodes that have
    # not been rated yet), in which case rating and sample stay None.
    rating_tag = epinfo.find('span', class_='ipl-rating-star__rating')
    rating = float(rating_tag.text) if rating_tag is not None else None

    total_votes = None
    votes_tag = epinfo.find('span', class_='ipl-rating-star__total-votes')
    if votes_tag is not None:
        match = _VOTES_PATTERN.search(votes_tag.text)
        # Guard the match: a label without "(...)" must not crash the scrape.
        if match is not None:
            total_votes = int(match.group(1).replace(',', ''))

    return {
        'episode': number,
        'airdate': airdate,
        'title': title,
        'rating': rating,
        'sample': total_votes,
    }

Loop over each season and store all episode information in a single pandas DataFrame.

In [5]:
frames = []

for season in seasons:
    # Let requests build the ?season=N query string. The timeout and
    # raise_for_status() make network or HTTP failures surface here instead
    # of as a confusing parse error further down.
    response = requests.get(URL, params={'season': season}, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    container = soup.find("div", class_='list detail eplist')  # parent container of episode list
    if container is None:
        raise RuntimeError(f'Episode list container not found for season {season}')
    eplist = container.find_all('div', class_='list_item')  # list of episodes

    # Tag each parsed episode with its season; 'season' is added last so the
    # column order matches the dict returned by extract_episode_info.
    episode_info = [{**extract_episode_info(ep), 'season': season} for ep in eplist]

    frames.append(pd.DataFrame(episode_info))

# Frames are concatenated as-is, so each season keeps its own 0-based row index.
df = pd.concat(frames).sort_values(by=['season', 'episode'])
df.insert(0, 'season', df.pop('season'))  # move season to first column

In [6]:
# Preview the first ten rows of the combined episode table.
df.head(10)

Unnamed: 0,season,episode,airdate,title,rating,sample
0,1,1,5 Nov. 2010,Days Gone Bye,9.2,24869.0
1,1,2,12 Nov. 2010,Guts,8.6,15699.0
2,1,3,19 Nov. 2010,Tell It to the Frogs,8.2,14385.0
3,1,4,26 Nov. 2010,Vatos,8.5,14065.0
4,1,5,3 Dec. 2010,Wildfire,8.1,13507.0
5,1,6,10 Dec. 2010,TS-19,8.6,14718.0
0,2,1,21 Oct. 2011,What Lies Ahead,8.5,13275.0
1,2,2,24 Oct. 2011,Bloodletting,7.9,12008.0
2,2,3,4 Nov. 2011,Save the Last One,8.3,12010.0
3,2,4,13 Nov. 2011,Cherokee Rose,7.5,11659.0


In [7]:
# Write the table to CSV only when SAVE is enabled; index=False leaves the
# row index out of the file.
if SAVE:
    df.to_csv('twd_episode_ratings.csv', index=False)