In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from bs4 import BeautifulSoup
import requests
from datetime import date

In [2]:
# Wikipedia page for season 33, used below to prototype the table parsing.
wiki_url = "https://en.wikipedia.org/wiki/The_Simpsons_(season_33)"

In [3]:
# IMDb episode listing for season 33, used below to prototype the rating/description parsing.
imdb_url = "https://www.imdb.com/title/tt0096697/episodes?season=33"

# Wikipedia

In [4]:
# Download the season page. The explicit timeout (seconds) keeps a stalled
# connection from blocking the kernel forever; requests has no default timeout.
req = requests.get(wiki_url, timeout=30)
content = req.text

In [5]:
# Name the parser explicitly: with no argument BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the parsed tree can differ
# from one machine to the next. 'html.parser' ships with Python.
soup = BeautifulSoup(content, 'html.parser')

In [6]:
# Every <tr> whose CSS class is "vevent" (a string second argument filters by class).
rows = soup.find_all('tr', 'vevent')

In [7]:
# Display the second matched row to inspect its raw markup.
rows[1]

<tr class="vevent" style="text-align:center;background:#F2F2F2"><th id="ep708" rowspan="1" scope="row" style="text-align:center">708</th><td style="text-align:center">2</td><td class="summary" style="text-align:left">"<a href="/wiki/Bart%27s_in_Jail!" title="Bart's in Jail!">Bart's in Jail!</a>"</td><td style="text-align:center"><a href="/wiki/Steven_Dean_Moore" title="Steven Dean Moore">Steven Dean Moore</a></td><td style="text-align:center">Nick Dahan</td><td style="text-align:center">October 3, 2021<span style="display:none"> (<span class="bday dtstart published updated">2021-10-03</span>)</span></td><td id="pcQABF18" style="text-align:center">QABF18</td><td style="text-align:center">1.48<sup class="reference" id="cite_ref-33.02_14-0"><a href="#cite_note-33.02-14">[14]</a></sup></td></tr>

In [8]:
# Text of the first <th> cell in the first matched row.
rows[0].findAll('th')[0].text

'707'

In [9]:
# All <td> cells of the tenth matched row, kept for the experiments below.
data = rows[9].find_all('td')

In [10]:
# Show the extracted cells so their positions can be read off.
data

[<td style="text-align:center">11</td>,
 <td class="summary" style="text-align:left">"The Longest Marge"</td>,
 <td style="text-align:center">Matthew Nastuk</td>,
 <td style="text-align:center"><a href="/wiki/Brian_Kelley_(writer)" title="Brian Kelley (writer)">Brian Kelley</a></td>,
 <td style="text-align:center">January 2, 2022<span style="display:none"> (<span class="bday dtstart published updated">2022-01-02</span>)</span></td>,
 <td id="pcUABF05" style="text-align:center">UABF05</td>,
 <td style="text-align:center">2.02<sup class="reference" id="cite_ref-33.11_23-0"><a href="#cite_note-33.11-23">[23]</a></sup></td>]

In [11]:
# Keep only the part of the seventh cell's text that precedes the first "["
# (i.e. drop the trailing footnote marker).
data[6].text.partition('[')[0]

'2.02'

In [12]:
# str.split() takes a literal separator, not a regular expression, so the
# bracket needs no backslash. ('\[' is an invalid escape sequence and would be
# treated as the two-character separator backslash + "[".)
"2.02".split('[')

['2.02']

In [13]:
# Text of the second <span> inside the fifth cell.
data[4].find_all('span')[1].text

'2022-01-02'

# IMDb

In [14]:
# Download the IMDb season listing. The explicit timeout (seconds) keeps a
# stalled connection from blocking the kernel forever.
imdb_req = requests.get(imdb_url, timeout=30)
imdb_content = imdb_req.text

In [15]:
# Name the parser explicitly so the parsed tree does not depend on which
# optional parsers are installed (and to avoid BeautifulSoup's parser warning).
imdb_soup = BeautifulSoup(imdb_content, 'html.parser')

In [16]:
# Every <div> whose CSS class is "item_description".
descriptions = imdb_soup.find_all('div', 'item_description')

In [17]:
# Second newline-separated piece of the first description block's text.
descriptions[0].text.split('\n')[1]

'Marge stages a revival of a musical from high school, but her pleasant memories are threatened by the return of her old rival.    '

In [18]:
# Every <div> whose class attribute is "ipl-rating-star small".
# Note: this rebinds `rows`, which until now held the Wikipedia table rows.
rows = imdb_soup.find_all('div', 'ipl-rating-star small')

In [19]:
# How many rating blocks were matched on the page.
len(rows)

22

In [20]:
# Text of the first "ipl-rating-star__rating" <span> inside the second rating block.
rows[1].find_all('span', 'ipl-rating-star__rating')[0].text

'7.0'

In [21]:
# A string whose value itself begins and ends with a double quote.
'"hello"'

'"hello"'

In [22]:
# strip() with an argument removes that character from both ends of the string.
'"hello"'.strip('"')

'hello'

# Rotten Tomatoes

In [23]:
# Rotten Tomatoes page for season 2, used below to prototype the score parsing.
rt_url = "https://www.rottentomatoes.com/tv/the_simpsons/s02"

In [24]:
# Download the Rotten Tomatoes season page (this reuses the name `req`).
# The explicit timeout (seconds) keeps a stalled connection from blocking
# the kernel forever.
req = requests.get(rt_url, timeout=30)
rt_content = req.text

In [25]:
# Name the parser explicitly so the parsed tree does not depend on which
# optional parsers are installed (and to avoid BeautifulSoup's parser warning).
rt_soup = BeautifulSoup(rt_content, 'html.parser')

In [26]:
# Every <span> whose CSS class is "mop-ratings-wrap__percentage".
ratings = rt_soup.find_all('span', 'mop-ratings-wrap__percentage')

In [27]:
# Whitespace-trimmed text of the first percentage element.
ratings[0].text.strip()

'100%'

In [28]:
# Critic ratings: <small> elements with class "mop-ratings-wrap__text--small".
num_ratings = rt_soup.find_all('small', 'mop-ratings-wrap__text--small')

In [29]:
# Whitespace-trimmed text of the first matched <small> element.
num_ratings[0].text.strip()

'8'

In [30]:
# User ratings: <strong> elements with class "mop-ratings-wrap__text--small".
# This rebinds `num_ratings`, replacing the <small> matches held before.
num_ratings = rt_soup.find_all('strong', 'mop-ratings-wrap__text--small')

In [31]:
# From the second matched element, take the text after the first ":" and trim it.
num_ratings[1].text.strip().split(":")[1].strip()

'257'

# Data Construction

In [32]:
# Start from an empty container. At this point `episode_data` is a plain
# Python list, not a DataFrame.
episode_data = list()

In [33]:
# Scrape one record per episode for every season, combining the Wikipedia
# season table with the IMDb season listing, and build `episode_data`.

def _wiki_episode_fields(row):
    """Extract the Wikipedia columns from one ``<tr class="vevent">`` row.

    Cells are read by position: the first ``<th>`` is the number in series and
    the ``<td>`` cells are, in order, number in season, title, director,
    writer, air date, production code and US viewers. Text after the first
    "[" (a footnote marker) is dropped for the fields that the table commonly
    annotates.

    Returns a dict containing only the fields the row actually provides, so a
    row with fewer cells than expected yields missing values instead of
    raising IndexError.
    """
    headers = row.find_all('th')
    cells = row.find_all('td')

    def text_at(tags, position):
        # Raw text of the tag at `position`, or None when the row is too short.
        return tags[position].text if position < len(tags) else None

    def without_refs(text):
        # Drop a trailing footnote marker such as "[23]"; pass None through.
        return None if text is None else text.split('[')[0]

    title = text_at(cells, 1)
    fields = {
        'number_in_series': without_refs(text_at(headers, 0)),
        'number_in_season': text_at(cells, 0),
        # Titles are wrapped in literal double quotes on the page.
        'title': None if title is None else title.strip("\""),
        'directed_by': without_refs(text_at(cells, 2)),
        'written_by': without_refs(text_at(cells, 3)),
        'production_code': without_refs(text_at(cells, 5)),
        'us_viewers_in_millions': without_refs(text_at(cells, 6)),
    }

    # The ISO-formatted air date lives in the second <span> of the date cell;
    # rows without it simply get no 'original_air_date' entry.
    if len(cells) > 4:
        date_spans = cells[4].find_all('span')
        if len(date_spans) > 1:
            fields['original_air_date'] = date_spans[1].text

    return {key: value for key, value in fields.items() if value is not None}


seasons = list(range(1, 35))

# Rows are gathered in a list that this cell creates itself. The cell ends by
# binding `episode_data` to a DataFrame, so appending to `episode_data`
# directly would only work the first time the cell is run.
episode_rows = []

for season in seasons:
    print(season)
    wiki = f'https://en.wikipedia.org/wiki/The_Simpsons_(season_{season})'
    imdb = f'https://www.imdb.com/title/tt0096697/episodes?season={season}'

    #wiki
    # timeout (seconds): requests has no default and would otherwise wait forever.
    req = requests.get(wiki, timeout=30)
    content = req.text
    # Explicit parser so results do not depend on which parsers are installed.
    soup = BeautifulSoup(content, 'html.parser')
    rows = soup.find_all('tr', 'vevent')

    #imdb
    imdb_req = requests.get(imdb, timeout=30)
    imdb_content = imdb_req.text
    imdb_soup = BeautifulSoup(imdb_content, 'html.parser')
    imdb_rows = imdb_soup.find_all('div', 'ipl-rating-star small')
    descriptions = imdb_soup.find_all('div', 'item_description')

    # NOTE(review): IMDb entries are matched to Wikipedia rows purely by
    # position. This assumes both pages list the same episodes in the same
    # order and that an unrated episode still contributes a rating block;
    # confirm before trusting ratings for partially-aired seasons.
    for index, row in enumerate(rows):
        ep_row = {'season': season}
        ep_row.update(_wiki_episode_fields(row))

        if index < len(descriptions):
            description_lines = descriptions[index].text.split('\n')
            # The description text is on the second line of the block.
            if len(description_lines) > 1:
                ep_row['description'] = description_lines[1]

        #need imdb rating
        if index < len(imdb_rows):
            rating_spans = imdb_rows[index].find_all('span', 'ipl-rating-star__rating')
            if rating_spans:
                rating = rating_spans[0].text
                ep_row['imdb_rating'] = rating

        episode_rows.append(ep_row)

episode_data = pd.DataFrame(
    episode_rows,
    columns=['title', 'description', 'original_air_date', 'production_code',
             'directed_by', 'written_by', 'season', 'number_in_season',
             'number_in_series', 'us_viewers_in_millions', 'imdb_rating'],
)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


In [34]:
# Scrape one Rotten Tomatoes record per season (critic/user score and the
# number of ratings behind each) and build `rotten_df`.
# Relies on `seasons` from the episode-scraping cell.
rotten_data = []
for season in seasons:
    print(season)
    # Season pages use a two-digit, zero-padded suffix (s01, s02, ..., s10, ...).
    rt_url = f'https://www.rottentomatoes.com/tv/the_simpsons/s{season:02d}'

    #Rotten tomatoes
    # timeout (seconds): requests has no default and would otherwise wait forever.
    rt_req = requests.get(rt_url, timeout=30)
    rt_content = rt_req.text
    # Explicit parser so results do not depend on which parsers are installed.
    rt_soup = BeautifulSoup(rt_content, 'html.parser')
    scores = rt_soup.find_all('span', 'mop-ratings-wrap__percentage')
    num_critic_ratings = rt_soup.find_all('small', 'mop-ratings-wrap__text--small')
    num_user_ratings = rt_soup.find_all('strong', 'mop-ratings-wrap__text--small')

    # Start every field as missing; each is filled in only when the page
    # actually contains the element it comes from, so a page with a different
    # layout produces NaNs instead of an IndexError that aborts the whole run.
    ep_row = {
        'season': season,
        'rt_critic_score': np.nan,
        'rt_user_score': np.nan,
        'rt_critic_count': np.nan,
        'rt_user_count': np.nan,
    }

    if len(scores) == 1:
        #no critic score: the single percentage is taken to be the user score
        ep_row['rt_user_score'] = scores[0].text.strip()
    elif len(scores) == 2:
        ep_row['rt_critic_score'] = scores[0].text.strip()
        ep_row['rt_user_score'] = scores[1].text.strip()

    if num_critic_ratings:
        ep_row['rt_critic_count'] = num_critic_ratings[0].text.strip()

    if len(num_user_ratings) > 1:
        # The second <strong> reads "<label>: <count>"; keep the part after the colon.
        user_count_parts = num_user_ratings[1].text.strip().split(":")
        if len(user_count_parts) > 1:
            ep_row['rt_user_count'] = user_count_parts[1].strip()

    rotten_data.append(ep_row)

rotten_df = pd.DataFrame(
    rotten_data,
    columns=['season', 'rt_critic_score', 'rt_user_score', 'rt_critic_count', 'rt_user_count'],
)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34


In [35]:
# Preview the first rows of the per-season Rotten Tomatoes table.
rotten_df.head()

Unnamed: 0,season,rt_critic_score,rt_user_score,rt_critic_count,rt_user_count
0,1,100%,87%,18,366
1,2,100%,93%,8,257
2,3,100%,94%,5,242
3,4,100%,94%,11,234
4,5,100%,95%,5,233


In [36]:
# Preview the last rows of the per-season Rotten Tomatoes table.
rotten_df.tail()

Unnamed: 0,season,rt_critic_score,rt_user_score,rt_critic_count,rt_user_count
29,30,,51%,4,79
30,31,80%,54%,5,52
31,32,,49%,1,47
32,33,,59%,2,41
33,34,,76%,0,21


In [37]:
# Save the per-season table; index=False leaves the row index out of the file.
rotten_df.to_csv('simpsons_rt_scores.csv', index=False)

In [38]:
#write data
# Name the index "id" (in place); the name appears as the index header when
# the frame is displayed or exported together with its index.
episode_data.index.name = 'id'

In [39]:
# Preview the first rows of the per-episode table.
episode_data.head()

Unnamed: 0_level_0,title,description,original_air_date,production_code,directed_by,written_by,season,number_in_season,number_in_series,us_viewers_in_millions,imdb_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Simpsons Roasting on an Open Fire,Homer is forced to become a department store S...,1989-12-17,7G08,David Silverman,Mimi Pond,1,1,1,26.7,8.1
1,Bart the Genius,Bart ends up at a school for gifted children a...,1990-01-14,7G02,David Silverman,Jon Vitti,1,2,2,24.5,7.7
2,Homer's Odyssey,"After losing his job, Homer contemplates endin...",1990-01-21,7G03,Wes Archer,Jay Kogen & Wallace Wolodarsky,1,3,3,27.5,7.3
3,There's No Disgrace Like Home,After being embarrassed by the rest of the fam...,1990-01-28,7G04,Gregg Vanzo & Kent Butterworth,Al Jean & Mike Reiss,1,4,4,20.2,7.7
4,Bart the General,After being beaten up by Nelson Muntz one too ...,1990-02-04,7G05,David Silverman,John Swartzwelder,1,5,5,27.1,7.9


In [40]:
# Preview the last rows of the per-episode table.
episode_data.tail()

Unnamed: 0_level_0,title,description,original_air_date,production_code,directed_by,written_by,season,number_in_season,number_in_series,us_viewers_in_millions,imdb_rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
742,Fan-ily Feud,Know what this is about?,2023-03-26,OABF11,Timothy Bailey,Broti Gupta,34,18,746,TBD,
743,Write Off This Episode,Know what this is about?,2023-04-23,OABF12,TBA,J. Stewart Burns,34,19,747,TBD,
744,The Very Hungry Caterpillars,Know what this is about?,2023-04-30,OABF14,TBA,Brian Kelley,34,20,748,TBD,
745,Clown V. Board of Education,Know what this is about?,2023-05-07,OABF15,Lance Kramer,Jeff Westbrook,34,21,749,TBD,
746,Homer's Adventure Through the Windshield Glass,Know what this is about?,2023-05-14,OABF13,TBA,Tim Long,34,22,750,TBD,


In [41]:
# Save the per-episode table; the index is written as well (index=False is not passed).
episode_data.to_csv('simpsons_episodes.csv')