In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import date

In [None]:
# Wikipedia page for season 33, used by the exploratory cells in the Wikipedia section.
wiki_url = 'https://en.wikipedia.org/wiki/The_Simpsons_(season_33)'

In [None]:
# IMDb episode list for season 33, used by the exploratory cells in the IMDB section.
imdb_url = 'https://www.imdb.com/title/tt0096697/episodes?season=33'

# Wikipedia

In [None]:
# Download the Wikipedia season page.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page.
req = requests.get(wiki_url, timeout=30)
req.raise_for_status()
content = req.text

In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the tree can differ between machines.
soup = BeautifulSoup(content, 'html.parser')

In [None]:
# Collect every <tr> carrying the 'vevent' class.
rows = soup.find_all('tr', class_='vevent')

In [None]:
# Inspect the second matched row.
rows[1]

In [None]:
# Text of the first <th> in the first matched row.
rows[0].find_all('th')[0].get_text()

In [None]:
# All <td> cells of the tenth matched row, kept for the cells that follow.
data = rows[9].find_all('td')

In [None]:
# Display the <td> cells collected into `data`.
data

In [None]:
# Seventh cell's text, cut off at the first '[' (the start of a footnote marker).
data[6].get_text().partition('[')[0]

In [None]:
# Check that splitting on '[' leaves a value without a footnote marker intact.
# The separator is a plain '[': str.split takes a literal string, not a regex,
# so no backslash is needed.
"2.02".split('[')

In [None]:
# Text of the second <span> inside the fifth cell.
data[4].find_all('span')[1].get_text()

# IMDB

In [None]:
# Download the IMDb season page.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page.
imdb_req = requests.get(imdb_url, timeout=30)
imdb_req.raise_for_status()
imdb_content = imdb_req.text

In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the tree can differ between machines.
imdb_soup = BeautifulSoup(imdb_content, 'html.parser')

In [None]:
# Every <div> with the 'item_description' class.
descriptions = imdb_soup.find_all('div', class_='item_description')

In [None]:
# Second newline-separated piece of the first description's text.
descriptions[0].get_text().split('\n')[1]

In [None]:
# Rating containers on the IMDb page; note this rebinds `rows`.
rows = imdb_soup.find_all('div', class_='ipl-rating-star small')

In [None]:
# Number of matched rating containers.
len(rows)

In [None]:
# Text of the first rating <span> inside the second rating container.
rows[1].find_all('span', class_='ipl-rating-star__rating')[0].get_text()

In [None]:
# Scratch check: a string literal whose value is wrapped in double quotes.
"\"hello\""

In [None]:
# Scratch check: strip('"') removes the leading and trailing double quotes.
"\"hello\"".strip("\"")

# Rotten Tomatoes

In [None]:
# Rotten Tomatoes page for season 2, used by the exploratory cells in this section.
rt_url = 'https://www.rottentomatoes.com/tv/the_simpsons/s02'

In [None]:
# Download the Rotten Tomatoes season page.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of letting the
# following cells parse an error page.
req = requests.get(rt_url, timeout=30)
req.raise_for_status()
rt_content = req.text

In [None]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so the tree can differ between machines.
rt_soup = BeautifulSoup(rt_content, 'html.parser')

In [None]:
# Every <span> with the percentage-score class.
ratings = rt_soup.find_all('span', class_='mop-ratings-wrap__percentage')

In [None]:
# First score's text with surrounding whitespace removed.
ratings[0].get_text().strip()

In [None]:
# Critic ratings: the <small> elements holding rating counts.
num_ratings = rt_soup.find_all('small', class_='mop-ratings-wrap__text--small')

In [None]:
# First matched element's text with surrounding whitespace removed.
num_ratings[0].get_text().strip()

In [None]:
# User ratings: the <strong> elements holding rating counts (rebinds `num_ratings`).
num_ratings = rt_soup.find_all('strong', class_='mop-ratings-wrap__text--small')

In [None]:
# Second matched element: keep the piece after the first ':' and trim whitespace.
num_ratings[1].get_text().strip().split(":")[1].strip()

# Data Construction

In [None]:
# Start with an empty list (not a DataFrame); the scraping cell below rebinds
# episode_data to a DataFrame once it finishes.
episode_data = []

In [None]:
# Scrape one record per episode for every season. Episode metadata comes from
# the season's Wikipedia page; the description and rating come from the
# season's IMDb episode list and are matched to the Wikipedia rows by position.
seasons = list(range(1, 35))


def _fetch_soup(url, timeout=30):
    """Download ``url`` and return it parsed with the stdlib HTML parser.

    Args:
        url: Page address to request.
        timeout: Seconds to wait on the server before ``requests`` gives up.

    Returns:
        BeautifulSoup tree of the response body. A non-success status is
        reported on stdout but the body is still parsed, so one bad page
        yields missing values rather than aborting the whole scrape.
    """
    response = requests.get(url, timeout=timeout)
    if not response.ok:
        print(f'WARNING: HTTP {response.status_code} for {url}')
    return BeautifulSoup(response.text, 'html.parser')


# Records are gathered in a list owned by this cell, so the cell can be re-run
# even after episode_data has become a DataFrame.
episode_rows = []

for season in seasons:
    print(season)
    wiki = f'https://en.wikipedia.org/wiki/The_Simpsons_(season_{season})'
    imdb = f'https://www.imdb.com/title/tt0096697/episodes?season={season}'

    # Wikipedia: each <tr class="vevent"> is handled as one episode.
    soup = _fetch_soup(wiki)
    rows = soup.find_all('tr', 'vevent')

    # IMDb: rating containers and descriptions, looked up by row position below.
    # NOTE(review): positional matching assumes both sites list the same
    # episodes in the same order — confirm for seasons with unrated episodes.
    imdb_soup = _fetch_soup(imdb)
    imdb_rows = imdb_soup.find_all('div', 'ipl-rating-star small')
    descriptions = imdb_soup.find_all('div', 'item_description')

    for index, row in enumerate(rows):
        data = row.find_all('td')

        ep_row = {}
        ep_row['season'] = season
        # split('[')[0] drops a trailing footnote marker such as "[a]".
        ep_row['number_in_series'] = row.find_all('th')[0].text.split('[')[0]
        ep_row['number_in_season'] = data[0].text
        ep_row['title'] = data[1].text.strip("\"")
        ep_row['directed_by'] = data[2].text.split('[')[0]
        ep_row['written_by'] = data[3].text.split('[')[0]

        # The air date is taken from the second <span> of the cell when present.
        air_date_spans = data[4].find_all('span')
        if len(air_date_spans) > 1:
            ep_row['original_air_date'] = air_date_spans[1].text

        ep_row['production_code'] = data[5].text.split("[")[0]

        # Only read the viewers figure when the row actually has a seventh cell.
        if len(data) > 6:
            ep_row['us_viewers_in_millions'] = data[6].text.split('[')[0]

        if index < len(descriptions):
            ep_row['description'] = descriptions[index].text.split('\n')[1]

        # IMDb rating, when a rating container exists at this position.
        if index < len(imdb_rows):
            rating_spans = imdb_rows[index].find_all('span', 'ipl-rating-star__rating')
            if rating_spans:
                ep_row['imdb_rating'] = rating_spans[0].text

        episode_rows.append(ep_row)

# Keys absent from a record become NaN in the corresponding column.
episode_data = pd.DataFrame(
    episode_rows,
    columns=['title', 'description', 'original_air_date', 'production_code',
             'directed_by', 'written_by', 'season', 'number_in_season',
             'number_in_series', 'us_viewers_in_millions', 'imdb_rating'],
)

In [None]:
# Scrape season-level Rotten Tomatoes scores and rating counts, one record per
# season. Relies on `seasons` from the episode-scraping cell.
rotten_data = []
for season in seasons:
    print(season)
    # Season pages use a zero-padded two-digit suffix (s01 ... s34).
    rt_url = f'https://www.rottentomatoes.com/tv/the_simpsons/s{season:02d}'

    # Rotten Tomatoes page; the timeout keeps a stalled request from hanging the loop.
    rt_req = requests.get(rt_url, timeout=30)
    if not rt_req.ok:
        print(f'WARNING: HTTP {rt_req.status_code} for {rt_url}')
    rt_content = rt_req.text
    rt_soup = BeautifulSoup(rt_content, 'html.parser')
    scores = rt_soup.find_all('span', 'mop-ratings-wrap__percentage')
    num_critic_ratings = rt_soup.find_all('small', 'mop-ratings-wrap__text--small')
    num_user_ratings = rt_soup.find_all('strong', 'mop-ratings-wrap__text--small')

    # Every field defaults to NaN and is filled in only when the page has it.
    ep_row = {
        'season': season,
        'rt_critic_score': np.nan,
        'rt_user_score': np.nan,
        'rt_critic_count': np.nan,
        'rt_user_count': np.nan,
    }

    if len(scores) == 1:
        # A single score is taken to be the user score (no critic score).
        ep_row['rt_user_score'] = scores[0].text.strip()
    elif len(scores) == 2:
        ep_row['rt_critic_score'] = scores[0].text.strip()
        ep_row['rt_user_score'] = scores[1].text.strip()

    # Counts get the same fallback as the scores: a page without these
    # elements leaves NaN instead of raising IndexError mid-scrape.
    if num_critic_ratings:
        ep_row['rt_critic_count'] = num_critic_ratings[0].text.strip()
    if len(num_user_ratings) > 1:
        # The count is the piece following the first ':' in the element's text.
        user_count_parts = num_user_ratings[1].text.strip().split(":")
        if len(user_count_parts) > 1:
            ep_row['rt_user_count'] = user_count_parts[1].strip()

    rotten_data.append(ep_row)

rotten_df = pd.DataFrame(
    rotten_data,
    columns=['season', 'rt_critic_score', 'rt_user_score', 'rt_critic_count', 'rt_user_count'],
)

In [None]:
# Preview the first rows of the Rotten Tomatoes table.
rotten_df.head()

In [None]:
# Preview the last rows of the Rotten Tomatoes table.
rotten_df.tail()

In [None]:
# Save the season-level scores; the positional index is left out of the file.
rotten_df.to_csv(path_or_buf='simpsons_rt_scores.csv', index=False)

In [None]:
# Label the index 'id' so it is written under that header when the table is saved.
episode_data.index = episode_data.index.rename('id')

In [None]:
# Preview the first rows of the episode table.
episode_data.head()

In [None]:
# Preview the last rows of the episode table.
episode_data.tail()

In [None]:
# Save the episode table, index column included.
episode_data.to_csv('simpsons_episodes.csv', index=True)