In [1]:
import requests
import bs4
import pandas as pd
import numpy as np

## Get TV shows from Hulu and Disney+

#### Goal

- Use webscraping to get the tv shows
- Export each data frame as csv

#### General information:

- Hulu has 1,744 tv shows, 50 per page, 35 pages in total.

- Disney+ has 179 tv shows, 50 per page, 4 pages in total.


### 1. Define function to get tv shows and additional information

We will use web scraping to get the tv shows from the website reelgood.com.

Each page has up to 50 tv shows.

**NOTE: We are using the same function as in 01_get_tv_shows_df notebook.**

In [2]:
# Define function that takes the url and number of tv shows you want to get from it, default is 50 which is the maximum.

def get_tv_shows(url, no=50):
    """Scrape one reelgood.com listing page into a dict of column lists.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    no : int, default 50
        Maximum number of shows to read from the page. When the page holds
        fewer rows than requested, only the rows present are returned.

    Returns
    -------
    dict
        Keys "show", "year", "rating", "imdb" and "rotten_tomatoes", each
        mapped to a list with one entry per show, in page order.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the listing table is not present in the downloaded page.
    """
    # A timeout keeps a stalled connection from hanging the notebook, and
    # raise_for_status stops an error page from being parsed as a listing.
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.content, "html.parser")

    table = soup.find("table", class_="css-1179hly")
    if table is None:
        raise ValueError("no tv show table found at {}".format(url))

    # Query the table once up front instead of re-scanning it for every value.
    title_cells = table.find_all("td", class_="css-1u7zfla")
    info_cells = table.find_all("td", class_="css-1u11l3y")

    # Every show owns one title cell and four consecutive info cells
    # (year, rating, imdb, rotten tomatoes); never read past what is there.
    count = max(0, min(no, len(title_cells), len(info_cells) // 4))

    shows = {"show": [title_cells[i].find("a").contents[0] for i in range(count)]}
    for position, key in enumerate(("year", "rating", "imdb", "rotten_tomatoes")):
        shows[key] = [info_cells[i * 4 + position].contents[0] for i in range(count)]

    return shows

## 2. Get tv shows for each streaming service using get_tv_shows function


## Hulu

In [3]:
# Scrape the full Hulu pages (offsets 0, 50, ..., 1650). The shorter last
# page at offset 1700 is fetched separately in the next cell.
hulu_base_url = "https://reelgood.com/tv/source/hulu"
hulu_frames = []

for offset in range(0, 1700, 50):
    # The first page has no offset parameter; later pages are addressed by row offset.
    page_url = hulu_base_url if offset == 0 else hulu_base_url + "?offset=" + str(offset)
    hulu_frames.append(pd.DataFrame(get_tv_shows(page_url)))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole frame on every iteration. The per-page index (0..49) is
# kept, as it was with append.
hulu_shows = pd.concat(hulu_frames)

In [4]:
# The last Hulu page (offset 1700) holds only the remaining 44 of the 1,744 shows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
hulu_last_page = pd.DataFrame(get_tv_shows("https://reelgood.com/tv/source/hulu?offset=1700", 44))
hulu_shows = pd.concat([hulu_shows, hulu_last_page])


In [5]:
# Preview the first five scraped Hulu rows.
hulu_shows.head(n=5)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
0,Rick and Morty,2013,18+,9.2,94%
1,Fargo,2014,18+,8.9,96%
2,Vikings,2013,18+,8.5,93%
3,Brooklyn Nine-Nine,2013,16+,8.4,97%
4,Attack on Titan,2013,16+,8.8,94%


## Disney+

In [6]:
# Scrape the full Disney+ pages (offsets 0, 50, 100). The shorter last page
# at offset 150 is fetched separately in the next cell.
disney_base_url = "https://reelgood.com/tv/source/disney_plus"
disney_frames = []

for offset in range(0, 150, 50):
    # The first page has no offset parameter; later pages are addressed by row offset.
    page_url = disney_base_url if offset == 0 else disney_base_url + "?offset=" + str(offset)
    disney_frames.append(pd.DataFrame(get_tv_shows(page_url)))

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole frame on every iteration. The per-page index (0..49) is
# kept, as it was with append.
disney_shows = pd.concat(disney_frames)

In [7]:
# The last Disney+ page (offset 150) holds only the remaining 29 of the 179 shows.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
disney_last_page = pd.DataFrame(get_tv_shows("https://reelgood.com/tv/source/disney_plus?offset=150", 29))
disney_shows = pd.concat([disney_shows, disney_last_page])


In [8]:
# Preview the first five scraped Disney+ rows.
disney_shows.head(n=5)

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
0,The Mandalorian,2019,7+,8.7,93%
1,The Simpsons,1989,7+,8.7,85%
2,Gravity Falls,2012,7+,8.9,100%
3,Star Wars: The Clone Wars,2008,7+,8.2,93%
4,Marvel's Agent Carter,2015,7+,7.9,86%


## 3. Export data

In [9]:
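# Export is disabled so that re-running the notebook does not overwrite existing files.
# Uncomment the two lines below to write each data frame to CSV.
# hulu_shows.to_csv("Data_Hulu_Disney/hulu_shows.csv", index_label=False)
# disney_shows.to_csv("Data_Hulu_Disney/disney_shows.csv", index_label=False)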