In [1]:
import requests
import bs4
import pandas as pd
import numpy as np

# Base directory used by the (currently commented-out) CSV export cells.
# NOTE: this is an absolute path on one specific machine; adjust it before running elsewhere.
path = "C:/Users/Admin/Documents/ironhack/streaming_service_recommender/"

## Get TV shows from each streaming service

#### Goal

- Use webscraping to get the tv shows
- Export each data frame as csv

#### General information:

- Netflix has 1,915 tv shows, 50 per page, 39 pages in total.

- Prime has 2,136 tv shows, 50 per page, 43 pages in total.
- HBO has 200 tv shows, 50 per page, 4 pages in total.


### 1. Define function to get tv shows and additional information

We will use web scraping to get the tv shows from the website reelgood.com.

Each page has up to 50 tv shows.

In [2]:
# Define function that takes the url and number of tv shows you want to get from it, default is 50 which is the maximum.

def get_tv_shows(url, no=50):
    """Scrape one tv show listing page into a dict of column lists.

    Parameters
    ----------
    url : str
        Address of the listing page to download.
    no : int, default 50
        Maximum number of shows to read from the page. If the page lists
        fewer shows than requested, only the shows present are returned.

    Returns
    -------
    dict
        Keys "show", "year", "rating", "imdb" and "rotten_tomatoes", each
        mapping to a list with one entry per show, in page order. The dict
        can be passed straight to ``pd.DataFrame``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    ValueError
        If the page does not contain the expected listing table.
    """
    # get response and contents using beautiful soup;
    # the timeout keeps a stalled connection from hanging the notebook forever
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    soup = bs4.BeautifulSoup(resp.content, "html.parser")

    table = soup.find("table", class_="css-1179hly")
    if table is None:
        raise ValueError("No tv show table found at " + url)

    # query the table once instead of re-searching it for every field of every row
    title_cells = table.find_all("td", class_="css-1u7zfla")
    detail_cells = table.find_all("td", class_="css-1u11l3y")

    # every show owns four consecutive detail cells (year, rating, imdb, rotten tomatoes);
    # never read past what the page actually contains
    count = min(no, len(title_cells), len(detail_cells) // 4)

    # get list of tv shows
    shows = {"show": [title_cells[i].find("a").contents[0] for i in range(count)]}

    # get lists for years, rating, imdb and rotten tomatoes reviews
    shows["year"] = [detail_cells[i * 4 + 0].contents[0] for i in range(count)]
    shows["rating"] = [detail_cells[i * 4 + 1].contents[0] for i in range(count)]
    shows["imdb"] = [detail_cells[i * 4 + 2].contents[0] for i in range(count)]
    shows["rotten_tomatoes"] = [detail_cells[i * 4 + 3].contents[0] for i in range(count)]

    return shows

## 2. Get tv shows for each streaming service using get_tv_shows function

Next, we will create a loop that calls the function once per page, updating the url each time to move to the next page and get the next 50 tv shows. The loop runs up to the last offset divisible by 50, so for the last page, which has fewer than 50 tv shows, we will run an individual code cell and add its result to the data frame.

Afterwards, each data frame will be exported to csv.

## Netflix

In [3]:
# Scrape every full 50-show Netflix page (offsets 0, 50, ..., 1850).
# The page frames are gathered in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
base_url = "https://reelgood.com/tv/source/netflix"
page_frames = []

for offset in range(0, 1900, 50):
    # the first page is requested without an offset parameter
    url = base_url if offset == 0 else base_url + "?offset=" + str(offset)
    page_frames.append(pd.DataFrame(get_tv_shows(url)))

netflix_shows = pd.concat(page_frames)

In [4]:
# Preview the Netflix frame built from the full 50-show pages.
netflix_shows

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
0,Breaking Bad,2008,18+,9.5,96%
1,Stranger Things,2016,16+,8.8,93%
2,Money Heist,2017,18+,8.4,91%
3,Sherlock,2010,16+,9.1,78%
4,Better Call Saul,2015,18+,8.7,97%
...,...,...,...,...,...
45,World At Your Feet,2014,,,
46,Sports Adventures,2000,,,
47,The Ultimatum,2012,,,
48,The Color of a Woman,2011,,,


In [5]:
# Add the last, partial page (15 shows at offset 1900).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
netflix_shows = pd.concat(
    [netflix_shows, pd.DataFrame(get_tv_shows("https://reelgood.com/tv/source/netflix?offset=1900", 15))]
)


In [6]:
# Each page frame carries its own 0-based index, so replace the repeated labels
# with one continuous index; drop=True discards the old labels instead of keeping them as a column.
netflix_shows = netflix_shows.reset_index(drop=True)

In [7]:
# Preview the complete Netflix frame after adding the last page and resetting the index.
netflix_shows

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
0,Breaking Bad,2008,18+,9.5,96%
1,Stranger Things,2016,16+,8.8,93%
2,Money Heist,2017,18+,8.4,91%
3,Sherlock,2010,16+,9.1,78%
4,Better Call Saul,2015,18+,8.7,97%
...,...,...,...,...,...
1910,Truth,2012,,,
1911,JingleKids,2019,,,
1912,Murphy's Law of Love,2014,,,
1913,Taste of the Country,2011,,,


In [26]:
# netflix_shows.to_csv((path + "Data/netflix_shows.csv"), index_label=False)

## Amazon Prime

In [11]:
# Scrape every full 50-show Amazon Prime page (offsets 0, 50, ..., 2050).
# The page frames are gathered in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
base_url = "https://reelgood.com/tv/source/amazon"
page_frames = []

for offset in range(0, 2100, 50):
    # the first page is requested without an offset parameter
    url = base_url if offset == 0 else base_url + "?offset=" + str(offset)
    page_frames.append(pd.DataFrame(get_tv_shows(url)))

amazon_shows = pd.concat(page_frames)

In [13]:
# Add the last, partial page (36 shows at offset 2100).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
amazon_shows = pd.concat(
    [amazon_shows, pd.DataFrame(get_tv_shows("https://reelgood.com/tv/source/amazon?offset=2100", 36))]
)


In [15]:
# Each page frame carries its own 0-based index, so replace the repeated labels
# with one continuous index; drop=True discards the old labels instead of keeping them as a column.
amazon_shows = amazon_shows.reset_index(drop=True)

In [16]:
# Preview the complete Amazon Prime frame after adding the last page and resetting the index.
amazon_shows

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
0,The Wire,2002,18+,9.3,94%
1,The Sopranos,1999,18+,9.2,92%
2,Band of Brothers,2001,18+,9.4,94%
3,Vikings,2013,18+,8.6,93%
4,House,2004,16+,8.7,90%
...,...,...,...,...,...
2131,Dropping the Soap,2017,,,
2132,Strictly Dumpling,2017,,,
2133,Pinkfong! Dinosaur Songs,2014,,,
2134,Red Dust,2013,,,


In [27]:
# amazon_shows.to_csv((path + "Data/amazon_shows.csv"), index_label=False)

## HBO

In [21]:
# Scrape every 50-show HBO page (offsets 0, 50, 100, 150).
# The page frames are gathered in a list and concatenated once: DataFrame.append
# was removed in pandas 2.0 and re-copies the whole frame on every iteration.
base_url = "https://reelgood.com/tv/source/hbo"
page_frames = []

for offset in range(0, 200, 50):
    # the first page is requested without an offset parameter
    url = base_url if offset == 0 else base_url + "?offset=" + str(offset)
    page_frames.append(pd.DataFrame(get_tv_shows(url)))

hbo_shows = pd.concat(page_frames)

In [22]:
# Each page frame carries its own 0-based index, so replace the repeated labels
# with one continuous index; drop=True discards the old labels instead of keeping them as a column.
hbo_shows = hbo_shows.reset_index(drop=True)

In [23]:
# Preview the complete HBO frame after resetting the index.
hbo_shows

Unnamed: 0,show,year,rating,imdb,rotten_tomatoes
0,Game of Thrones,2011,18+,9.3,89%
1,The Wire,2002,18+,9.3,94%
2,Chernobyl,2019,18+,9.4,96%
3,The Sopranos,1999,18+,9.2,92%
4,Band of Brothers,2001,18+,9.4,94%
...,...,...,...,...,...
195,Saving My Tomorrow,2014,,,
196,El Negocio,2014,,,
197,HBO Latino Presents: A Tiny Audience,2019,,,
198,Russell Simmons Presents Brave New Voices,2009,,6.2,


In [29]:
# hbo_shows.to_csv((path + "Data/hbo_shows.csv"), index_label=False)