In [1]:
# Setup

import pandas as pd
from pathlib import Path

# Datasets
#
# Each CSV path is declared right next to the frame that is loaded from it.
# All three files are read as UTF-8.

# TV shows

tv_shows = Path("data/tv_shows.csv")
tv_data = pd.read_csv(tv_shows, encoding="utf-8")

# Movies

movies = Path("data/movies.csv")
movie_data = pd.read_csv(movies, encoding="utf-8")

# Subscription price history

price_history = Path("data/price_history.csv")
price_data = pd.read_csv(price_history, encoding="utf-8")

In [2]:
# Clean up tv_data
#
# Single pipeline:
#   1. drop the "Unnamed: 0" index column together with "ID" and "Type"
#   2. remove the "/10" and "/100" text from the rating strings and cast the
#      results to float (IMDb) and int (Rotten Tomatoes)

tv_data = (
    tv_data
    .drop(columns=["Unnamed: 0", "ID", "Type"])
    .assign(**{
        "IMDb": lambda shows: shows["IMDb"].str.replace("/10", "").astype(float),
        "Rotten Tomatoes": lambda shows: (
            shows["Rotten Tomatoes"].str.replace("/100", "").astype(int)
        ),
    })
)

tv_data.head(10)

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,Breaking Bad,2008,18+,9.4,100,1,0,0,0
1,Stranger Things,2016,16+,8.7,96,1,0,0,0
2,Attack on Titan,2013,18+,9.0,95,1,1,0,0
3,Better Call Saul,2015,18+,8.8,94,1,0,0,0
4,Dark,2017,16+,8.8,93,1,0,0,0
5,Avatar: The Last Airbender,2005,7+,9.3,93,1,0,1,0
6,Peaky Blinders,2013,18+,8.8,93,1,0,0,0
7,The Walking Dead,2010,18+,8.2,93,1,0,0,0
8,Black Mirror,2011,18+,8.8,92,1,0,0,0
9,The Queen's Gambit,2020,18+,8.6,92,1,0,0,0


In [3]:
# Clean up movie_data
#
# Single pipeline:
#   1. drop the "Unnamed: 0" index column together with "ID" and "Type"
#   2. discard rows that have no Rotten Tomatoes rating
#   3. remove the "/100" text from the rating strings and cast to int

movie_data = (
    movie_data
    .drop(columns=["Unnamed: 0", "ID", "Type"])
    .dropna(subset=["Rotten Tomatoes"])
    .assign(**{
        "Rotten Tomatoes": lambda films: (
            films["Rotten Tomatoes"].str.replace("/100", "").astype(int)
        ),
    })
)

movie_data.head(10)

Unnamed: 0,Title,Year,Age,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,The Irishman,2019,18+,98,1,0,0,0
1,Dangal,2016,7+,97,1,0,0,0
2,David Attenborough: A Life on Our Planet,2020,7+,95,1,0,0,0
3,Lagaan: Once Upon a Time in India,2001,7+,94,1,0,0,0
4,Roma,2018,18+,94,1,0,0,0
5,To All the Boys I've Loved Before,2018,13+,94,1,0,0,0
6,The Social Dilemma,2020,13+,93,1,0,0,0
7,Okja,2017,13+,92,1,0,0,0
8,The Ballad of Buster Scruggs,2018,16+,92,1,0,0,0
9,The Trial of the Chicago 7,2020,18+,92,1,0,0,0


In [4]:
# Combine tv and movie data
#
# The TV rows are stacked on top of the movie rows and the result is given a
# fresh 0..n-1 index. The IMDb column is then dropped (not applicable to movies).

all_content = (
    pd.concat([tv_data, movie_data], ignore_index=True)
    .drop(columns="IMDb")
)

all_content.head(10)

Unnamed: 0,Title,Year,Age,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+
0,Breaking Bad,2008,18+,100,1,0,0,0
1,Stranger Things,2016,16+,96,1,0,0,0
2,Attack on Titan,2013,18+,95,1,1,0,0
3,Better Call Saul,2015,18+,94,1,0,0,0
4,Dark,2017,16+,93,1,0,0,0
5,Avatar: The Last Airbender,2005,7+,93,1,0,1,0
6,Peaky Blinders,2013,18+,93,1,0,0,0
7,The Walking Dead,2010,18+,93,1,0,0,0
8,Black Mirror,2011,18+,92,1,0,0,0
9,The Queen's Gambit,2020,18+,92,1,0,0,0


In [5]:
# Clean Up price_data

# Rename columns to their display labels

price_data = price_data.rename(columns={
    "service": "Streaming Service",
    "date": "Date",
    "price": "Subscription Price"
})

# Format dates and prices
#
# Dates are parsed with format "%b-%Y" (abbreviated month name, 4-digit year)
# and re-rendered as "<month>/<year>" with no leading zero on the month,
# e.g. "7/2011". The no-padding "%-m" directive is a platform-specific
# extension that is not available everywhere (it fails on Windows), so the
# portable "%m" is used and the zero padding is stripped afterwards.
# NOTE: after this step the Date column holds strings, not datetimes.

price_data["Date"] = (
    pd.to_datetime(price_data["Date"], format="%b-%Y")
    .dt.strftime("%m/%Y")
    .str.lstrip("0")
)

price_data["Subscription Price"] = price_data["Subscription Price"].astype(float)

price_data.head(10)

Unnamed: 0,Streaming Service,Date,Subscription Price
0,Netflix,7/2011,7.99
1,Netflix,8/2011,7.99
2,Netflix,9/2011,7.99
3,Netflix,10/2011,7.99
4,Netflix,11/2011,7.99
5,Netflix,12/2011,7.99
6,Netflix,1/2012,7.99
7,Netflix,2/2012,7.99
8,Netflix,3/2012,7.99
9,Netflix,4/2012,7.99


In [7]:
# Price data with only Netflix, Prime Video, Hulu, and Disney+
#
# Rows belonging to any other streaming service are filtered out. The filter
# runs on a copy of price_data, so price_data itself is left untouched.

top_service_pricing = price_data.copy().loc[
    lambda prices: prices["Streaming Service"].isin(
        ["Netflix", "Prime Video", "Hulu", "Disney+"]
    )
]

top_service_pricing.head(10)

Unnamed: 0,Streaming Service,Date,Subscription Price
0,Netflix,7/2011,7.99
1,Netflix,8/2011,7.99
2,Netflix,9/2011,7.99
3,Netflix,10/2011,7.99
4,Netflix,11/2011,7.99
5,Netflix,12/2011,7.99
6,Netflix,1/2012,7.99
7,Netflix,2/2012,7.99
8,Netflix,3/2012,7.99
9,Netflix,4/2012,7.99


In [14]:
# DF for only Netflix content
#
# Keep the rows whose "Netflix" flag equals 1, discard all four per-service
# flag columns, and renumber the remaining rows from 0.

netflix_content = (
    all_content
    .loc[lambda catalog: catalog["Netflix"].eq(1)]
    .drop(columns=["Netflix", "Prime Video", "Hulu", "Disney+"])
    .reset_index(drop=True)
)

netflix_content.head(10)

Unnamed: 0,Title,Year,Age,Rotten Tomatoes
0,Breaking Bad,2008,18+,100
1,Stranger Things,2016,16+,96
2,Attack on Titan,2013,18+,95
3,Better Call Saul,2015,18+,94
4,Dark,2017,16+,93
5,Avatar: The Last Airbender,2005,7+,93
6,Peaky Blinders,2013,18+,93
7,The Walking Dead,2010,18+,93
8,Black Mirror,2011,18+,92
9,The Queen's Gambit,2020,18+,92


In [15]:
# DF for only Prime Video content
#
# Keep the rows whose "Prime Video" flag equals 1, discard all four
# per-service flag columns, and renumber the remaining rows from 0.

prime_content = (
    all_content
    .loc[lambda catalog: catalog["Prime Video"].eq(1)]
    .drop(columns=["Netflix", "Prime Video", "Hulu", "Disney+"])
    .reset_index(drop=True)
)

prime_content.head(10)

Unnamed: 0,Title,Year,Age,Rotten Tomatoes
0,Avatar: The Last Airbender,2005,7+,93
1,Community,2009,7+,90
2,Shameless,2011,18+,90
3,American Horror Story,2011,18+,86
4,Star Trek: The Next Generation,1987,7+,86
5,Penny Dreadful,2014,18+,85
6,Downton Abbey,2010,16+,83
7,Star Trek,1966,7+,82
8,Star Trek: Deep Space Nine,1993,7+,80
9,Dark Matter,2015,18+,78


In [16]:
# DF for only Hulu content
#
# Keep the rows whose "Hulu" flag equals 1, discard all four per-service
# flag columns, and renumber the remaining rows from 0.

hulu_content = (
    all_content
    .loc[lambda catalog: catalog["Hulu"].eq(1)]
    .drop(columns=["Netflix", "Prime Video", "Hulu", "Disney+"])
    .reset_index(drop=True)
)

hulu_content.head(10)

Unnamed: 0,Title,Year,Age,Rotten Tomatoes
0,Attack on Titan,2013,18+,95
1,Community,2009,7+,90
2,Shameless,2011,18+,90
3,Fullmetal Alchemist: Brotherhood,2009,16+,88
4,Grey's Anatomy,2005,16+,87
5,Arrested Development,2003,16+,87
6,Demon Slayer: Kimetsu no Yaiba,2019,18+,86
7,American Horror Story,2011,18+,86
8,Star Trek: The Next Generation,1987,7+,86
9,One Piece,1999,7+,85


In [17]:
# DF for only Disney+ content
#
# Keep the rows whose "Disney+" flag equals 1, discard all four per-service
# flag columns, and renumber the remaining rows from 0.

disney_content = (
    all_content
    .loc[lambda catalog: catalog["Disney+"].eq(1)]
    .drop(columns=["Netflix", "Prime Video", "Hulu", "Disney+"])
    .reset_index(drop=True)
)

disney_content.head(10)

Unnamed: 0,Title,Year,Age,Rotten Tomatoes
0,Beauty and the Beast,2012,7+,69
1,PJ Masks,2015,all,55
2,X-Men,2011,16+,53
3,Gigantosaurus,2019,all,48
4,Inspector Gadget,2015,all,40
5,The Jungle Book,2010,all,39
6,The Simpsons,1989,7+,91
7,Gravity Falls,2012,7+,88
8,Cosmos,2014,all,82
9,Marvel's Runaways,2017,16+,76
