## Hulu Viewing Habits

In [14]:
# Import Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from functools import reduce

In [15]:
# Load the three CSV inputs from the local data/ folder, in the same order as before
hulu, streaming_services, streaming_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/HuluViewingHistoryUpdated.csv",
        "data/tv_shows.csv",
        "data/titles.csv",
    )
)

In [16]:
# Preview the first five rows of the raw viewing history
hulu.head(n=5)

Unnamed: 0,Episode Name,Series Name,Season,Last Played At
0,I Know Who Did It,Only Murders in the Building,2.0,10/30/2022 22:18
1,Sparring Partners,Only Murders in the Building,2.0,10/30/2022 21:39
2,"Hello, Darkness",Only Murders in the Building,2.0,10/30/2022 21:03
3,Flipping the Pieces,Only Murders in the Building,2.0,10/30/2022 5:48
4,Performance Review,Only Murders in the Building,2.0,10/30/2022 5:12


In [17]:
# Label every row of the viewing history with the service it belongs to
hulu.loc[:, "Streaming Service"] = "Hulu"

# Confirm the new column is present
hulu.head(n=5)

Unnamed: 0,Episode Name,Series Name,Season,Last Played At,Streaming Service
0,I Know Who Did It,Only Murders in the Building,2.0,10/30/2022 22:18,Hulu
1,Sparring Partners,Only Murders in the Building,2.0,10/30/2022 21:39,Hulu
2,"Hello, Darkness",Only Murders in the Building,2.0,10/30/2022 21:03,Hulu
3,Flipping the Pieces,Only Murders in the Building,2.0,10/30/2022 5:48,Hulu
4,Performance Review,Only Murders in the Building,2.0,10/30/2022 5:12,Hulu


In [18]:
# Drop columns that aren't needed.
# `columns=` already targets the column axis, so the redundant `axis=1` is omitted.
# Note: re-running this cell after "Season" is gone raises KeyError (the drop is
# applied to the already-trimmed frame).
hulu = hulu.drop(columns=["Season"])

# View updated df
hulu.head()

Unnamed: 0,Episode Name,Series Name,Last Played At,Streaming Service
0,I Know Who Did It,Only Murders in the Building,10/30/2022 22:18,Hulu
1,Sparring Partners,Only Murders in the Building,10/30/2022 21:39,Hulu
2,"Hello, Darkness",Only Murders in the Building,10/30/2022 21:03,Hulu
3,Flipping the Pieces,Only Murders in the Building,10/30/2022 5:48,Hulu
4,Performance Review,Only Murders in the Building,10/30/2022 5:12,Hulu


In [19]:
# Fix the column names in the dataframe: the series column becomes "Title"
# (the key the later merge with streaming_services joins on) and the
# timestamp column becomes "Date Watched".
fixed_columns = {
    "Series Name":"Title",
    "Last Played At":"Date Watched"
}

# Rename by reassignment rather than inplace=True, consistent with how the
# drop cells rebind `hulu`.
hulu = hulu.rename(columns=fixed_columns)

# Check column names are displaying correctly
hulu.head()

Unnamed: 0,Episode Name,Title,Date Watched,Streaming Service
0,I Know Who Did It,Only Murders in the Building,10/30/2022 22:18,Hulu
1,Sparring Partners,Only Murders in the Building,10/30/2022 21:39,Hulu
2,"Hello, Darkness",Only Murders in the Building,10/30/2022 21:03,Hulu
3,Flipping the Pieces,Only Murders in the Building,10/30/2022 5:48,Hulu
4,Performance Review,Only Murders in the Building,10/30/2022 5:12,Hulu


In [20]:
# Inspect column dtypes and non-null counts. In the saved output, "Date Watched"
# has missing values (367 of 427 non-null), "Title" has one missing value, and
# every column (including the timestamp) is stored as object.
hulu.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 427 entries, 0 to 426
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Episode Name       427 non-null    object
 1   Title              426 non-null    object
 2   Date Watched       367 non-null    object
 3   Streaming Service  427 non-null    object
dtypes: object(4)
memory usage: 13.5+ KB


In [21]:
# Preview the first five rows of the tv_shows data before trimming its columns
streaming_services.head(n=5)

Unnamed: 0.1,Unnamed: 0,ID,Title,Year,Age,IMDb,Rotten Tomatoes,Netflix,Hulu,Prime Video,Disney+,Type
0,0,1,Breaking Bad,2008,18+,9.4/10,100/100,1,0,0,0,1
1,1,2,Stranger Things,2016,16+,8.7/10,96/100,1,0,0,0,1
2,2,3,Attack on Titan,2013,18+,9.0/10,95/100,1,1,0,0,1
3,3,4,Better Call Saul,2015,18+,8.8/10,94/100,1,0,0,0,1
4,4,5,Dark,2017,16+,8.8/10,93/100,1,0,0,0,1


In [22]:
# Drop columns that aren't needed: "Unnamed: 0" (appears to be a saved copy of the
# row index), the ID, the 0/1 per-platform columns and the Type column.
# `columns=` already targets the column axis, so the redundant `axis=1` is omitted.
streaming_services = streaming_services.drop(
    columns=["Unnamed: 0", "ID", "Netflix", "Hulu", "Prime Video", "Disney+", "Type"]
)

# View updated df
streaming_services.head()

Unnamed: 0,Title,Year,Age,IMDb,Rotten Tomatoes
0,Breaking Bad,2008,18+,9.4/10,100/100
1,Stranger Things,2016,16+,8.7/10,96/100
2,Attack on Titan,2013,18+,9.0/10,95/100
3,Better Call Saul,2015,18+,8.8/10,94/100
4,Dark,2017,16+,8.8/10,93/100


In [26]:
# Attach year / age rating / review scores to each viewing-history row.
# The inner join keeps only rows whose "Title" matches a title in
# streaming_services exactly (427 -> 142 rows in the saved run).
# NOTE(review): matching is on the title string alone, so a different work that
# shares a name would also match -- confirm the matches are the intended shows.
hulu_streaming = hulu.merge(streaming_services, on="Title", how="inner")

hulu_streaming.head(n=5)

Unnamed: 0,Episode Name,Title,Date Watched,Streaming Service,Year,Age,IMDb,Rotten Tomatoes
0,Morning,The Handmaid's Tale,10/25/2022 19:09,Hulu,2017,18+,8.4/10,90/100
1,The Wilderness,The Handmaid's Tale,11/17/2021 3:10,Hulu,2017,18+,8.4/10,90/100
2,Dark Shadows,Dark Shadows,10/19/2022 19:51,Hulu,1966,7+,7.5/10,62/100
3,Hotel Transylvania,Hotel Transylvania,10/17/2022 14:49,Hulu,2017,7+,5.5/10,43/100
4,Hotel Transylvania,Hotel Transylvania,10/16/2022 18:13,Hulu,2017,7+,5.5/10,43/100


In [27]:
# Inspect the merged frame. In the saved output it has 142 rows, with missing
# values in "Date Watched" (116 non-null) and "Age" (135 non-null); "IMDb" and
# "Rotten Tomatoes" are object columns holding strings such as "8.4/10".
hulu_streaming.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142 entries, 0 to 141
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Episode Name       142 non-null    object
 1   Title              142 non-null    object
 2   Date Watched       116 non-null    object
 3   Streaming Service  142 non-null    object
 4   Year               142 non-null    int64 
 5   Age                135 non-null    object
 6   IMDb               142 non-null    object
 7   Rotten Tomatoes    142 non-null    object
dtypes: int64(1), object(7)
memory usage: 10.0+ KB


In [31]:
# Count how many merged history rows each title has; value_counts orders the
# result from most to fewest, so the head shows the most-watched titles.
title_column = hulu_streaming["Title"]
top_shows = title_column.value_counts()

top_shows.head(n=5)

This Is Us        18
Home Economics    15
Blossom           15
Grey's Anatomy    11
Fargo             11
Name: Title, dtype: int64