## Analysing Personal 

https://www.dataquest.io/blog/python-tutorial-analyze-personal-netflix-data/

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:

df = pd.read_csv('ViewingActivity.csv')

In [3]:
df.shape

(24520, 10)

In [4]:
df.head()

Unnamed: 0,Profile Name,Start Time,Duration,Attributes,Title,Supplemental Video Type,Device Type,Bookmark,Latest Bookmark,Country
0,Karina,2024-02-24 19:17:21,00:03:42,,Gilmore Girls: Season 4: Nag Hammadi Is Where ...,,Android DefaultWidevineL3Phone Android Phone,00:04:24,00:04:24,NZ (New Zealand)
1,Karina,2024-02-24 18:34:32,00:42:48,,Gilmore Girls: Season 4: A Family Matter (Epis...,,Android DefaultWidevineL3Phone Android Phone,00:43:23,00:43:23,NZ (New Zealand)
2,Karina,2024-02-24 18:25:53,00:00:36,Autoplayed: user action: None;,Season 6 Teaser: Formula 1: Drive to Survive,TEASER_TRAILER,Chrome PC (Cadmium),00:00:00,00:00:00,NZ (New Zealand)
3,Karina,2024-02-22 18:37:46,00:00:46,,Gilmore Girls: Season 4: A Family Matter (Epis...,,Android DefaultWidevineL3Phone Android Phone,00:00:46,Not latest view,NZ (New Zealand)
4,Karina,2024-02-22 18:06:46,00:30:56,,Gilmore Girls: Season 4: In the Clamor and the...,,Android DefaultWidevineL3Phone Android Phone,00:43:16,00:43:16,NZ (New Zealand)


### Cleaning Data

In [5]:
# user puts in their name
name = input('Whose data would you like to analyse? ').title()

#filter by the user's name
df = df[df['Profile Name'] == name]

Whose data would you like to analyse? Karina


In [6]:
for i in df.columns:
    print('*'*50, '\n',i.upper(), '\n', df[i].unique(), '\n')

************************************************** 
 PROFILE NAME 
 ['Karina'] 

************************************************** 
 START TIME 
 ['2024-02-24 19:17:21' '2024-02-24 18:34:32' '2024-02-24 18:25:53' ...
 '2018-02-17 22:56:25' '2018-02-17 22:22:21' '2018-02-17 21:27:35'] 

************************************************** 
 DURATION 
 ['00:03:42' '00:42:48' '00:00:36' ... '00:53:50' '00:57:00' '00:54:42'] 

************************************************** 
 ATTRIBUTES 
 [nan 'Autoplayed: user action: None; '
 'Autoplayed: user action: User_Interaction; '
 'Autoplayed: user action: Unspecified; '] 

************************************************** 
 TITLE 
 ['Gilmore Girls: Season 4: Nag Hammadi Is Where They Found the Gnostic Gospels (Episode 13)'
 'Gilmore Girls: Season 4: A Family Matter (Episode 12)'
 'Season 6 Teaser: Formula 1: Drive to Survive' ...
 'The Crown: Season 1: Windsor (Episode 3)'
 'The Crown: Season 1: Hyde Park Corner (Episode 2)'
 'The Crown: Seas

In [7]:
df['Supplemental Video Type'].unique()

array([nan, 'TEASER_TRAILER', 'TRAILER', 'HOOK', 'CINEMAGRAPH', 'RECAP',
       'PROMOTIONAL', 'PREVIEW'], dtype=object)

In [8]:
# Netflix data includes all the little trailers played so filtering out everything except for proper viewings
df = df[df[ 'Supplemental Video Type'].isna()]

### Dropping loads of boring columns

In [9]:
df = df.drop(['Profile Name', 'Attributes', 'Supplemental Video Type', 'Device Type', 'Bookmark', 'Latest Bookmark', 'Country'], axis=1)
df.head(1)

Unnamed: 0,Start Time,Duration,Title
0,2024-02-24 19:17:21,00:03:42,Gilmore Girls: Season 4: Nag Hammadi Is Where ...


In [10]:
df.dtypes

Start Time    object
Duration      object
Title         object
dtype: object

### Convert Start Time to datetime

In [11]:
df['Start Time'] = pd.to_datetime(df['Start Time']) # not setting a timezone
# df['Start Time'] = pd.to_datetime(df['Start Time'], utc=True)
df.dtypes

Start Time    datetime64[ns]
Duration              object
Title                 object
dtype: object

In [12]:
# # change the Start Time column into the dataframe's index
# df = df.set_index('Start Time')

# # convert from UTC timezone to eastern time
# df.index = df.index.tz_convert('US/Eastern')

# # reset the index so that Start Time becomes a column again
# df = df.reset_index()

# #double-check that it worked
# df.head(1)

In [13]:
df['Duration'] = pd.to_timedelta(df['Duration'])
df.dtypes

Start Time     datetime64[ns]
Duration      timedelta64[ns]
Title                  object
dtype: object

### Creating new variable from Title: Movie or Season?

In [14]:
# Check if 'title' column contains 'seasons' and assign 'tv show' or 'movie' accordingly
# update according to conditions
df.loc[df['Title'].str.contains('Season'), 'Type'] = 'TV Show'
# df['Type'] =  [df['Type'].fillna('Movie')]
print(df)

               Start Time        Duration  \
0     2024-02-24 19:17:21 0 days 00:03:42   
1     2024-02-24 18:34:32 0 days 00:42:48   
3     2024-02-22 18:37:46 0 days 00:00:46   
4     2024-02-22 18:06:46 0 days 00:30:56   
5     2024-02-22 18:05:08 0 days 00:00:44   
...                   ...             ...   
16478 2018-02-18 00:25:30 0 days 00:56:25   
16479 2018-02-17 23:28:28 0 days 00:57:00   
16480 2018-02-17 22:56:25 0 days 00:31:03   
16481 2018-02-17 22:22:21 0 days 00:28:27   
16482 2018-02-17 21:27:35 0 days 00:54:42   

                                                   Title     Type  
0      Gilmore Girls: Season 4: Nag Hammadi Is Where ...  TV Show  
1      Gilmore Girls: Season 4: A Family Matter (Epis...  TV Show  
3      Gilmore Girls: Season 4: A Family Matter (Epis...  TV Show  
4      Gilmore Girls: Season 4: In the Clamor and the...  TV Show  
5      Gilmore Girls: Season 4: In the Clamor and the...  TV Show  
...                                                

### Splitting the df for  Movie and TV Show

In [15]:
movie_df = df[df['Type'] == 'Movie']
tvshow_df = df[df['Type'] == 'TV Show']

In [16]:
movie_df.head()

Unnamed: 0,Start Time,Duration,Title,Type


In [17]:
tvshow_df.head()

Unnamed: 0,Start Time,Duration,Title,Type
0,2024-02-24 19:17:21,0 days 00:03:42,Gilmore Girls: Season 4: Nag Hammadi Is Where ...,TV Show
1,2024-02-24 18:34:32,0 days 00:42:48,Gilmore Girls: Season 4: A Family Matter (Epis...,TV Show
3,2024-02-22 18:37:46,0 days 00:00:46,Gilmore Girls: Season 4: A Family Matter (Epis...,TV Show
4,2024-02-22 18:06:46,0 days 00:30:56,Gilmore Girls: Season 4: In the Clamor and the...,TV Show
5,2024-02-22 18:05:08,0 days 00:00:44,Gilmore Girls: Season 4: In the Clamor and the...,TV Show


## Cleaning the TV Shows Titles

In [18]:
# dropping Type from columns
tvshow_df = tvshow_df[[
    'Start Time',
    'Duration',
    'Title'
]]

tvshow_df

Unnamed: 0,Start Time,Duration,Title
0,2024-02-24 19:17:21,0 days 00:03:42,Gilmore Girls: Season 4: Nag Hammadi Is Where ...
1,2024-02-24 18:34:32,0 days 00:42:48,Gilmore Girls: Season 4: A Family Matter (Epis...
3,2024-02-22 18:37:46,0 days 00:00:46,Gilmore Girls: Season 4: A Family Matter (Epis...
4,2024-02-22 18:06:46,0 days 00:30:56,Gilmore Girls: Season 4: In the Clamor and the...
5,2024-02-22 18:05:08,0 days 00:00:44,Gilmore Girls: Season 4: In the Clamor and the...
...,...,...,...
16478,2018-02-18 00:25:30,0 days 00:56:25,The Crown: Season 1: Act of God (Episode 4)
16479,2018-02-17 23:28:28,0 days 00:57:00,The Crown: Season 1: Windsor (Episode 3)
16480,2018-02-17 22:56:25,0 days 00:31:03,The Crown: Season 1: Hyde Park Corner (Episode 2)
16481,2018-02-17 22:22:21,0 days 00:28:27,The Crown: Season 1: Hyde Park Corner (Episode 2)


In [19]:
#stripping out the title 
# all titles include name of the episode:
# 'Gilmore Girls: Season 4: Nag Hammadi Is Where...'

tvshow_df['Title'] = tvshow_df['Title'].str.split(':', n=1).str[0]

In [20]:
# Extract year from the 'Start Time' column
tvshow_df['Year'] = tvshow_df['Start Time'].dt.year

# Count occurrences of each TV show for each year
# result = tvshow_df.groupby(['Title', 'Year']).size().reset_index()

# Sum up duration of each TV show for each year
result = tvshow_df.groupby(['Title', 'Year'])['Duration'].sum().reset_index()
result

Unnamed: 0,Title,Year,Duration
0,(Un)Well,2020,0 days 02:51:02
1,13 Reasons Why,2018,1 days 09:54:19
2,13 Reasons Why,2019,0 days 03:24:37
3,13 Reasons Why,2020,0 days 00:13:04
4,13 Reasons Why,2021,0 days 06:05:30
...,...,...,...
208,iZombie,2018,2 days 12:09:12
209,iZombie,2019,0 days 10:15:11
210,iZombie,2020,2 days 03:22:59
211,iZombie,2021,0 days 07:13:28


In [21]:
top_15_tv_show_for_each_year = []

## Extracting TOP 15 TV SHows for each year
for year in result['Year'].unique():
    result_year = result[result['Year'] == year].sort_values(by='Duration', ascending = False)[:15].reset_index()
#     print('*'*50, '\n', year,'\n', result_year, '\n')
    top_15_tv_show_for_each_year.append(result_year) 
    
# Concat the dataframes out of the list
top_15_tv_show_for_each_year = pd.concat(top_15_tv_show_for_each_year, axis=0)

# Ordering for the chart 
top_15_tv_show_for_each_year = top_15_tv_show_for_each_year.sort_values(by=['Year', 'Duration'], ascending=[True, False])


In [22]:
# Find the longest title
longest_title_length = top_15_tv_show_for_each_year['Title'].apply(len).max()
print("Longest title length:", longest_title_length)


Longest title length: 37


In [25]:
max_title_count = top_15_tv_show_for_each_year['Duration'].max()


fig = px.bar(top_15_tv_show_for_each_year, 
             y = "Title", 
             x = "Duration", 
#              color_discrete_sequence=["#EF553B"],
             animation_frame="Year",
#                           animation_group="Year", 
             orientation='h', # Set orientation to vertical

             range_x=[0, max_title_count],
#              range_y=[0, longest_title_length],
             labels={"y": "Title", "x": "Duration"},
             title="How has my viewings changed? ",
             
#             category_orders={"Year": top_15_tv_show_for_each_year['Year'].unique() }
            )

# Set the frame duration to 1000 milliseconds (1 second) per frame for slower animation
fig.layout.updatemenus[0].buttons[0].args[1]['frame']['duration'] = 2000



fig.show()


In [26]:
df.to_csv('Karina_viewings.csv')