# Netflix Data Analysis (Based on View Activity)


In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
# Apply seaborn's default theme to every figure drawn below.
sns.set_theme()

In [2]:
# Load the raw viewing-activity export; the path is relative to the notebook's
# working directory.
df_activity = pd.read_csv(
    'netflix-report/CONTENT_INTERACTION/ViewingActivity.csv'
)

In [3]:
# Preview the first rows of the raw export to see which columns are available.
df_activity.head()

Unnamed: 0,Profile Name,Start Time,Duration,Attributes,Title,Supplemental Video Type,Device Type,Bookmark,Latest Bookmark,Country
0,Harzzz,2022-07-25 08:22:45,00:15:13,,The Gray Man,,Android DefaultWidevineL3Phone Android Phone,00:27:51,00:27:51,IN (India)
1,Harzzz,2022-07-23 17:29:40,00:26:32,,Formula 1: Drive to Survive: Season 2: Seeing ...,,Chrome PC (Cadmium),00:26:37,00:26:37,IN (India)
2,Harzzz,2022-07-23 16:34:30,00:21:12,,Formula 1: Drive to Survive: Season 2: Raging ...,,Chrome PC (Cadmium),00:36:01,00:36:01,IN (India)
3,Harzzz,2022-07-23 16:33:37,00:00:06,Autoplayed: user action: None;,Coming Soon: Evergreen: Formula 1: Drive to Su...,TEASER_TRAILER,Chrome PC (Cadmium),00:00:06,00:00:06,IN (India)
4,Harzzz,2022-07-23 16:33:30,00:00:01,,Formula 1: Drive to Survive: Season 2: Boiling...,,Chrome PC (Cadmium),00:38:38,00:38:38,IN (India)


## Data Cleaning

In [7]:
# Restrict the analysis to a single profile's rows.
is_my_profile = df_activity['Profile Name'] == 'Kimzzz'
df_my_activity = df_activity.loc[is_my_profile]

In [9]:
# Sanity-check that only the selected profile's rows remain.
df_my_activity.head()

Unnamed: 0,Profile Name,Start Time,Duration,Attributes,Title,Supplemental Video Type,Device Type,Bookmark,Latest Bookmark,Country
4390,Kimzzz,2022-07-25 12:39:25,00:02:00,,Riverdale: Season 6: Chapter One Hundred and S...,,Android DefaultWidevineL3Phone Android Phone,00:01:59,00:01:59,IN (India)
4391,Kimzzz,2022-07-25 12:39:15,00:00:01,,Gotham: Season 1: Everyone Has a Cobblepot (Ep...,,Android DefaultWidevineL3Phone Android Phone,00:12:03,00:12:03,IN (India)
4392,Kimzzz,2022-07-25 09:24:49,00:08:24,,Gotham: Season 1: Everyone Has a Cobblepot (Ep...,,Android DefaultWidevineL3Phone Android Phone,00:12:06,Not latest view,IN (India)
4393,Kimzzz,2022-07-25 09:21:05,00:03:35,,Gotham: Season 1: Red Hood (Episode 17),,Android DefaultWidevineL3Phone Android Phone,00:42:43,00:42:43,IN (India)
4394,Kimzzz,2022-07-25 08:59:37,00:12:52,,Gotham: Season 1: Red Hood (Episode 17),,Android DefaultWidevineL3Phone Android Phone,00:30:04,Not latest view,IN (India)


In [10]:
# Discard columns that the analysis below never reads.
unused_columns = ['Attributes', 'Device Type', 'Bookmark', 'Latest Bookmark', 'Country']
df_my_activity = df_my_activity.drop(columns=unused_columns)

In [11]:
# Renumber rows from 0 and discard the old index inherited from the full export.
df_my_activity = df_my_activity.reset_index(drop=True)

In [12]:
# Count rows per supplemental video type (trailers, hooks, previews, ...).
# Rows with a missing value are not counted here; they are the ones kept below.
df_my_activity['Supplemental Video Type'].value_counts()

HOOK              491
TRAILER           268
PROMOTIONAL       134
TEASER_TRAILER     48
PREVIEW            19
RECAP               7
BUMPER              3
TUTORIAL            1
Name: Supplemental Video Type, dtype: int64

In [13]:
# Keep only rows without a supplemental video type, i.e. drop trailers, hooks,
# previews and similar clips.
# `.copy()` makes the result an independent frame so that the column
# assignments in later cells do not raise SettingWithCopyWarning.
df_my_activity = df_my_activity[df_my_activity['Supplemental Video Type'].isna()].copy()

In [14]:
# Confirm the remaining row count, null counts and dtypes after filtering.
df_my_activity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2014 entries, 0 to 2984
Data columns (total 5 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Profile Name             2014 non-null   object
 1   Start Time               2014 non-null   object
 2   Duration                 2014 non-null   object
 3   Title                    2014 non-null   object
 4   Supplemental Video Type  0 non-null      object
dtypes: object(5)
memory usage: 94.4+ KB


In [15]:
# Convert the 'HH:MM:SS' duration strings to whole seconds.
# Parsing with pd.to_timedelta is vectorised, so it avoids running a Python
# lambda once per row.
df_my_activity['Duration_Seconds'] = (
    pd.to_timedelta(df_my_activity['Duration'])
    .dt.total_seconds()
    .astype('int64')
)

In [16]:
# Rename 'Start Time' to 'Date' (the name every later cell reads) and convert
# the text columns to proper datetime / timedelta dtypes.
# The rename and the conversions live in one cell so they cannot get out of
# sync: previously the column was renamed to 'c' and then looked up again as
# 'Start Time', which raised a KeyError.
# NOTE(review): Netflix exports reportedly record Start Time in UTC — confirm,
# and convert to local time if weekday/day analysis should reflect local days.
df_my_activity = df_my_activity.rename(columns={'Start Time': 'Date'})
df_my_activity['Date'] = pd.to_datetime(df_my_activity['Date'])
df_my_activity['Duration'] = pd.to_timedelta(df_my_activity['Duration'])

In [None]:
# Show the column dtypes to check the result of the conversions in the previous cell.
df_my_activity.dtypes

In [None]:
# Break the timestamp into calendar parts used by the plots below.
view_dates = df_my_activity['Date']
df_my_activity['Year'] = view_dates.dt.year
df_my_activity['Month'] = view_dates.dt.month_name()
df_my_activity['Day'] = view_dates.dt.day
df_my_activity['Day_of_week'] = view_dates.dt.day_name()

In [None]:
# Inspect the newly added Year / Month / Day / Day_of_week columns.
df_my_activity.head()

In [None]:
# Split each title at the first two colons into show name, season and episode.
# Rows with no colon get a missing Season and Episode_name.
# The pieces are stripped because splitting on ":" alone leaves a leading
# space on every season and episode name (e.g. " Season 1").
# NOTE(review): a show name that itself contains a colon (e.g.
# "Formula 1: Drive to Survive: Season 2: ...") is still split too early, so
# its Season column holds part of the show name — needs a smarter parser.
show_details = df_my_activity['Title'].str.split(":", expand=True, n=2)
df_my_activity['Show_name'] = show_details[0].str.strip()
df_my_activity['Season'] = show_details[1].str.strip()
df_my_activity['Episode_name'] = show_details[2].str.strip()

In [None]:
# Inspect the Show_name / Season / Episode_name columns produced by the split.
df_my_activity.head()

In [None]:
# Classify each row: a title with no season part is treated as a movie,
# everything else as a TV show.
# This is a vectorised replacement for the row-wise apply; it gives the same
# labels and also works when the frame is empty.
df_my_activity['Show_type'] = df_my_activity['Season'].isna().map({True: 'Movie', False: 'TV Show'})

In [None]:
# Show a single row to verify the new Show_type column.
df_my_activity.head(1)

## Analysis based on Frequency

In [None]:
# Histogram of viewing sessions by day of the month.
# discrete=True gives every integer day its own bar centred on that day.
# A fixed bins=31 only lines up with the days when the data spans exactly
# 1 through 31.
plt.figure(figsize=(10,5))
plt.title("Day of the month - Monthly viewing distribution")
ax = sns.histplot(df_my_activity['Day'],
                  discrete=True,
                  color="blue")
plt.show()

In [None]:
# Number of viewing sessions per weekday, most frequent first.
weekday_counts = df_my_activity["Day_of_week"].value_counts()

plt.figure(figsize=(10,5))
ax = sns.barplot(x=weekday_counts.index, y=weekday_counts, palette="Blues_d")
plt.title("Day of the week - Weekly viewing distribution")
plt.xlabel("Day of the Week")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Sessions per weekday, split by show type (movie vs. TV show), largest first.
show_freq = (
    df_my_activity
    .groupby('Show_type')['Day_of_week']
    .value_counts()
    .sort_values(ascending=False)
    .reset_index(name='Count')
)

plt.figure(figsize=(10,5))
ax = sns.barplot(data=show_freq,
                 x="Day_of_week",
                 y="Count",
                 hue="Show_type",
                 palette="Blues_d")
plt.title("Movies vs. Shows")
plt.show()

In [None]:
# Ten titles with the most viewing sessions.
# The counts are computed once and reused for both axes.
# The sort before value_counts is kept from the original; presumably it makes
# the order of tied titles deterministic — confirm.
show_counts = df_my_activity['Show_name'].sort_values(ascending=True).value_counts()
top_shows = show_counts[:10]

plt.figure(figsize=(10,5))
ax = sns.barplot(y=top_shows.index, x=top_shows.values)
# This chart shows session counts, not watch time, so the title says so.
plt.title("Top 10 most frequently watched shows")
# Horizontal bars: counts run along x, titles along y.
plt.xlabel("Frequency of Watching")
plt.ylabel("Show/Movie Name")
plt.show()

In [None]:
# Fifteen titles with the fewest viewing sessions.
# The counts are computed once and reused for both axes.
# The sort before value_counts is kept from the original; presumably it makes
# the order of tied titles deterministic — confirm.
show_counts = df_my_activity['Show_name'].sort_values(ascending=True).value_counts()
least_watched = show_counts[-15:]

plt.figure(figsize=(10,5))
ax = sns.barplot(y=least_watched.index, x=least_watched.values)
# This chart shows session counts, not watch time, so the title says so.
plt.title("15 least frequently watched shows")
# Horizontal bars: counts run along x, titles along y.
plt.xlabel("Frequency of Watching")
plt.ylabel("Show/Movie Name")
plt.show()

## Analysis based on Duration

In [None]:
# Mean session length (seconds) for each weekday, shortest first.
avg_seconds_per_day = (
    df_my_activity
    .groupby('Day_of_week')['Duration_Seconds']
    .mean()
    .sort_values()
)

sns.set_style("darkgrid")

# Shared font settings: `font` styles the title, `axis_label_font` both axis labels.
font = {'family': 'serif', 'color': '#004466', 'weight': 'normal', 'size': 15}
axis_label_font = {'size': 13, 'family': 'serif'}

plt.figure(figsize=(10,5))
ax = sns.barplot(x=avg_seconds_per_day.index,
                 y=avg_seconds_per_day.values,
                 palette="Blues_d")
ax.set_ylabel('Viewing Duration (in Seconds)', fontdict=axis_label_font)
ax.set_xlabel('Day of Week', fontdict=axis_label_font)
ax.set_title('Average Viewing Time per Day (in seconds)', fontdict=font)
ax.tick_params(axis='both', labelsize=12)

In [None]:
# Top 10 TV shows ranked by mean session length in seconds.
# Fixes over the pasted original:
#   * curly quotes replaced with plain quotes (they were syntax errors);
#   * columns referenced by their real names ('Show_name', 'Duration_Seconds')
#     instead of the non-existent 'TV Show' / 'Duration (Seconds)';
#   * the two statements that shared one line are separated;
#   * rows are limited to Show_type == 'TV Show' so movies do not appear in a
#     chart titled "TV Shows";
#   * the invalid ci='None' string is dropped; each bar is a single
#     pre-aggregated value, so there is no interval to draw anyway.
tv_sessions = df_my_activity[df_my_activity['Show_type'] == 'TV Show']
most_watched_tv_series = (
    tv_sessions
    .groupby('Show_name')['Duration_Seconds']
    .mean()
    .reset_index()
    .sort_values(by='Duration_Seconds', ascending=False)
)
top_tv_series = most_watched_tv_series.head(10)

font = {'family': 'serif', 'color': '#004466', 'weight': 'normal', 'size': 16}
plt.figure(figsize=(22, 12))
ax = sns.barplot(y=top_tv_series['Show_name'],
                 x=top_tv_series['Duration_Seconds'],
                 orient='h')
ax.set_ylabel('TV Series', fontdict={'size': 13, 'family': 'serif'})
ax.set_xlabel('Watch Time (in seconds)', fontdict={'size': 13, 'family': 'serif'})
ax.set_title('Top 10 TV Shows by Average Duration (seconds)', fontdict=font)
ax.tick_params(axis='both', labelsize=12)
plt.show()