# Module 1

In [1]:
import numpy as np
import pandas as pd
from plotly import express as px
from plotly import graph_objects as go


In [2]:
#  Load Dataset
raw_df = pd.read_csv('netflix_watching_history_week1.csv')

## EDA

In [3]:
raw_df.shape

(15060, 26)

In [4]:
raw_df.columns

Index(['User_ID', 'Age_Group', 'Gender', 'Show_ID', 'Title', 'Genre',
       'Viewing_Duration', 'Completion_Status', 'Rating', 'Device',
       'Number_of_Views', 'Skip_Scenes', 'Watchlist', 'Re-watched',
       'Viewing_Session_Length', 'Co-viewing', 'Likes', 'Shares', 'Comments',
       'Watch_Time_Of_Day', 'Region', 'Month', 'Day_of_Week', 'Release_Year',
       'Director', 'Cast'],
      dtype='object')

In [5]:
raw_df.isna().sum() * 100 / raw_df.shape[0]

User_ID                    0.0
Age_Group                  0.0
Gender                     0.0
Show_ID                    0.0
Title                      0.0
Genre                      0.0
Viewing_Duration           0.0
Completion_Status          0.0
Rating                     5.0
Device                     0.0
Number_of_Views            0.0
Skip_Scenes                0.0
Watchlist                  0.0
Re-watched                60.0
Viewing_Session_Length     0.0
Co-viewing                 0.0
Likes                      0.0
Shares                     0.0
Comments                  15.0
Watch_Time_Of_Day          0.0
Region                     0.0
Month                      0.0
Day_of_Week                0.0
Release_Year               0.0
Director                   0.0
Cast                       0.0
dtype: float64

In [6]:
raw_df.describe()

Unnamed: 0,User_ID,Show_ID,Viewing_Duration,Rating,Number_of_Views,Viewing_Session_Length,Release_Year
count,15060.0,15060.0,15060.0,14307.0,15060.0,15060.0,15060.0
mean,250.259562,50.745551,39.899867,3.002796,2.998938,69.871713,2011.814276
std,144.455465,28.873128,11.834115,1.409904,1.411675,29.26154,6.728716
min,1.0,1.0,20.0,1.0,1.0,20.0,2001.0
25%,124.0,25.0,30.0,2.0,2.0,44.0,2005.0
50%,249.0,51.0,40.0,3.0,3.0,70.0,2014.0
75%,377.0,76.0,50.0,4.0,4.0,95.0,2018.0
max,500.0,100.0,60.0,5.0,5.0,120.0,2023.0


In [7]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15060 entries, 0 to 15059
Data columns (total 26 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   User_ID                 15060 non-null  int64  
 1   Age_Group               15060 non-null  object 
 2   Gender                  15060 non-null  object 
 3   Show_ID                 15060 non-null  int64  
 4   Title                   15060 non-null  object 
 5   Genre                   15060 non-null  object 
 6   Viewing_Duration        15060 non-null  int64  
 7   Completion_Status       15060 non-null  object 
 8   Rating                  14307 non-null  float64
 9   Device                  15060 non-null  object 
 10  Number_of_Views         15060 non-null  int64  
 11  Skip_Scenes             15060 non-null  object 
 12  Watchlist               15060 non-null  object 
 13  Re-watched              6024 non-null   object 
 14  Viewing_Session_Length  15060 non-null

In [None]:
user_show_counts = raw_df.groupby(['User_ID', 'Show_ID']).size().reset_index(name='Watch_Count')

df = raw_df.merge(user_show_counts, on=['User_ID', 'Show_ID'], how='left')

df['Previous_Max_Duration'] = df.groupby(['User_ID', 'Show_ID'])['Viewing_Duration'].cummax().shift(1)
df['Duration_Decreased'] = df['Viewing_Duration'] < df['Previous_Max_Duration']

df.loc[(df['Watch_Count'] > 1) & df['Re-watched'].isnull(), 'Re-watched'] = 'Probably_Yes'
df.loc[(df['Watch_Count'] == 1) & df['Re-watched'].isnull(), 'Re-watched'] = 'Probably_No'

df.drop(columns=['Watch_Count', 'Previous_Max_Duration', 'Duration_Decreased'], inplace=True)


In [None]:
user_mode_rating = df.groupby(['User_ID', 'Show_ID'])['Rating'].transform(lambda x: x.fillna(x.mode()))

show_mode_rating = df.groupby('Show_ID')['Rating'].transform(lambda x: x.fillna(x.mode()))

df['Rating'] = user_mode_rating.fillna(show_mode_rating)

In [None]:
user_show_comments = df.groupby(['User_ID', 'Show_ID'])['Comments'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))

df['Comments'] = user_show_comments.fillna("No Comments")


  user_show_comments = df.groupby(['User_ID', 'Show_ID'])['Comments'].transform(lambda x: x.fillna(method='ffill').fillna(method='bfill'))


In [11]:
df.isna().sum() * 100 / df.shape[0]

User_ID                   0.0
Age_Group                 0.0
Gender                    0.0
Show_ID                   0.0
Title                     0.0
Genre                     0.0
Viewing_Duration          0.0
Completion_Status         0.0
Rating                    5.0
Device                    0.0
Number_of_Views           0.0
Skip_Scenes               0.0
Watchlist                 0.0
Re-watched                0.0
Viewing_Session_Length    0.0
Co-viewing                0.0
Likes                     0.0
Shares                    0.0
Comments                  0.0
Watch_Time_Of_Day         0.0
Region                    0.0
Month                     0.0
Day_of_Week               0.0
Release_Year              0.0
Director                  0.0
Cast                      0.0
dtype: float64

In [12]:
numeric_columns = ['Viewing_Duration', 'Number_of_Views', 'Viewing_Session_Length', 'Rating']
for col in df.columns:
    if col not in numeric_columns:
        df[col].astype('str')


In [13]:
shows_by_mode_rating = df.groupby("Title")["Rating"].agg(lambda x: x.mode()[0] if not x.mode().empty else None).reset_index()
rating_counts = df.groupby(["Title", "Rating"]).size().reset_index(name="Rating_Count")
shows_by_mode_rating = shows_by_mode_rating.merge(rating_counts, on = ['Title', 'Rating'], how = "left")
shows_by_mode_rating = shows_by_mode_rating.sort_values(by=['Rating', 'Rating_Count'], ascending=False)
top_shows_by_mode_rating = shows_by_mode_rating.head(10)
viewing_session_length = df[df["Title"].isin(top_shows_by_mode_rating["Title"])].groupby("Title")["Viewing_Session_Length"].mean().reset_index()
top_shows_by_mode_rating_and_viewing_session_lengths = top_shows_by_mode_rating.merge(viewing_session_length, on="Title")

fig = go.Figure()

fig.add_trace(go.Bar(
    x=top_shows_by_mode_rating_and_viewing_session_lengths["Title"], 
    y=top_shows_by_mode_rating_and_viewing_session_lengths["Viewing_Session_Length"], 
    name="Viewing Session Length",
    marker_color='royalblue',
    yaxis='y1'
))

fig.add_trace(go.Scatter(
    x=top_shows_by_mode_rating_and_viewing_session_lengths["Title"], 
    y=top_shows_by_mode_rating_and_viewing_session_lengths["Rating"], 
    mode='lines+markers',
    name="Mode Rating",
    line=dict(color='red', width=2),
    yaxis='y2'
))

fig.update_layout(
    title="Top 10 Shows by Mode Rating & Their Viewing Session Length",
    xaxis_title="Show Title",
    yaxis_title="Viewing Session Length (Bars) / Mode Rating (Line)",
    xaxis=dict(tickangle=0),
    
    yaxis=dict(
        title="Viewing Session Length",
        titlefont=dict(color="royalblue"),
        tickfont=dict(color="royalblue"),
        side="left"
    ),
    
    yaxis2=dict(
        title="Mode Rating",
        titlefont=dict(color="red"),
        tickfont=dict(color="red"),
        overlaying="y",
        side="right",
        range=[0,5]
    ),
    legend=dict(x=0.8, y=1.2),
    template="plotly_white"
)

fig.show()

In [14]:
rewatch_counts = df[(df['Re-watched'] == 'Yes') | (df['Re-watched'] == 'Probably_Yes')].groupby("Title").size().reset_index(name="Rewatch_Count")
share_counts = df[df["Shares"] == "Yes"].groupby("Title").size().reset_index(name="Share_Count")
view_count = df.groupby("Title").size().reset_index(name="Total_Views")

show_analysis = shows_by_mode_rating.merge(rewatch_counts, on="Title", how="left").merge(share_counts, on="Title", how="left").merge(view_count, on="Title", how="left")


show_analysis["Rewatch_Percentage"] = (show_analysis["Rewatch_Count"] / show_analysis["Total_Views"]) * 100
show_analysis["Share_Percentage"] = (show_analysis["Share_Count"] / show_analysis["Total_Views"]) * 100


top_shows = show_analysis.sort_values(by=["Rating", "Rating_Count"], ascending=[False, False]).head(10)


fig = go.Figure()


fig.add_trace(go.Bar(
    x=top_shows["Title"], 
    y=top_shows["Rewatch_Percentage"], 
    name="Rewatched Percentage",
    marker_color='royalblue',
    yaxis="y1"
))


fig.add_trace(go.Bar(
    x=top_shows["Title"], 
    y=top_shows["Share_Percentage"], 
    name="Shared Percentage",
    marker_color='green',
    yaxis="y1"
))


fig.add_trace(go.Scatter(
    x=top_shows["Title"], 
    y=top_shows["Rating"], 
    mode='lines+markers',
    name="Mode Rating",
    line=dict(color='red', width=2),
    yaxis="y2"
))


fig.update_layout(
    title="Top 10 Shows by Mode Rating & Rating Count (Rewatched & Shared %)",
    xaxis=dict(title="Show Title", tickangle=0),
    
    yaxis=dict(
        title="Rewatched & Shared Percentage",
        titlefont=dict(color="black"),
        tickfont=dict(color="black"),
        side="left"
    ),
    
    yaxis2=dict(
        title="Mode Rating",
        titlefont=dict(color="red"),
        tickfont=dict(color="red"),
        overlaying="y",
        side="right",
        range=[0, 5]  
    ),
    
    legend=dict(x=0.8, y=1.3),
    template="plotly_white",
    barmode='group'  
)


fig.show()


In [None]:
df["Engagement_Score"] = (df["Likes"].map({"Yes": 1, "No": 0}) + 
                          df["Shares"].map({"Yes": 1, "No": 0}) + 
                          df["Comments"].notna().astype(int) + 
                          df["Re-watched"].map({"Yes": 1, "No": 0}))


fig1 = px.scatter(df, x="Viewing_Duration", y="Engagement_Score", color="Device",
                  title="Identifying Second-Screen Content: Viewing Duration vs Engagement",
                  labels={"Viewing_Duration": "Viewing Duration (mins)", "Engagement_Score": "Engagement Score"},
                  opacity=0.7, hover_data=["Title", "Age_Group", "Watch_Time_Of_Day"])


df["Is_Second_Screen"] = (df["Viewing_Duration"] > df["Viewing_Duration"].median()) & (df["Engagement_Score"] <= 1)
second_screen_by_age = df.groupby("Age_Group")["Is_Second_Screen"].mean().reset_index()
second_screen_by_age["Is_Second_Screen"] *= 100 


fig2 = px.bar(second_screen_by_age, x="Age_Group", y="Is_Second_Screen", 
              title="Second-Screen Content by Age Group",
              labels={"Is_Second_Screen": "Percentage of Second-Screen Content (%)"},
              color="Age_Group", color_discrete_sequence=px.colors.qualitative.Set2)


fig3 = px.density_heatmap(df, x="Device", y="Watch_Time_Of_Day", z="Is_Second_Screen",
                          histfunc="avg", color_continuous_scale="Blues",
                          title="Heatmap of Second-Screen Behavior by Device & Watch Time",
                          labels={"Is_Second_Screen": "Avg Second-Screen Probability", "Device": "Device Type", "Watch_Time_Of_Day": "Time of Day"})


fig1.show()
fig2.show()
fig3.show()
