## Importing Libraries

In [14]:
#!pip uninstall plotly
#!pip install plotly==2#.7.0

In [13]:
!pip install plotly.express

Collecting plotly.express
  Downloading plotly_express-0.4.1-py2.py3-none-any.whl (2.9 kB)
Collecting plotly>=4.1.0
  Downloading plotly-5.4.0-py2.py3-none-any.whl (25.3 MB)
[K     |████████████████████████████████| 25.3 MB 1.2 MB/s 
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly, plotly.express
  Attempting uninstall: plotly
    Found existing installation: plotly 2.7.0
    Uninstalling plotly-2.7.0:
      Successfully uninstalled plotly-2.7.0
Successfully installed plotly-5.4.0 plotly.express-0.4.1 tenacity-8.0.1


In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

Let's see the Netflix Data

In [2]:
# Read the Netflix titles CSV from its hard-coded absolute path and preview
# the first five rows.
netflix_data = pd.read_csv(filepath_or_buffer='/content/netflix_titles.csv')
netflix_data.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [3]:
# Summary of the frame: row count, each column's dtype and non-null count,
# and memory usage. A non-null count below the row count marks a column
# that contains missing values.
netflix_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


Finding null values

In [4]:
# Count the missing entries in every column.
netflix_data.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

Unique Values

In [5]:
# Number of distinct values in each column.
netflix_data.nunique()

show_id         8807
type               2
title           8807
director        4528
cast            7692
country          748
date_added      1767
release_year      74
rating            17
duration         220
listed_in        514
description     8775
dtype: int64

# Data Cleaning

In [6]:
# Remove every row that lacks a cast or a director, then every row that still
# has a missing value in any remaining column.
netflix_data = (
    netflix_data
    .dropna(subset=['cast', 'director'], how='any')
    .dropna()
)

In [7]:
# Replace any null left in these columns with the placeholder 'missing'.
# fillna is applied to the frame and the result re-assigned: calling
# fillna(inplace=True) on a column selection works on an intermediate object
# and is not guaranteed to update netflix_data (it never does once pandas
# Copy-on-Write is active).
netflix_data = netflix_data.fillna(
    {'country': 'missing', 'date_added': 'missing', 'rating': 'missing'}
)
# Total number of null cells remaining in the whole frame.
netflix_data.isnull().sum().sum()

0

In [8]:
# Parse date_added into datetimes, then derive the year and the month in
# which each title was added from the parsed column.
netflix_data = netflix_data.assign(
    date_added=lambda frame: pd.to_datetime(frame['date_added']),
    year_added=lambda frame: frame['date_added'].dt.year,
    month_added=lambda frame: frame['date_added'].dt.month,
)


In [9]:
# Split duration into two string columns:
#   season_count - the leading token of entries containing "Season"
#                  (e.g. "2 Seasons" -> "2"), "" for all other rows
#   duration     - the leading token of the remaining entries
#                  (e.g. "90 min" -> "90"), "" for season entries
# The vectorised .str accessor replaces the row-wise apply(axis=1); a row with
# a missing duration now stays missing instead of raising TypeError.
duration_text = netflix_data['duration']
has_seasons = duration_text.str.contains("Season", regex=False, na=False)
leading_token = duration_text.str.split(" ").str[0]
# season_count must be derived before duration is overwritten below.
netflix_data['season_count'] = leading_token.where(has_seasons, "")
netflix_data['duration'] = leading_token.where(~has_seasons, "")

In [10]:
# Rename 'listed_in' to 'genre' and keep only the text before the first comma,
# i.e. the first category listed for each title.
netflix_data = (
    netflix_data
    .rename(columns={"listed_in": "genre"})
    .assign(genre=lambda frame: frame['genre'].str.split(",").str[0])
)

In [11]:
# Preview the first five rows after cleaning, including the derived
# year_added, month_added and season_count columns.
netflix_data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description,year_added,month_added,season_count
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125.0,Dramas,"On a photo shoot in Ghana, an American model s...",2021,9,
8,s9,TV Show,The Great British Baking Show,Andy Devonshire,"Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...",United Kingdom,2021-09-24,2021,TV-14,,British TV Shows,A talented batch of amateur bakers face off in...,2021,9,9.0
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104.0,Comedies,A woman adjusting to life after a loss contend...,2021,9,
12,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic",2021-09-23,2021,TV-MA,127.0,Dramas,After most of her family is murdered in a terr...,2021,9,
24,s25,Movie,Jeans,S. Shankar,"Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...",India,2021-09-21,1998,TV-14,166.0,Comedies,When the father of the man she loves insists t...,2021,9,


# Data Visualization

## Chart 01

In [12]:
# Heatmap
# Pairwise correlation of the numeric columns, shown as an annotated heatmap.
# The numeric columns are selected explicitly because DataFrame.corr() on a
# frame that still holds text columns raises ValueError from pandas 2.0 on
# (older versions silently ignored them); select_dtypes works on both.
correlation_data = netflix_data.select_dtypes(include='number').corr()
fig_heatmap = ff.create_annotated_heatmap(
    z = correlation_data.values,
    x = list(correlation_data.columns),
    y = list(correlation_data.index),
    # cell labels: the coefficients rounded to two decimals
    annotation_text = correlation_data.round(2).values,
    showscale = True
)

# Dark theme; as the last expression of the cell the returned figure is rendered.
fig_heatmap.update_layout(title = 'Correlation of Complete Data', 
                          plot_bgcolor = '#2d3035', paper_bgcolor = '#2d3035',
                          title_font = dict(size = 25, color = '#a5a7ab', family = "Muli, sans-serif"),
                          font = dict(color = '#131414'))

## Chart 02

In [13]:
# Donut chart: share of rows per value of the 'type' column.
fig_donut = px.pie(
    netflix_data,
    names='type',
    hole=0.7,
    height=300,
    width=600,
    title='Most watched on Netflix',
    color_discrete_sequence=['#6b070c', '#080707'],
)

# Percentage and label drawn outside the ring; default hover template.
fig_donut.update_traces(
    textinfo='percent + label',
    textposition='outside',
    rotation=90,
    hovertemplate=None,
)

# Dark theme, no legend; the returned figure is the cell output.
fig_donut.update_layout(
    showlegend=False,
    margin={'t': 60, 'b': 30, 'l': 0, 'r': 0},
    paper_bgcolor='#333',
    plot_bgcolor='#333',
    font={'size': 17, 'color': '#8a8d93'},
    title_font={'size': 45, 'color': '#8a8d93', 'family': "Lato, sans-serif"},
    hoverlabel={'bgcolor': "#444", 'font_size': 13, 'font_family': "Lato, sans-serif"},
)

## Chart 03

In [21]:
# Number of titles per rating, as a two-column frame: 'rating' and 'count'.
# rename_axis/reset_index(name=...) fix the column names explicitly. The
# former reset_index().rename({'index': ..., 'rating': 'count'}) idiom relied
# on pre-2.0 naming of value_counts() and yields two 'count' columns on
# pandas >= 2.0.
netflix_rating = (
    netflix_data['rating']
    .value_counts()
    .rename_axis('rating')
    .reset_index(name='count')
)

# Horizontal bars (rating on y, count on x) labelled with the count.
fig_bar = px.bar(netflix_rating, y='rating', x='count', title='Distribution of Rating',
                 color_discrete_sequence=['#b20710'], text='count')
fig_bar.update_xaxes(visible=False)
# Order the bars by their total so the largest ends up at the top.
fig_bar.update_yaxes(showgrid=False, categoryorder='total ascending', ticksuffix='  ', showline=False)
fig_bar.update_traces(hovertemplate=None, marker=dict(line=dict(width=0)))
# Dark theme; the returned figure is the cell output.
fig_bar.update_layout(margin=dict(t=80, b=20, l=70, r=40),
                      hovermode="y unified", 
                      xaxis_title=' ', yaxis_title=" ",
                      plot_bgcolor='#333', paper_bgcolor='#333',
                      title_font=dict(size=35, color='#8a8d93', family="Lato, sans-serif"),
                      font=dict(color='#8a8d93'),
                      legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5),
                      hoverlabel=dict(bgcolor="black", font_size=13, font_family="Lato, sans-serif"))

## Chart 04

In [15]:
# Separate copy of netflix_data; the Chart 04 cells below read from dff
# rather than from netflix_data itself.
dff = netflix_data.copy()

In [16]:
# Titles per rating, counted separately for TV shows and for movies.
# Resulting columns:
#   df_tv_show: tv_show (rating label), rating (negated count), rating_final (count)
#   df_movie:   movie   (rating label), rating (count)
# rename_axis/reset_index(name=...) fix these names explicitly; the former
# reset_index().rename({'index': ...}) idiom relied on pre-2.0 naming of
# value_counts() and no longer produces the 'tv_show'/'movie' columns (nor a
# numeric 'rating' column) on pandas >= 2.0.
df_tv_show = (
    dff.loc[dff['type'] == 'TV Show', 'rating']
    .value_counts()
    .rename_axis('tv_show')
    .reset_index(name='rating')
)
# Keep the positive count; it is used as the bar label in the plotting cell.
df_tv_show['rating_final'] = df_tv_show['rating']
# making rating column value negative, so the TV-show bars are drawn in the
# opposite direction to the movie bars
df_tv_show['rating'] *= -1
df_movie = (
    dff.loc[dff['type'] == 'Movie', 'rating']
    .value_counts()
    .rename_axis('movie')
    .reset_index(name='rating')
)

In [22]:
# Two side-by-side panels that share the y axis, with no gap between them:
# TV shows on the left, movies on the right.
fig = make_subplots(rows=1, cols=2, shared_yaxes=True, horizontal_spacing=0, specs=[[{}, {}]])

# Left panel: TV shows (x holds the negated counts, text the positive ones).
fig.append_trace(
    go.Bar(
        name='TV Show',
        orientation='h',
        x=df_tv_show.rating,
        y=df_tv_show.tv_show,
        text=df_tv_show.rating_final,
        marker_color='#221f1f',
        showlegend=True,
    ),
    1, 1,
)
# Right panel: movies.
fig.append_trace(
    go.Bar(
        name='Movie',
        orientation='h',
        x=df_movie.rating,
        y=df_movie.movie,
        text=df_movie.rating,
        marker_color='#b20710',
        showlegend=True,
    ),
    1, 2,
)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showline=False, showgrid=False, ticksuffix=' ', categoryorder='total ascending')
fig.update_traces(marker={'line': {'width': 0}}, hovertemplate=None)
fig.update_layout(
    title='Which has the highest rating TV shows or Movies?',
    title_font={'size': 25, 'color': '#8a8d93', 'family': "Lato, sans-serif"},
    font={'color': '#8a8d93'},
    margin={'t': 80, 'b': 0, 'l': 70, 'r': 40},
    hovermode="y unified",
    xaxis_title=' ',
    yaxis_title=" ",
    paper_bgcolor='#333',
    plot_bgcolor='#333',
    legend={'orientation': "h", 'yanchor': "bottom", 'y': 1, 'xanchor': "center", 'x': 0.5},
    hoverlabel={'bgcolor': "black", 'font_size': 13, 'font_family': "Lato, sans-serif"},
)

# Fixed-text notes placed in paper coordinates; ax=0/ay=0 gives the arrow zero length.
fig.add_annotation(
    x=0.81, y=0.6, ax=0, ay=0,
    xref="paper", yref="paper",
    text="<b>97%</b> people prefer Movies over TV Shows on Netflix.<br> Large number of people watch TV-MA rating  <br> Movies which are for mature audience.",
)
# Last expression of the cell: the returned figure is rendered.
fig.add_annotation(
    x=0.2, y=0.2, ax=0, ay=0,
    xref="paper", yref="paper",
    text="<b>3%</b> people prefer TV Shows on Netflix.<br> There is no inappropriate content for<br> ages 17 and under in TV Shows.",
)

## Chart 05

In [23]:
# Number of titles per month_added, as columns 'month' (1-12) and 'count'.
# rename_axis/reset_index(name=...) fix the column names explicitly; the
# former reset_index().rename({'index': ..., 'month_added': 'count'}) idiom
# relied on pre-2.0 naming of value_counts() and breaks on pandas >= 2.0.
netflix_month = (
    netflix_data['month_added']
    .value_counts()
    .rename_axis('month')
    .reset_index(name='count')
)
# Replacing month number to month name for a better visualization
netflix_month['month_final'] = netflix_month['month'].replace({1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'June', 7:'July', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'})
# Preview the four most frequent months (value_counts sorts by count, descending).
netflix_month[:4]

Unnamed: 0,month,count,month_final
0,10,491,Oct
1,12,490,Dec
2,1,489,Jan
3,4,471,Apr


In [24]:
# Funnel chart of the title count per month name, taken from netflix_month.
fig_month = px.funnel(
    netflix_month,
    x='count',
    y='month_final',
    title='Best month for releasing Content',
    color_discrete_sequence=['#b20710'],
    height=350,
    width=600,
)
fig_month.update_xaxes(showline=True, showgrid=False, ticksuffix=' ')
fig_month.update_traces(marker={'line': {'width': 0}}, hovertemplate=None)
# Dark theme; the returned figure is the cell output.
fig_month.update_layout(
    title_font={'size': 25, 'color': '#8a8d93', 'family': "Lato, sans-serif"},
    font={'color': '#8a8d93'},
    margin={'t': 60, 'b': 20, 'l': 70, 'r': 40},
    xaxis_title=' ',
    yaxis_title=" ",
    paper_bgcolor='#333',
    plot_bgcolor='#333',
    hoverlabel={'bgcolor': "black", 'font_size': 13, 'font_family': "Lato, sans-serif"},
)

## Chart 06

In [25]:
# Number of titles per genre, as columns 'genre' and 'count'.
# rename_axis/reset_index(name=...) fix the column names explicitly; the
# former reset_index().rename({'index': 'genre', 'genre': 'count'}) idiom
# relied on pre-2.0 naming of value_counts() and yields two 'count' columns
# on pandas >= 2.0.
df_genre = (
    netflix_data['genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)
# Treemap hierarchy: constant root -> count -> genre.
fig_tree = px.treemap(df_genre, path=[px.Constant("Distribution of Geners"), 'count','genre'])
# Light theme; the returned figure is the cell output.
fig_tree.update_layout(title='Highest watched Geners on Netflix',
                  margin=dict(t=50, b=0, l=70, r=40),
                  plot_bgcolor='#fff', paper_bgcolor='#fff',
                  title_font=dict(size=25, color='#333', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  hoverlabel=dict(bgcolor="#444", font_size=13, font_family="Lato, sans-serif"))

## Chart 07

In [26]:
# Keep only the rows whose type is "Movie" and preview the first three.
d2 = netflix_data.loc[netflix_data["type"].eq("Movie")]
d2.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,genre,description,year_added,month_added,season_count
7,s8,Movie,Sankofa,Haile Gerima,"Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...","United States, Ghana, Burkina Faso, United Kin...",2021-09-24,1993,TV-MA,125,Dramas,"On a photo shoot in Ghana, an American model s...",2021,9,
9,s10,Movie,The Starling,Theodore Melfi,"Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...",United States,2021-09-24,2021,PG-13,104,Comedies,A woman adjusting to life after a loss contend...,2021,9,
12,s13,Movie,Je Suis Karl,Christian Schwochow,"Luna Wedler, Jannis Niewöhner, Milan Peschel, ...","Germany, Czech Republic",2021-09-23,2021,TV-MA,127,Dramas,After most of her family is murdered in a terr...,2021,9,


In [27]:
# Movies per year in which they were added (year_added), with each year's
# share of all movies, sorted chronologically.
col = "year_added"

# rename_axis/reset_index(name=...) fix the column names explicitly; the
# former reset_index().rename({col: 'count', 'index': col}) idiom relied on
# pre-2.0 naming of value_counts() and yields two 'count' columns on
# pandas >= 2.0.
vc2 = d2[col].value_counts().rename_axis(col).reset_index(name="count")
# Percentage of all movies; computed on the whole column at once instead of
# re-summing the column inside a per-row lambda.
vc2['percent'] = 100 * vc2['count'] / vc2['count'].sum()
vc2 = vc2.sort_values(col)
vc2[:3]

Unnamed: 0,year_added,count,percent
13,2008,1,0.019286
11,2009,2,0.038573
12,2010,1,0.019286


In [28]:
# Waterfall Chart
# The bars are built from vc2 (movies per year_added, sorted by year in the
# previous cell) rather than from hand-typed lists, so the chart always
# reflects the data that was actually loaded.
movie_years = vc2["year_added"].astype(int).astype(str).tolist()
movie_counts = vc2["count"].tolist()
# Sign convention of the original literal lists: a year's count is plotted as
# an increase for the first year and whenever it is at least the previous
# year's count, and as a decrease (negative value) when it is lower.
signed_counts = [
    count if position == 0 or count >= movie_counts[position - 1] else -count
    for position, count in enumerate(movie_counts)
]

fig2 = go.Figure(go.Waterfall(
    name = "Movie", orientation = "v", 
    x = movie_years,
    textposition = "auto",
    # bar labels always show the unsigned yearly count
    text = [str(count) for count in movie_counts],
    y = signed_counts,
    connector = {"line":{"color":"#b20710"}},
    increasing = {"marker":{"color":"#b20710"}},
    decreasing = {"marker":{"color":"orange"}},

))


fig2.update_xaxes(showgrid=False)
fig2.update_yaxes(showgrid=False, visible=False)
fig2.update_traces(hovertemplate=None)
# Dark theme; the returned figure is the cell output.
fig2.update_layout(title='Watching Movies over the year', height=350,
                   margin=dict(t=80, b=20, l=50, r=50),
                   hovermode="x unified",
                   xaxis_title=' ', yaxis_title=" ",
                   plot_bgcolor='#333', paper_bgcolor='#333',
                   title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                   font=dict(color='#8a8d93'))

## Chart 08

In [31]:
def _titles_per_value(frame, column):
    """Count the rows of ``frame`` per distinct value of ``column``.

    Returns a DataFrame with the columns ``column`` (the distinct value),
    ``count`` (number of rows holding it) and ``percent`` (share of all
    counted rows, in percent), sorted by ``column`` in ascending order.

    rename_axis/reset_index(name=...) fix the column names explicitly; the
    former reset_index().rename({column: 'count', 'index': column}) idiom
    relied on pre-2.0 naming of value_counts() and yields two 'count'
    columns on pandas >= 2.0.
    """
    counts = frame[column].value_counts().rename_axis(column).reset_index(name="count")
    counts['percent'] = 100 * counts['count'] / counts['count'].sum()
    return counts.sort_values(column)


# Split the catalogue by content type.
d1 = netflix_data[netflix_data["type"] == "TV Show"]
d2 = netflix_data[netflix_data["type"] == "Movie"]

col = "year_added"

# Titles added per year, one frame per content type.
vc1 = _titles_per_value(d1, col)
vc2 = _titles_per_value(d2, col)

# One line per content type: year on x, number of titles added on y.
trace1 = go.Scatter(x=vc1[col], y=vc1["count"], name="TV Shows", marker=dict(color="orange"), )
trace2 = go.Scatter(x=vc2[col], y=vc2["count"], name="Movies", marker=dict(color="#b20710"))
data = [trace1, trace2]
fig_line = go.Figure(data)

fig_line.update_traces(hovertemplate=None)
fig_line.update_xaxes(showgrid=False)
fig_line.update_yaxes(showgrid=False)

# Two-line title: main heading plus a smaller subtitle joined with <br>.
large_title_format = 'Tv Show and Movies impact over the Year'
small_title_format = "<span style='font-size:13px; font-family:Tahoma'>Due to Covid updatation of content is slowed."
fig_line.update_layout(title=large_title_format + "<br>" + small_title_format, height=400,
                  margin=dict(t=130, b=0, l=70, r=40),
                  hovermode="x unified", 
                  xaxis_title=' ', yaxis_title=" ",
                  plot_bgcolor='#333', paper_bgcolor='#333',
                  title_font=dict(size=25, color='#8a8d93', family="Lato, sans-serif"),
                  font=dict(color='#8a8d93'),
                  legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="center", x=0.5))

# NOTE(review): both annotation texts are fixed strings; they are not derived
# from vc1/vc2 and should be re-checked whenever the data changes.
fig_line.add_annotation(dict(x=0.8, y=0.3, ax=0, ay=0,
                    xref = "paper", yref = "paper", 
                    text= "Highest number of <b>Tv Shows</b><br> were released in <b>2020</b><br> followed by 2017."
                  ))
fig_line.add_annotation(dict(x=0.9, y=1, ax=0, ay=0,
                    xref = "paper", yref = "paper",
                    text= "Highest number of <b>Movies</b> were relased<br> in <b>2019</b> followed by 2020"
                  ))
fig_line.show()