In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.offline import iplot, plot
from plotly.subplots import make_subplots

# Shared qualitative palette for the categorical charts in this notebook.
# Each entry is a bare "#RRGGBB" string with no surrounding whitespace, so the
# values are well-formed hex colours wherever they are passed on.
colors = ["#8c0404", "#f25ed0", "#000000", "#16A085", "#34495E",
          "#21618C", "#512E5F", "#45B39D", "#AAB7B8", "#20B2AA",
          "#FF69B4", "#00CED1", "#FF7F50", "#7FFF00", "#DA70D6"]

In [46]:

# Load the dataset into `df`; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('netflix1.csv')



In [47]:
# Preview five random rows. The fixed seed keeps the preview identical on
# every Restart & Run All, so the prose around it never goes stale.
df.sample(5, random_state=42)

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
8164,s5413,TV Show,Criminal Minds,Not Given,United States,6/30/2017,2017,TV-14,12 Seasons,"Crime TV Shows, TV Dramas, TV Mysteries"
2242,s2812,Movie,Digs & Discoveries: All Track's Lead to Rome,Joey So,Not Given,3/15/2020,2019,TV-Y,23 min,Children & Family Movies
2331,s2937,Movie,Azali,Kwabena Gyansah,Ghana,2/7/2020,2018,TV-14,89 min,"Dramas, International Movies"
123,s572,TV Show,Generation 56k,Not Given,Pakistan,7/1/2021,2021,TV-MA,1 Season,"International TV Shows, Romantic TV Shows, TV ..."
5253,s7205,Movie,Killers,Robert Luketic,United States,1/1/2021,2010,PG-13,100 min,"Action & Adventure, Comedies, Romantic Movies"


In [48]:
# Report the frame's dimensions (shape is a (rows, columns) tuple).
n_rows, n_cols = df.shape
print(f"Number of column :{n_cols}\nNumber of rows :{n_rows}")

Number of column :10
Number of rows :8790


In [49]:
# Print a structural summary of the frame: index range, column names,
# non-null counts, dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8790 non-null   object
 1   type          8790 non-null   object
 2   title         8790 non-null   object
 3   director      8790 non-null   object
 4   country       8790 non-null   object
 5   date_added    8790 non-null   object
 6   release_year  8790 non-null   int64 
 7   rating        8790 non-null   object
 8   duration      8790 non-null   object
 9   listed_in     8790 non-null   object
dtypes: int64(1), object(9)
memory usage: 686.8+ KB


In [50]:
# Missing-value count per column (isna is the canonical spelling of isnull).
df.isna().sum()

show_id         0
type            0
title           0
director        0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
dtype: int64

In [51]:
# Summary statistics (count, mean, std, min, quartiles, max); for a frame with
# mixed dtypes pandas reports only the numeric columns by default.
df.describe()

Unnamed: 0,release_year
count,8790.0
mean,2014.183163
std,8.825466
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [52]:
# Summary of every non-numeric column: count, number of unique values,
# most frequent value (top) and how often it occurs (freq).
df.describe(exclude=np.number)

Unnamed: 0,show_id,type,title,director,country,date_added,rating,duration,listed_in
count,8790,8790,8790,8790,8790,8790,8790,8790,8790
unique,8790,2,8787,4528,86,1713,14,220,513
top,s1,Movie,9-Feb,Not Given,United States,1/1/2020,TV-MA,1 Season,"Dramas, International Movies"
freq,1,6126,2,2588,3240,110,3205,1791,362


In [53]:
# Per-column data-quality overview.
#   count        total number of rows (same for every column)
#   Nulls        number of missing values
#   nulls%       share of missing values, in percent of the row count
#   cardinality  number of distinct non-null values
null_mask = df.isnull()
pd.DataFrame({
    'count': df.shape[0],
    'Nulls': null_mask.sum(),
    # mean() of the boolean mask is the missing fraction; multiplying the raw
    # count by 100 would not yield a percentage.
    'nulls%': null_mask.mean() * 100,
    'cardinality': df.nunique(),
})

Unnamed: 0,count,Nulls,nulls%,cardinality
show_id,8790,0,0,8790
type,8790,0,0,2
title,8790,0,0,8787
director,8790,0,0,4528
country,8790,0,0,86
date_added,8790,0,0,1713
release_year,8790,0,0,74
rating,8790,0,0,14
duration,8790,0,0,220
listed_in,8790,0,0,513


In [54]:
# `show_id` is unique per row (8790 distinct values for 8790 rows), so it
# carries no analytical signal and is removed.
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: running it a second time no longer raises KeyError.
df = df.drop(columns=['show_id'], errors='ignore')

In [55]:
# Preview five random rows of the current frame. The fixed seed makes the
# displayed rows reproducible across kernel restarts.
df.sample(5, random_state=42)

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration,listed_in
5187,Movie,Jeremy Scott: The People's Designer,Vlad Yudin,United States,4/1/2016,2015,PG-13,109 min,Documentaries
6133,Movie,The Frozen Ground,Scott Walker,United States,8/27/2020,2013,R,106 min,"Dramas, Thrillers"
2130,Movie,My Stupid Boss,Upi Avianto,Indonesia,4/23/2020,2016,TV-G,107 min,"Comedies, International Movies"
8256,TV Show,Love Me or Leave Me,Not Given,Taiwan,11/1/2016,2012,TV-14,1 Season,"International TV Shows, Romantic TV Shows, TV ..."
86,Movie,Naruto the Movie 2: Legend of the Stone of Gelel,Hirotsugu Kawasaki,Japan,9/15/2021,2005,TV-PG,97 min,"Action & Adventure, Anime Features, Internatio..."


In [56]:
# Report the frame's current dimensions (shape is a (rows, columns) tuple).
n_rows, n_cols = df.shape
print(f"Number of column :{n_cols}\nNumber of rows :{n_rows}")

Number of column :9
Number of rows :8790


In [57]:
# Summary of every non-numeric column in the current frame: count, number of
# unique values, most frequent value (top) and its frequency (freq).
df.describe(exclude=np.number)

Unnamed: 0,type,title,director,country,date_added,rating,duration,listed_in
count,8790,8790,8790,8790,8790,8790,8790,8790
unique,2,8787,4528,86,1713,14,220,513
top,Movie,9-Feb,Not Given,United States,1/1/2020,TV-MA,1 Season,"Dramas, International Movies"
freq,6126,2,2588,3240,110,3205,1791,362


In [58]:
# Number of titles per content type, most frequent first.
count_types = df.loc[:, 'type'].value_counts()
count_types

Movie      6126
TV Show    2664
Name: type, dtype: int64

In [59]:
# Bar chart of the per-type counts, one palette colour per type.
type_bar = px.bar(
    count_types,
    title='Compare Between Two Types',
    labels={"index": "count_types", "value": "count"},
    color=count_types.index,
    color_discrete_sequence=colors,
    text_auto=True,
)
iplot(type_bar)

In [60]:
# Pie chart of the share of each content type.
# Slice names come from the index of `count_types`, so every label is paired
# with its own count regardless of how value_counts() happens to order rows.
iplot(px.pie(values=count_types.values,
             names=count_types.index,
             color_discrete_sequence=colors[7:9],
             title="Movies vs TV Shows").update_traces(textinfo='value+percent'))

In [61]:
# Number of titles credited to each `director` value, most frequent first.
directors = df.loc[:, 'director'].value_counts()
directors


Not Given                         2588
Rajiv Chilaka                       20
Alastair Fothergill                 18
Raúl Campos, Jan Suter              18
Suhas Kadav                         16
                                  ... 
Matt D'Avella                        1
Parthiban                            1
Scott McAboy                         1
Raymie Muzquiz, Stu Livingston       1
Mozez Singh                          1
Name: director, Length: 4528, dtype: int64

In [62]:
# Titles whose director is known = all titles minus the 'Not Given' bucket.
# The bucket is looked up by label: integer access on a label-indexed Series
# is positional (and deprecated in recent pandas) and would silently pick the
# wrong row if 'Not Given' were not the most frequent value.
not_given_directors = directors.get('Not Given', 0)
given_directors = directors.sum() - not_given_directors
print(f'given directors = {given_directors}')

given directors = 6202


In [63]:
# Share of titles with a known director versus the 'Not Given' placeholder.
# Values and names are listed in the same order so each count is shown under
# its own label (the 'Not Given' count must not be labelled "Given").
iplot(px.pie(values=[given_directors, directors.get('Not Given', 0)],
             names=['Given Directors', 'Not Given Directors'],
             title='Given Directors Vs Not Given Directors',
             color_discrete_sequence=['#B81D24', '#221F1F']
            ).update_traces(textinfo='value+percent'))


In [64]:
# Ten most frequent named directors.
# The 'Not Given' placeholder is removed by label rather than by skipping
# position 0, so the chart stays correct even if it is not the top entry.
# Explicit column names make the axis labels independent of how pandas names
# the value_counts() result.
top_directors = (
    directors
    .drop('Not Given', errors='ignore')
    .head(10)
    .rename_axis('Directors')
    .reset_index(name='Number of titles')
)
px.bar(top_directors,
       x='Number of titles',
       y='Directors',
       color='Directors',
       color_discrete_sequence=colors,
       text_auto=True,
       title='Top 10 Directors by Number of Titles',
       orientation='h'
       )

In [65]:
# Ten most frequent `country` values with their title counts.
countries = df['country'].value_counts().head(10)
countries


United States     3240
India             1057
United Kingdom     638
Pakistan           421
Not Given          287
Canada             271
Japan              259
South Korea        214
France             213
Spain              182
Name: country, dtype: int64

In [66]:
# Movies and TV Shows per country in wide form: one row per country with a
# 'Movie' and a 'TV Show' count column.
country_type = df.groupby(['country', 'type']).size().unstack(fill_value=0).reset_index()

country_type['Total'] = country_type['Movie'] + country_type['TV Show']

# 'Not Given' is presumably the dataset's placeholder for an unknown country,
# so it is left out of the ranking.
country_type = country_type[country_type['country'] != 'Not Given']

country_type = country_type.sort_values(by='Total', ascending=False)

# Kept under its own name: rebinding the notebook-wide `colors` palette here
# would leave the earlier chart cells with a two-colour list when re-run.
type_colors = {'Movie': '#B81D24', 'TV Show': '#221F1F'}

fig = px.bar(country_type.head(10), x='country', y=['Movie', 'TV Show'],
             labels={'value': 'Count', 'variable': 'Type'},
             title='Top 10 Countries and their Streamed Movies and TV Shows',
             barmode='group',
             color_discrete_map=type_colors)

fig.update_traces(marker=dict(line=dict(width=4)))

fig.show()

In [67]:
# Convert the textual `date_added` column to datetime64 and preview it.
parsed_dates = pd.to_datetime(df["date_added"])
df['date_added'] = parsed_dates
df['date_added'].head(10)

0   2021-09-25
1   2021-09-24
2   2021-09-24
3   2021-09-22
4   2021-09-24
5   2021-09-24
6   2021-09-24
7   2021-05-01
8   2021-09-23
9   2021-05-01
Name: date_added, dtype: datetime64[ns]

In [68]:
# Earliest and latest `date_added` timestamps, one per line.
added_on = df['date_added']
print(added_on.min(), added_on.max(), sep='\n')

2008-01-01 00:00:00
2021-09-25 00:00:00


In [69]:
# Number of titles per release year, most frequent year first.
release_year = df.loc[:, 'release_year'].value_counts()
release_year.iloc[:10]

2018    1146
2017    1030
2019    1030
2020     953
2016     901
2021     592
2015     555
2014     352
2013     286
2012     236
Name: release_year, dtype: int64

In [70]:
# Area chart of titles per release year.
# value_counts() orders by frequency, so the series is re-sorted by year
# first; otherwise consecutive points jump back and forth along the x-axis
# and the area is drawn as a zigzag.
releases_by_year = (
    release_year
    .sort_index()
    .rename_axis('Release year')
    .reset_index(name='Number of titles')
)
iplot(px.area(releases_by_year,
              x='Release year',
              y='Number of titles',
              title='Number of Titles by Release Year'))

In [71]:
# Number of titles added in each calendar year of `date_added`.
year_added = df['date_added'].dt.year
shows_added_per_year = df['type'].groupby(year_added).count()
shows_added_per_year

date_added
2008       2
2009       2
2010       1
2011      13
2012       3
2013      11
2014      24
2015      82
2016     426
2017    1185
2018    1648
2019    2016
2020    1879
2021    1498
Name: type, dtype: int64

In [72]:
# Line chart (with point markers) of titles added per year.
added_line = px.line(
    shows_added_per_year,
    x=shows_added_per_year.index,
    y=shows_added_per_year,
    title='Number Of Shows Added per year',
    line_shape='linear',
    markers=True,
)
iplot(added_line)

In [73]:
# Number of titles per rating, most frequent first.
rating = df.loc[:, 'rating'].value_counts()
rating.iloc[:10]

TV-MA    3205
TV-14    2157
TV-PG     861
R         799
PG-13     490
TV-Y7     333
TV-Y      306
PG        287
TV-G      220
NR         79
Name: rating, dtype: int64

In [74]:
# Horizontal bar chart of the rating frequencies, one colour per rating.
rating_bar = px.bar(
    rating,
    orientation='h',
    color=rating.index,
    text_auto=True,
    height=720,
    title="Shows Rating On Netflix",
    labels={'index': 'Rating', 'value': 'Frequency'},
)
iplot(rating_bar)

In [75]:
# Frequency of each raw `duration` string (minutes and season counts mixed).
duration = df.loc[:, 'duration'].value_counts()
duration.iloc[:10]

1 Season     1791
2 Seasons     421
3 Seasons     198
90 min        152
97 min        146
93 min        146
94 min        146
91 min        144
95 min        137
96 min        130
Name: duration, dtype: int64

In [76]:
# Keep the rows whose duration text mentions 'Season', then count how many
# titles have each season total.
has_season = df['duration'].str.contains('Season')
seasons = df[has_season]
seasons_count = seasons['duration'].value_counts()
seasons_count

1 Season      1791
2 Seasons      421
3 Seasons      198
4 Seasons       94
5 Seasons       64
6 Seasons       33
7 Seasons       23
8 Seasons       17
9 Seasons        9
10 Seasons       6
15 Seasons       2
13 Seasons       2
12 Seasons       2
17 Seasons       1
11 Seasons       1
Name: duration, dtype: int64

In [77]:
# Horizontal bar chart of how many titles have each season total.
seasons_bar = px.bar(
    seasons_count,
    orientation='h',
    color=seasons_count.index,
    text_auto=True,
    height=720,
    title="Season per TV Show",
    labels={'index': 'Seasons', 'value': 'Sum'},
)
iplot(seasons_bar)

In [78]:
# Split the comma-separated `listed_in` field into one column per genre,
# stack those columns into a single 'category' series and discard the padding
# left by titles that list fewer genres than the widest row.
categories = (
    df['listed_in']
    .str.split(', ', expand=True)
    .melt(value_name='category')['category']
    .dropna()
)

# Ten most frequent individual categories.
top_categories = categories.value_counts().head(10)

top_categories

International Movies        2752
Dramas                      2426
Comedies                    1674
International TV Shows      1349
Documentaries                869
Action & Adventure           859
TV Dramas                    762
Independent Movies           756
Children & Family Movies     641
Romantic Movies              616
Name: category, dtype: int64

In [79]:
# Tidy two-column frame (Category, Count) for plotting.
top_categories_df = pd.DataFrame({'Category': top_categories.index, 'Count': top_categories.values})

# Colour by the category name, as the other bar charts in this notebook do.
# Colouring by the integer row index would be treated as a numeric value and
# add a continuous colour scale that only re-encodes the row position.
fig = px.bar(top_categories_df, x='Count', y='Category', orientation='h',
             title='Top 10 Popular Categories for Movies & TV Shows',
             labels={'Count': 'Number of Shows', 'Category': 'Category'},
             color='Category',
             text='Count')

fig.show()