In [64]:
import pandas as pd
import numpy as np
import matplotlib as plt


In [65]:
#Display Settings
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',100)

In [66]:
#load dataset
df=pd.read_csv('netflix_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


# Data Understanding

In [67]:
df.shape


(8807, 12)

In [68]:
df.describe(include='all')

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
count,8807,8807,8807.0,6173,7982,7976,8797,8807.0,8803,8804,8807,8807
unique,8807,2,8804.0,4528,7692,748,1767,,17,220,514,8775
top,s1,Movie,46249.0,Rajiv Chilaka,David Attenborough,United States,"January 1, 2020",,TV-MA,1 Season,"Dramas, International Movies","Paranormal activity at a lush, abandoned prope..."
freq,1,6131,2.0,19,19,2818,109,,3207,1793,362,4
mean,,,,,,,,2014.180198,,,,
std,,,,,,,,8.819312,,,,
min,,,,,,,,1925.0,,,,
25%,,,,,,,,2013.0,,,,
50%,,,,,,,,2017.0,,,,
75%,,,,,,,,2019.0,,,,


In [69]:
df.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [71]:
df.isna().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

# Data Cleaning

In [72]:
df.isna().sum().sort_values(ascending=False)

director        2634
country          831
cast             825
date_added        10
rating             4
duration           3
show_id            0
type               0
title              0
release_year       0
listed_in          0
description        0
dtype: int64

In [73]:
cols_unknown=['director','country','cast']
#taking a list of all columns with null values we can take single column at a time and use fillna method individually also.


In [74]:
df[cols_unknown]=df[cols_unknown].fillna('Unknown').astype(str)

In [75]:
df[['director','country','cast']].isna().sum()
# it confirms all the null values are being replaced with unknown as they are crucial for analysis

director    0
country     0
cast        0
dtype: int64

In [76]:
df['date_added']=pd.to_datetime(df['date_added'],errors='coerce')

In [77]:
# we gonna drop the remmmaing values in date_added which is 98 because its approx 1.1% of total data in date_added and Imputing values would introduce artificial or misleading data.
df=df.dropna(subset='date_added')

In [79]:
df['rating']=df['rating'].fillna(df['rating'].mode()[0])
#we used mode to basically select the first most frequent value and fill the null values in the rating column


In [80]:
df=df.dropna(subset='duration')

In [81]:
df.isna().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

In [83]:
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
Index: 8706 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8706 non-null   object        
 1   type          8706 non-null   object        
 2   title         8706 non-null   object        
 3   director      8706 non-null   object        
 4   cast          8706 non-null   object        
 5   country       8706 non-null   object        
 6   date_added    8706 non-null   datetime64[ns]
 7   release_year  8706 non-null   int64         
 8   rating        8706 non-null   object        
 9   duration      8706 non-null   object        
 10  listed_in     8706 non-null   object        
 11  description   8706 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 884.2+ KB


(8706, 12)