In [2]:
# packages
import pandas as pd

In [31]:
netflix_data = pd.read_csv("dataset/netflix_dataset.csv", index_col=0)
netflix_data

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...
s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


Exploring the data

In [32]:
# find out the data types in your data
netflix_data.dtypes

type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object

In [33]:
# find out the columns of your dataset
netflix_data.columns

Index(['type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [34]:
# describe mean,min,max,25%, 50%, 75% etc
netflix_data.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [42]:
# Finding the number of NULL values in @ column
# finding if you have any missing values
netflix_data.isna().sum()

type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           0
listed_in          0
description        0
dtype: int64

Data cleaning
-----
1. Duration
----

In [44]:
# find the number of non unique values 
netflix_data['director'].nunique()

4528

In [39]:
# Find rows with missing 'duration'
null_rows_duration = netflix_data[netflix_data['duration'].isnull()]

# Define a custom function to swap 'duration' and 'rating'
def swap_duration_rating(row):
    if pd.isna(row['duration']):
        row['duration'], row['rating'] = row['rating'], row['duration']
    return row

# Apply the custom function to swap the values
null_rows_duration = null_rows_duration.apply(swap_duration_rating, axis=1)

# Add the swapped data back to the main DataFrame 'netflix_data'
netflix_data = netflix_data.combine_first(null_rows_duration)

# Now, the values should be swapped
# print(null_rows_duration)

In [43]:
netflix_data.isna().sum()

type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           0
listed_in          0
description        0
dtype: int64

2. Cleaning Rating Column
-----

In [120]:
# find unique values for rating 
list_rating = netflix_data['rating'].unique()
list_rating

array(['PG-13', 'TV-MA', 'TV-14', 'TV-Y', 'TV-Y7', 'R', 'PG', 'TV-PG',
       'TV-G', 'G', 'NC-17', 'NR', 'TV-Y7-FV', 'UR'], dtype=object)

In [50]:
# Count the values in the 
netflix_data['rating'].value_counts()

rating
TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: count, dtype: int64

In [47]:
# Null values in rating
netflix_data.loc[netflix_data['rating'].isnull()]


Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s5990,Movie,13TH: A Conversation with Oprah Winfrey & Ava ...,,"Oprah Winfrey, Ava DuVernay",,"January 26, 2017",2017,,37 min,Movies,Oprah Winfrey sits down with director Ava DuVe...
s6828,TV Show,Gargantia on the Verdurous Planet,,"Kaito Ishikawa, Hisako Kanemoto, Ai Kayano, Ka...",Japan,"December 1, 2016",2013,,1 Season,"Anime Series, International TV Shows","After falling through a wormhole, a space-dwel..."
s7313,TV Show,Little Lunch,,"Flynn Curry, Olivia Deeble, Madison Lu, Oisín ...",Australia,"February 1, 2018",2015,,1 Season,"Kids' TV, TV Comedies","Adopting a child's perspective, this show take..."
s7538,Movie,My Honor Was Loyalty,Alessandro Pepe,"Leone Frisa, Paolo Vaccarino, Francesco Miglio...",Italy,"March 1, 2017",2015,,115 min,Dramas,"Amid the chaos and horror of World War II, a c..."


In [57]:
# Find the row with the title "My Honor Was Loyalty" and set its "rating" to "PG-13"
netflix_data.loc[netflix_data['title'] == 'My Honor Was Loyalty', 'rating'] = 'PG-13'
netflix_data.loc[netflix_data['title'] == 'Little Lunch', 'rating'] = 'TV-MA'
netflix_data.loc[netflix_data['title'] == 'Gargantia on the Verdurous Planet', 'rating'] = 'TV-14'
netflix_data.loc[netflix_data['cast'] == 'Oprah Winfrey, Ava DuVernay', 'rating'] = 'PG'


In [58]:
netflix_data.loc[netflix_data['rating'].isnull()]

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [63]:
# filtering rating with values 74 84 66 min 
netflix_data.loc[netflix_data['rating'].isin(['74 min', '84 min', '66 min'])]

# replacing values in rating 
# netflix_data.loc[netflix_data['title'] == 'Louis C.K.: Live at the Comedy Store', 'rating'] = 'TV-MA'
# netflix_data.loc[netflix_data['title'] == 'Louis C.K.: Hilarious', 'rating'] = 'TV-MA'
# netflix_data.loc[netflix_data['title'] == 'Louis C.K. 2017', 'rating'] = 'TV-MA'

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [65]:
# check if column rating is sorted 
# netflix_data['rating'].unique()

# check the count of the values in the column rating
netflix_data['rating'].value_counts()

rating
TV-MA       3211
TV-14       2161
TV-PG        862
R            799
PG-13        491
TV-Y7        334
TV-Y         307
PG           289
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
Name: count, dtype: int64

3. Cleaning Date Added
----

In [76]:
# netflix_data['date_added'].unique()
# netflix_data['date_added'].value_counts().sort_values(ascending=False)
netflix_data.loc[netflix_data['date_added'].isnull()]

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s6067,TV Show,A Young Doctor's Notebook and Other Stories,,"Daniel Radcliffe, Jon Hamm, Adam Godley, Chris...",United Kingdom,,2013,TV-MA,2 Seasons,"British TV Shows, TV Comedies, TV Dramas","Set during the Russian Revolution, this comic ..."
s6175,TV Show,Anthony Bourdain: Parts Unknown,,Anthony Bourdain,United States,,2018,TV-PG,5 Seasons,Docuseries,This CNN original series has chef Anthony Bour...
s6796,TV Show,Frasier,,"Kelsey Grammer, Jane Leeves, David Hyde Pierce...",United States,,2003,TV-PG,11 Seasons,"Classic & Cult TV, TV Comedies",Frasier Crane is a snooty but lovable Seattle ...
s6807,TV Show,Friends,,"Jennifer Aniston, Courteney Cox, Lisa Kudrow, ...",United States,,2003,TV-14,10 Seasons,"Classic & Cult TV, TV Comedies",This hit sitcom follows the merry misadventure...
s6902,TV Show,Gunslinger Girl,,"Yuuka Nanri, Kanako Mitsuhashi, Eri Sendai, Am...",Japan,,2008,TV-14,2 Seasons,"Anime Series, Crime TV Shows","On the surface, the Social Welfare Agency appe..."
s7197,TV Show,Kikoriki,,Igor Dmitriev,,,2010,TV-Y,2 Seasons,Kids' TV,A wacky rabbit and his gang of animal pals hav...
s7255,TV Show,La Familia P. Luche,,"Eugenio Derbez, Consuelo Duval, Luis Manuel Áv...",United States,,2012,TV-14,3 Seasons,"International TV Shows, Spanish-Language TV Sh...","This irreverent sitcom featues Ludovico, Feder..."
s7407,TV Show,Maron,,"Marc Maron, Judd Hirsch, Josh Brener, Nora Zeh...",United States,,2016,TV-MA,4 Seasons,TV Comedies,"Marc Maron stars as Marc Maron, who interviews..."
s7848,TV Show,Red vs. Blue,,"Burnie Burns, Jason Saldaña, Gustavo Sorola, G...",United States,,2015,NR,13 Seasons,"TV Action & Adventure, TV Comedies, TV Sci-Fi ...","This parody of first-person shooter games, mil..."
s8183,TV Show,The Adventures of Figaro Pho,,"Luke Jurevicius, Craig Behenna, Charlotte Haml...",Australia,,2015,TV-Y7,2 Seasons,"Kids' TV, TV Comedies","Imagine your worst fears, then multiply them: ..."


In [77]:
# Replacing NAN values in 'date_added' to January 1, 2020
netflix_data["date_added"].fillna("January 1, 2020", inplace = True)

In [78]:
netflix_data.loc[netflix_data['date_added'].isnull()]

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


4. Cleaning country
----

In [79]:
netflix_data.isna().sum()

type               0
title              0
director        2634
cast             825
country          831
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

In [82]:
# netflix_data['country'].unique()
netflix_data['country'].value_counts().sort_values(ascending=False)
# netflix_data.loc[netflix_data['country'].isnull()]

country
United States                                 2818
India                                          972
United Kingdom                                 419
Japan                                          245
South Korea                                    199
                                              ... 
Russia, Spain                                    1
Egypt, Austria, United States                    1
France, Netherlands, South Africa, Finland       1
United States, East Germany, West Germany        1
United States, United Kingdom, New Zealand       1
Name: count, Length: 748, dtype: int64

In [93]:
# country col fill NAN with unknown 
netflix_data['country'].fillna("Unknown", inplace=True)

# check for null values
netflix_data.loc[netflix_data['country'].isnull()]

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [None]:
#this code filters data based on the country 'United States' and the output is saved in a csv file 
output_filename = 'us_country_count.csv'

#
us_data = netflix_data[netflix_data['country'].str.contains('United States')]
# count_us = us_data['country'].value_counts()
count_us = us_data['country'].value_counts().reset_index()
count_us.columns = ['Country', 'Count']

count_us.to_csv(output_filename, index=False)

print(count_us)

In [None]:
# Filtering the df(netflix_data) on data containing 'United States'
netflix_data[netflix_data['country'].str.contains('United States')]

5. Cleaning up Cast
----

In [115]:
netflix_data.isna().sum()

type               0
title              0
director        2634
cast             825
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

In [119]:
# checking for null values
netflix_data.loc[netflix_data['cast'].isnull()]
# netflix_data['cast'].sample(10)

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
s1006,Movie,Keymon and Nani in Space Adventure,,,Unknown,"April 20, 2021",2013,TV-Y,76 min,"Children & Family Movies, Comedies","For Rohan and his magical pal, Keymon, a trip ..."
s1012,Movie,Free to Play,,,United States,"April 19, 2021",2014,TV-14,76 min,Documentaries,This documentary follows three professional vi...
s102,Movie,Untold: Breaking Point,"Chapman Way, Maclain Way",,United States,"September 7, 2021",2021,TV-MA,80 min,"Documentaries, Sports Movies",Under pressure to continue a winning tradition...
s103,TV Show,Countdown: Inspiration4 Mission to Space,Jason Hehir,,Unknown,"September 6, 2021",2021,TV-14,1 Season,"Docuseries, Science & Nature TV","From training to launch to landing, this all-a..."
...,...,...,...,...,...,...,...,...,...,...,...
s92,Movie,The Women and the Murderer,"Mona Achache, Patricia Tourancheau",,France,"September 9, 2021",2021,TV-14,92 min,"Documentaries, International Movies",This documentary traces the capture of serial ...
s920,TV Show,The Sons of Sam: A Descent into Darkness,Joshua Zeman,,United States,"May 5, 2021",2021,TV-MA,1 Season,"Crime TV Shows, Docuseries",The Son of Sam case grew into a lifelong obses...
s926,TV Show,Angelina Ballerina,,,Unknown,"May 1, 2021",2010,TV-Y,2 Seasons,"British TV Shows, Kids' TV","Angelina is crazy about ballet dancing, even t..."
s937,TV Show,Miniforce: Super Dino Power,,,Unknown,"May 1, 2021",2020,TV-Y7,1 Season,Kids' TV,Animals with special powers transform into sup...


In [125]:
# filling NULL/NAN with unknown 
netflix_data['cast'].fillna("Unknown", inplace=True)
# check for null values
netflix_data.loc[netflix_data['cast'].isnull()]

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


6. Cleaning Directors
----

In [126]:
netflix_data.isna().sum() 

type               0
title              0
director        2634
cast               0
country            0
date_added         0
release_year       0
rating             0
duration           0
listed_in          0
description        0
dtype: int64

In [128]:
# checking for null values
netflix_data.loc[netflix_data['director'].isnull()]

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
s100,TV Show,On the Verge,,"Julie Delpy, Elisabeth Shue, Sarah Jones, Alex...","France, United States","September 7, 2021",2021,TV-MA,1 Season,"TV Comedies, TV Dramas","Four women — a chef, a single mom, an heiress ..."
s1004,TV Show,Zero,,"Giuseppe Dave Seke, Haroun Fall, Beatrice Gran...",Italy,"April 21, 2021",2021,TV-MA,1 Season,"International TV Shows, TV Comedies, TV Dramas",A shy teen with the power to turn invisible mu...
s1005,TV Show,Izzy's Koala World,,"Izzy Bee, Ali Bee, Tim Bee",Australia,"April 20, 2021",2021,TV-Y,2 Seasons,Kids' TV,Young koala caretaker Izzy Bee and her family ...
s1006,Movie,Keymon and Nani in Space Adventure,,Unknown,Unknown,"April 20, 2021",2013,TV-Y,76 min,"Children & Family Movies, Comedies","For Rohan and his magical pal, Keymon, a trip ..."
s101,TV Show,Tobot Galaxy Detectives,,"Austin Abell, Travis Turner, Cole Howard, Anna...",Unknown,"September 7, 2021",2019,TV-Y7,2 Seasons,Kids' TV,An intergalactic device transforms toy cars in...
...,...,...,...,...,...,...,...,...,...,...,...
s98,TV Show,Kid Cosmic,,"Jack Fisher, Tom Kenny, Amanda C. Miller, Kim ...",United States,"September 7, 2021",2021,TV-Y7,2 Seasons,"Kids' TV, TV Comedies, TV Sci-Fi & Fantasy",A boy's superhero dreams come true when he fin...
s99,TV Show,Octonauts: Above & Beyond,,"Antonio Aakeel, Chipo Chung, Simon Foster, Ter...",United Kingdom,"September 7, 2021",2021,TV-Y,1 Season,"British TV Shows, Kids' TV",The Octonauts expand their exploration beyond ...
s994,TV Show,Shadow and Bone,,"Jessie Mei Li, Archie Renaux, Ben Barnes, Fred...",United States,"April 23, 2021",2021,TV-14,1 Season,"TV Action & Adventure, TV Dramas, TV Sci-Fi & ...",Dark forces conspire against orphan mapmaker A...
s998,TV Show,Life in Color with David Attenborough,,David Attenborough,"Australia, United Kingdom","April 22, 2021",2021,TV-PG,1 Season,"British TV Shows, Docuseries, International TV...","Using innovative technology, this docuseries e..."


In [129]:
# filling NULL/NAN with unknown 
netflix_data['director'].fillna("Unknown", inplace=True)
# check for null values
netflix_data.loc[netflix_data['director'].isnull()]

Unnamed: 0_level_0,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
show_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


Check if data is consistent with no NULL/NAN values
-----

In [130]:
netflix_data.isna().sum() 

type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

2. FILTER ROWS WHERE A PARTIAL STRING IS PRESENT

In [None]:
horror_list = netflix_data['listed_in'].str.contains('horror',case=False, na=False)
# netflix_data[horror_list].sample(10)
netflix_data[horror_list]