In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
df = pd.read_csv("netflix.csv")

In [5]:
df.head(3)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...


### <font color='purple'>1. Defining Problem Statement and Analysing basic metrics</font>

1. **Problem Statement**
  - Analyze the netflix dataset to provide data-driven recommendation on the type of content (movies or TV shows)to produce.
  - Explore how Netflix can expand and grow its business in different countries.
2. **Basic metric Analysis**
  - Data 

### <font color='purple'>2. Observations on the shape of data, data types of all the attributes, conversion of categorical attributes to 'category' (If required), missing value detection, statistical summary </font>

In [6]:
print(df.shape)
print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns")

(8807, 12)
Dataset contains 8807 rows and 12 columns


In [7]:
print("Data types of Attribute: ")

print(df.dtypes)

Data types of Attribute: 
show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
dtype: object


In [8]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [9]:
df.isnull().sum()

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

 **Statistical Summery**

In [10]:
df.describe()

Unnamed: 0,release_year
count,8807.0
mean,2014.180198
std,8.819312
min,1925.0
25%,2013.0
50%,2017.0
75%,2019.0
max,2021.0


In [11]:
df.describe(include = "object")

Unnamed: 0,show_id,type,title,director,cast,country,date_added,rating,duration,listed_in,description
count,8807,8807,8807,6173,7982,7976,8797,8803,8804,8807,8807
unique,8807,2,8807,4528,7692,748,1767,17,220,514,8775
top,s1,Movie,Dick Johnson Is Dead,Rajiv Chilaka,David Attenborough,United States,"January 1, 2020",TV-MA,1 Season,"Dramas, International Movies","Paranormal activity at a lush, abandoned prope..."
freq,1,6131,1,19,19,2818,109,3207,1793,362,4


- **Missing Value Treatment**

In [12]:
df_missing =  pd.DataFrame({"Missing_values": df.isnull().sum(), "Percentage": round(df.isnull().sum()/len(df)*100, 2)})

In [13]:
df_missing

Unnamed: 0,Missing_values,Percentage
show_id,0,0.0
type,0,0.0
title,0,0.0
director,2634,29.91
cast,825,9.37
country,831,9.44
date_added,10,0.11
release_year,0,0.0
rating,4,0.05
duration,3,0.03


In [14]:
# fill null values with a specific values:
df['director'].fillna("Unknown director",inplace=True)
df['cast'].fillna("Unknown cast",inplace=True)
df['country'].fillna("Unknown country",inplace=True)
df['date_added'].fillna("January 1,1900",inplace=True)
df['duration'].fillna("Unknown duration",inplace=True)
df['rating'].fillna("Unknown rating",inplace=True)

In [15]:
df.isnull().sum()

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
dtype: int64

- When we converted categorical columns to category, internally stores them as integer codes instead of full strings.
- Saves memory by encoding repeated strings as numbers.
- Faster grouping, sorting, and aggregations.
- Prevents invalid category values

In [16]:
df["type"] = df["type"].astype("category")
df["country"] = df["country"].astype("category")
df["rating"] = df["rating"].astype("category")

In [17]:
df.dtypes

show_id           object
type            category
title             object
director          object
cast              object
country         category
date_added        object
release_year       int64
rating          category
duration          object
listed_in         object
description       object
dtype: object

### <font color='purple'>3. Non-Graphical Analysis: Value counts and unique attributes </font>

In [18]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')

In [19]:
df["type"].value_counts()

Movie      6131
TV Show    2676
Name: type, dtype: int64

In [20]:
df["country"].value_counts().head()

United States      2818
India               972
Unknown country     831
United Kingdom      419
Japan               245
Name: country, dtype: int64

In [21]:
df["rating"].value_counts()

TV-MA             3207
TV-14             2160
TV-PG              863
R                  799
PG-13              490
TV-Y7              334
TV-Y               307
PG                 287
TV-G               220
NR                  80
G                   41
TV-Y7-FV             6
Unknown rating       4
NC-17                3
UR                   3
74 min               1
84 min               1
66 min               1
Name: rating, dtype: int64

In [22]:
df["listed_in"].value_counts()

Dramas, International Movies                          362
Documentaries                                         359
Stand-Up Comedy                                       334
Comedies, Dramas, International Movies                274
Dramas, Independent Movies, International Movies      252
                                                     ... 
Kids' TV, TV Action & Adventure, TV Dramas              1
TV Comedies, TV Dramas, TV Horror                       1
Children & Family Movies, Comedies, LGBTQ Movies        1
Kids' TV, Spanish-Language TV Shows, Teen TV Shows      1
Cult Movies, Dramas, Thrillers                          1
Name: listed_in, Length: 514, dtype: int64

In [23]:
df["release_year"].value_counts()

2018    1147
2017    1032
2019    1030
2020     953
2016     902
        ... 
1959       1
1925       1
1961       1
1947       1
1966       1
Name: release_year, Length: 74, dtype: int64

### <font color='purple'>4. Visual Analysis - Univariate, Bivariate after pre-processing of the data </font>

In [24]:
df["cast_split"] = df["cast"].str.split(", ")
df = df.explode("cast_split")
df["director_split"] = df["director"].str.split(", ")
df = df.explode("director_split")
df["country_split"] = df["country"].str.split(", ")
df = df.explode("country_split")
df['listed_in_split'] = df['listed_in'].str.split(', ')
df = df.explode('listed_in_split')

In [25]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'cast_split', 'director_split', 'country_split', 'listed_in_split'],
      dtype='object')

In [26]:
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,cast_split,director_split,country_split,listed_in_split
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown cast,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Unknown cast,Kirsten Johnson,United States,Documentaries
1,s2,TV Show,Blood & Water,Unknown director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Ama Qamata,Unknown director,South Africa,International TV Shows
1,s2,TV Show,Blood & Water,Unknown director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Ama Qamata,Unknown director,South Africa,TV Dramas
1,s2,TV Show,Blood & Water,Unknown director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Ama Qamata,Unknown director,South Africa,TV Mysteries
1,s2,TV Show,Blood & Water,Unknown director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Khosi Ngema,Unknown director,South Africa,International TV Shows


In [27]:
df.shape

(201991, 16)

In [28]:
small_df=df[['show_id','title','release_year','type']]
small_df.head(5)

Unnamed: 0,show_id,title,release_year,type
0,s1,Dick Johnson Is Dead,2020,Movie
1,s2,Blood & Water,2021,TV Show
1,s2,Blood & Water,2021,TV Show
1,s2,Blood & Water,2021,TV Show
1,s2,Blood & Water,2021,TV Show


In [29]:
small_df.drop_duplicates(inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_df.drop_duplicates(inplace = True)


In [30]:
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description',
       'cast_split', 'director_split', 'country_split', 'listed_in_split'],
      dtype='object')

In [31]:
df["type"].value_counts()

Movie      145843
TV Show     56148
Name: type, dtype: int64

In [32]:
movie_count_by_country = df[df["type"] == "Movie"].groupby("country")["title"].nunique().sort_values(ascending = False)
movie_count_by_country

country
United States                                                2058
India                                                         893
Unknown country                                               440
United Kingdom                                                206
Canada                                                        122
                                                             ... 
Canada, United States, United Kingdom, France, Luxembourg       0
United Kingdom, Australia                                       0
Spain, Germany, Denmark, United States                          0
Chile, Italy                                                    0
Philippines, Singapore, Indonesia                               0
Name: title, Length: 749, dtype: int64

In [33]:
TV_shows_count_country = df[df["type"] == "TV Show"].groupby("country")["title"].nunique().sort_values(ascending = False)
TV_shows_count_country.head(10)

country
United States      760
Unknown country    391
United Kingdom     213
Japan              169
South Korea        158
India               79
Taiwan              68
Canada              59
France              49
Spain               48
Name: title, dtype: int64

In [34]:
df.dtypes

show_id              object
type               category
title                object
director             object
cast                 object
country            category
date_added           object
release_year          int64
rating             category
duration             object
listed_in            object
description          object
cast_split           object
director_split       object
country_split        object
listed_in_split      object
dtype: object

In [35]:
df['date_added'] = df['date_added'].str.replace(',', ', ', regex=False)  # Ensure space after comma
df['date_added'] = df['date_added'].str.replace('  ', ' ', regex=False)  # Remove double spaces
df['date_added'] = pd.to_datetime(df['date_added'], format='%B %d, %Y', errors='coerce')
df['week_added'] = df['date_added'].dt.strftime('%Y-%U')


In [36]:
df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,cast_split,director_split,country_split,listed_in_split,week_added
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,Unknown cast,United States,2021-09-25,2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",Unknown cast,Kirsten Johnson,United States,Documentaries,2021-38
1,s2,TV Show,Blood & Water,Unknown director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Ama Qamata,Unknown director,South Africa,International TV Shows,2021-38
1,s2,TV Show,Blood & Water,Unknown director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Ama Qamata,Unknown director,South Africa,TV Dramas,2021-38
1,s2,TV Show,Blood & Water,Unknown director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Ama Qamata,Unknown director,South Africa,TV Mysteries,2021-38
1,s2,TV Show,Blood & Water,Unknown director,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,2021-09-24,2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",Khosi Ngema,Unknown director,South Africa,International TV Shows,2021-38
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,2019-03-02,2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...,Anita Shabdish,Mozez Singh,India,International Movies,2019-08
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,2019-03-02,2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...,Anita Shabdish,Mozez Singh,India,Music & Musicals,2019-08
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,2019-03-02,2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...,Chittaranjan Tripathy,Mozez Singh,India,Dramas,2019-08
8806,s8807,Movie,Zubaan,Mozez Singh,"Vicky Kaushal, Sarah-Jane Dias, Raaghav Chanan...",India,2019-03-02,2015,TV-14,111 min,"Dramas, International Movies, Music & Musicals",A scrappy but poor boy worms his way into a ty...,Chittaranjan Tripathy,Mozez Singh,India,International Movies,2019-08
