In [95]:
import pandas as pd
import numpy as np

## Preprocessing

In [96]:
data = pd.read_csv('src/netflix_titles.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [97]:
missing_val = data.isna().sum()
missing_val

show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64

In [98]:
fill_missing_val = ['director', 'cast', 'country', 'duration']
for col in fill_missing_val:
    data[col] = data[col].fillna('Unknown')
    
# convert the format add the feature 'year_added' for the third visualization graph
data['date_added'] = pd.to_datetime(data['date_added'], errors='coerce')
data['year_added'] = data['date_added'].dt.year


data = data.dropna(subset=['date_added'])
data = data.dropna(subset=['year_added'])
data['rating'] = data['rating'].fillna(data['rating'].mode()[0])

print(data.isna().sum())

show_id         0
type            0
title           0
director        0
cast            0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
description     0
year_added      0
dtype: int64


## First graph

In [99]:
import plotly.graph_objects as go

type_counts = data['type'].value_counts()

fig = go.Figure()
fig.add_trace(go.Bar(
    x=type_counts.index, 
    y=type_counts.values,
    width=[0.3, 0.3] 
    ))
fig.update_layout(
    title = "Distribution of Type (Movie vs TV Show)",
    xaxis_title="Type",
    yaxis_title="Count",
)
fig.show()


## Second graph

In [100]:
movies = data[data['type'] == 'Movie']
tv_shows = data[data['type'] == 'TV Show']


In [101]:
# count country (get 10 values)
movies_country_counts = movies['country'].value_counts().head(10).drop('Unknown', errors='ignore')
tv_shows_country_counts = tv_shows['country'].value_counts().head(10).drop('Unknown', errors='ignore')
all_country_counts = data['country'].value_counts().head(10).drop('Unknown', errors='ignore')

In [102]:
fig2 = go.Figure()


# All
fig2.add_trace(go.Bar(
    x=all_country_counts.values,
    y=all_country_counts.index,
    orientation='h',
    name='All',
    visible=True
))

# Move
fig2.add_trace(go.Bar(
    x=movies_country_counts.values,
    y=movies_country_counts.index,
    orientation='h',
    name='Movies',
    visible=False
))

# tv shows
fig2.add_trace(go.Bar(
    x=tv_shows_country_counts.values,
    y=tv_shows_country_counts.index,
    orientation='h',
    name='TV Shows',
    visible=False
))

#Dropdown
fig2.update_layout(
    updatemenus=[
        # active = 0 -> default with the first option (All)
        dict(active=0,buttons=list([
        dict(label="All",method="update", args=[{"visible": [True, False, False]}, {"title": "Top 10 Countries - All Titles"}]),
        dict(label="Movies", method="update", args=[{"visible": [False, True, False]}, {"title": "Top 10 Countries - Movies Only"}]),
        dict(label="TV Shows", method="update", args=[{"visible": [False, False, True]}, {"title": "Top 10 Countries - TV Shows Only"}])
            ]),
        direction="down",
        showactive=True,
        )
    ],
    title="Top 10 Countries - All Titles",
    xaxis_title="Number of Titles",
    yaxis_title="Country",
    height=600
)

fig2.show()

From the bar chart, we observe that the United States leads in the production of Netflix content, followed by India and the United Kingdom. This indicates a strong dominance of English-speaking countries on the platform, with significant representation from India as well.

### Third visualizaiton

In [105]:
movies_per_year = movies['year_added'].value_counts().sort_index()
tv_shows_per_year = tv_shows['year_added'].value_counts().sort_index()
movies_per_year

year_added
2008.0       1
2009.0       2
2010.0       1
2011.0      13
2012.0       3
2013.0       6
2014.0      19
2015.0      56
2016.0     253
2017.0     839
2018.0    1237
2019.0    1424
2020.0    1284
2021.0     993
Name: count, dtype: int64

In [110]:
fig3 = go.Figure()

# Movies
fig3.add_trace(go.Scatter(x=movies_per_year.index, y=movies_per_year.values, mode='lines + markers', name='Movies'))
# TV shows
fig3.add_trace(go.Scatter(x=tv_shows_per_year.index, y=tv_shows_per_year.values, mode='lines + markers', name='TV shows'))

fig3.update_layout(
    title='Number of Titles Added to Netflix Over Years',
    xaxis_title='Year',
    yaxis_title='Number of Titles',
    updatemenus=[
        dict(
            direction='down',
            x=1,
            y=1.15,
            showactive=True,
            buttons=list([
                dict(label="All",
                     method="update",
                     args=[{"visible": [True, True]},
                           {"title": "Number of Titles Added to Netflix Over Years"}]),
                dict(label="Movies",
                     method="update",
                     args=[{"visible": [True, False]},
                           {"title": "Number of Movies Added to Netflix Over Years"}]),
                dict(label="TV Shows",
                     method="update",
                     args=[{"visible": [False, True]},
                           {"title": "Number of TV Shows Added to Netflix Over Years"}]),

            ]),
        )
    ]
)


The line chart shows that Netflix significantly expanded its content library after 2015, peaking around 2019-2020. This rapid growth reflects Netflix's aggressive global expansion and investment in original content.


From the analysis, it is evident that Netflix's content is predominantly produced by the United States, with India and the UK following closely. Furthermore, Netflix has seen a rapid increase in the number of shows and movies added each year, especially during 2018–2020. This reflects the platform's strategy of scaling globally and diversifying its library.
