In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Configure pandas display: no cap on the number of columns shown, and a
# 200-character console width so wide frames wrap less.
for option_name, option_value in (('display.max_columns', None), ('display.width', 200)):
    pd.set_option(option_name, option_value)

In [3]:
# Read the Netflix shows CSV (path is relative to the notebook's working
# directory) into the DataFrame used by every cell below.
df = pd.read_csv(filepath_or_buffer='../data/netflix_shows_complete.csv')

In [4]:
# Report the dataset size: the raw (rows, columns) tuple first, then the same
# two numbers in a sentence. One print call, one line per message.
print(
    f"Dataset shape: {df.shape}",
    f"We have {df.shape[0]} shows and {df.shape[1]} features",
    sep="\n",
)

Dataset shape: (500, 23)
We have 500 shows and 23 features


In [5]:
# Print the column labels as a plain Python list.
print(list(df.columns))

['id', 'name', 'first_air_date', 'popularity', 'vote_average', 'vote_count', 'status', 'in_production', 'num_seasons', 'num_episodes', 'genres', 'type', 'original_language', 'origin_country', 'avg_episode_runtime', 'show_age_days', 'days_since_last_episode', 'keywords', 'last_air_date', 'us_content_rating', 'imdb_id', 'created_by', 'homepage']


In [6]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,id,name,first_air_date,popularity,vote_average,vote_count,status,in_production,num_seasons,num_episodes,genres,type,original_language,origin_country,avg_episode_runtime,show_age_days,days_since_last_episode,keywords,last_air_date,us_content_rating,imdb_id,created_by,homepage
0,66732,Stranger Things,2016-07-15,489.9853,8.6,20366,Ended,False,5,42,"Sci-Fi & Fantasy, Mystery, Action & Adventure",Scripted,en,US,,3477.0,21.0,"monster, small town, indiana, usa, experiment,...",2025-12-31,TV-14,tt4574334,"Ross Duffer, Matt Duffer",https://www.netflix.com/title/80057281
1,308482,Taskaree: The Smuggler's Web,2026-01-14,99.6134,7.3,3,Returning Series,True,1,7,"Crime, Mystery, Drama",Scripted,hi,IN,,7.0,7.0,,2026-01-14,TV-MA,tt35335046,Neeraj Pandey,https://www.netflix.com/title/81406373
2,259731,HIS & HERS,2026-01-08,84.3181,7.006,154,Ended,False,1,6,"Drama, Crime, Mystery",Miniseries,en,US,,13.0,13.0,"loss of loved one, detective, based on novel o...",2026-01-08,TV-MA,tt33035373,William Oldroyd,https://www.netflix.com/title/81662954
3,250505,Agatha Christie's Seven Dials,2026-01-15,79.0975,6.28,75,Ended,False,1,3,"Drama, Mystery",Scripted,en,GB,,6.0,6.0,"based on novel or book, clock, 1920s, thriller",2026-01-15,TV-14,tt31974288,Chris Chibnall,https://www.netflix.com/title/81314952
4,63174,Lucifer,2016-01-25,78.0905,8.437,15263,Ended,False,6,93,"Crime, Sci-Fi & Fantasy",Scripted,en,US,53.5,3649.0,1594.0,"based on comic, los angeles, california, devil...",2021-09-10,TV-14,tt4052886,Tom Kapinos,https://www.netflix.com/title/80057918


In [7]:
# Data types
# Per-column dtypes as pandas inferred them when reading the CSV. In the output
# below, first_air_date and last_air_date are `object` (not parsed as
# datetimes), and show_age_days / days_since_last_episode are float64.
df.dtypes

id                           int64
name                        object
first_air_date              object
popularity                 float64
vote_average               float64
vote_count                   int64
status                      object
in_production                 bool
num_seasons                  int64
num_episodes                 int64
genres                      object
type                        object
original_language           object
origin_country              object
avg_episode_runtime        float64
show_age_days              float64
days_since_last_episode    float64
keywords                    object
last_air_date               object
us_content_rating           object
imdb_id                     object
created_by                  object
homepage                    object
dtype: object

In [8]:
# Descriptive statistics for the numeric columns (count, mean, std, min,
# quartiles, max); the quartiles are spelled out rather than left implicit.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,id,popularity,vote_average,vote_count,num_seasons,num_episodes,avg_episode_runtime,show_age_days,days_since_last_episode
count,500.0,500.0,500.0,500.0,500.0,500.0,195.0,499.0,498.0
mean,150615.584,12.228262,7.407288,794.402,2.674,42.44,45.106154,1749.779559,994.333333
std,79825.958017,24.535885,0.845124,1996.612525,3.636781,186.365691,77.57653,1773.519833,907.244424
min,502.0,4.4411,0.0,0.0,1.0,1.0,7.0,-1.0,1.0
25%,81826.5,5.48195,7.0,68.25,1.0,8.0,25.0,531.5,247.25
50%,119003.5,7.29515,7.5,212.0,2.0,16.0,41.0,1419.0,712.5
75%,230019.5,10.97445,7.9645,681.75,3.0,32.0,50.0,2560.0,1548.5
max,310070.0,489.9853,10.0,20366.0,56.0,3161.0,1100.0,20526.0,4191.0


In [9]:
# Count missing values per column, then show only the columns that have any.
missing = df.isna().sum()
missing.loc[missing.gt(0)]

first_air_date               1
avg_episode_runtime        305
show_age_days                1
days_since_last_episode      2
keywords                    21
last_air_date                2
us_content_rating           38
imdb_id                      6
created_by                  89
homepage                     4
dtype: int64

In [10]:
# Number of shows in each production status, most frequent first.
print(df.loc[:, 'status'].value_counts())

status
Ended               294
Returning Series    138
Canceled             67
In Production         1
Name: count, dtype: int64


#### Initial Insights
- 67 shows have been canceled and 138 are returning series; 294 have ended and 1 is in production.
- 13.4% of shows (67 of 500) have been canceled.
- Columns with missing data:
    - first_air_date: 1 missing entry
    - avg_episode_runtime: 305 missing entries
    - show_age_days: 1 missing entry
    - days_since_last_episode: 2 missing entries
    - keywords: 21 missing entries
    - last_air_date: 2 missing entries
    - us_content_rating: 38 missing entries
    - imdb_id: 6 missing entries
    - created_by: 89 missing entries
    - homepage: 4 missing entries
- Minimum popularity score: 4.4411
- Maximum popularity score: 489.9853
