In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the complete Netflix shows dataset from its relative CSV path into df
df = pd.read_csv(filepath_or_buffer='../data/netflix_shows_complete.csv')

In [3]:
# Percentage of null values per column; only columns with at least one null
# are printed, largest share first.
missing_pct = df.isnull().sum().div(len(df)).mul(100)
print(missing_pct.loc[missing_pct.gt(0)].sort_values(ascending=False))

avg_episode_runtime        61.0
created_by                 17.8
us_content_rating           7.6
keywords                    4.2
imdb_id                     1.2
homepage                    0.8
last_air_date               0.4
days_since_last_episode     0.4
first_air_date              0.2
show_age_days               0.2
dtype: float64


In [4]:
# Fill missing text/categorical fields with the sentinel "Unknown".
# The date columns (first_air_date, last_air_date) are intentionally NOT filled:
# they are parsed later with pd.to_datetime(..., format='%Y-%m-%d'), which raises
# on the string 'Unknown' but converts a missing value to NaT.
text_columns = ['created_by', 'us_content_rating', 'keywords', 'imdb_id', 'homepage']
df[text_columns] = df[text_columns].fillna('Unknown')

In [5]:
# avg_episode_runtime is missing in more than half of the rows, so remove the
# whole column; the result starts the cleaned frame df_new.
df_new = df.drop(columns=['avg_episode_runtime'])

In [6]:
# Very few rows lack these age fields, so discard those rows rather than impute
df_new = df_new.dropna(
    axis='index',
    how='any',
    subset=['show_age_days', 'days_since_last_episode'],
)

In [7]:
# Report the row count of df_new after the column drop and row-level dropna
print(f"After cleaning: {len(df_new)} shows remain")

After cleaning: 498 shows remain


In [8]:
# Defining target variable for prediction: 1 when status is exactly 'Canceled', else 0.
# The target is written to df_new, the cleaned frame that carries the engineered
# features, so features and label live in the same table.
df_new['is_canceled'] = (df_new['status'] == 'Canceled').astype(int)
# Also kept on the raw frame so any cell that reads df['is_canceled'] keeps working.
df['is_canceled'] = (df['status'] == 'Canceled').astype(int)

In [9]:
# Weighted rating via a Bayesian average:
#   WR = v / (v + m) * R  +  m / (v + m) * C
# where v = vote_count, R = vote_average, m = prior weight, C = prior mean.
# NOTE(review): m and C are hard-coded; presumably they were derived from this
# dataset (e.g. a vote_count quantile and the mean vote_average) — TODO confirm.
prior_weight = 68.250000
prior_mean = 7.407288
votes = df_new['vote_count']
votes_plus_prior = votes + prior_weight
df_new['weighted_rating'] = (
    ((votes / votes_plus_prior) * df_new['vote_average'])
    + ((prior_weight / votes_plus_prior) * prior_mean)
)

In [10]:
# Inspect the computed weighted ratings
print(df_new.loc[:, 'weighted_rating'])

0      8.596016
1      7.402771
2      7.129230
3      6.817085
4      8.432416
         ...   
495    7.253645
496    7.026082
497    7.185167
498    7.183918
499    7.013802
Name: weighted_rating, Length: 498, dtype: float64


In [11]:
# NOTE(review): this repeats the preceding cell's print of weighted_rating
# verbatim; one of the two cells can be removed.
print(df_new['weighted_rating'])

0      8.596016
1      7.402771
2      7.129230
3      6.817085
4      8.432416
         ...   
495    7.253645
496    7.026082
497    7.185167
498    7.183918
499    7.013802
Name: weighted_rating, Length: 498, dtype: float64


In [12]:
# Categorize ratings
def rating_category(rating):
    """Map a numeric rating to a quality label.

    Parameters
    ----------
    rating : float or None
        Rating value to classify.

    Returns
    -------
    str
        'Excellent' for rating >= 8.0, 'Good' for >= 6.0, 'Average' for >= 4.0,
        'Poor' for anything lower, and 'Unknown' when the rating is missing
        (None or NaN).
    """
    # Every comparison against NaN is False, so without this guard a missing
    # rating would fall through to 'Poor' (and None would raise TypeError).
    if pd.isna(rating):
        return 'Unknown'

    # Lower bounds, highest first; the first bound the rating reaches wins.
    lower_bounds = (
        (8.0, 'Excellent'),
        (6.0, 'Good'),
        (4.0, 'Average'),
    )
    for lower_bound, label in lower_bounds:
        if rating >= lower_bound:
            return label
    return 'Poor'

In [13]:
# Label every show by passing its weighted rating through rating_category
df_new['rating_category'] = df_new['weighted_rating'].map(rating_category)

In [14]:
# Inspect the assigned rating labels
print(df_new.loc[:, 'rating_category'])

0      Excellent
1           Good
2           Good
3           Good
4      Excellent
         ...    
495         Good
496         Good
497         Good
498         Good
499         Good
Name: rating_category, Length: 498, dtype: object


In [15]:
# Show longevity: age in days converted to years (fixed 365-day year)
df_new['years_since_premiere'] = df_new['show_age_days'].div(365)

In [16]:
# Inspect the derived show age in years
print(df_new.loc[:, 'years_since_premiere'])

0      9.526027
1      0.019178
2      0.035616
3      0.016438
4      9.997260
         ...   
495    7.876712
496    7.186301
497    4.328767
498    1.243836
499    3.736986
Name: years_since_premiere, Length: 498, dtype: float64


In [17]:
# Recent vs older shows: 1 when the show premiered less than 5 years ago, else 0
df_new['is_recent'] = df_new['years_since_premiere'].lt(5).astype(int)

In [18]:
# Inspect the recent/older flag
print(df_new.loc[:, 'is_recent'])

0      0
1      1
2      1
3      1
4      0
      ..
495    0
496    0
497    1
498    1
499    1
Name: is_recent, Length: 498, dtype: int64


In [19]:
# High popularity: 1 when popularity is strictly above the median of df_new, else 0
median_popularity = df_new['popularity'].median()
df_new['is_popular'] = df_new['popularity'].gt(median_popularity).astype(int)

In [20]:
# Inspect the popularity flag
print(df_new.loc[:, 'is_popular'])

0      1
1      1
2      1
3      1
4      1
      ..
495    0
496    0
497    0
498    0
499    0
Name: is_popular, Length: 498, dtype: int64


In [21]:
# Number of genres: entries in the comma-separated `genres` value (commas + 1);
# a missing value counts as 0 genres.
df_new['genre_count'] = df_new['genres'].map(
    lambda genres: str(genres).count(',') + 1 if pd.notna(genres) else 0
)

In [22]:
# Inspect the per-show genre counts
print(df_new.loc[:, 'genre_count'])

0      3
1      3
2      3
3      2
4      2
      ..
495    1
496    2
497    2
498    3
499    3
Name: genre_count, Length: 498, dtype: int64


In [23]:
# Inspect first_air_date before it is converted to datetime
print(df_new.loc[:, 'first_air_date'])

0      2016-07-15
1      2026-01-14
2      2026-01-08
3      2026-01-15
4      2016-01-25
          ...    
495    2018-03-09
496    2018-11-16
497    2021-09-24
498    2024-10-24
499    2022-04-28
Name: first_air_date, Length: 498, dtype: object


In [24]:
# Convert dates to datetime objects; both columns use the YYYY-MM-DD layout,
# and parsing is strict, so a value that does not match the format raises.
for date_column in ('first_air_date', 'last_air_date'):
    df_new[date_column] = pd.to_datetime(df_new[date_column], format='%Y-%m-%d')

In [25]:
# Inspect first_air_date after the datetime conversion
print(df_new.loc[:, 'first_air_date'])

0     2016-07-15
1     2026-01-14
2     2026-01-08
3     2026-01-15
4     2016-01-25
         ...    
495   2018-03-09
496   2018-11-16
497   2021-09-24
498   2024-10-24
499   2022-04-28
Name: first_air_date, Length: 498, dtype: datetime64[ns]


In [26]:
# Inspect last_air_date after the datetime conversion
print(df_new.loc[:, 'last_air_date'])

0     2025-12-31
1     2026-01-14
2     2026-01-08
3     2026-01-15
4     2021-09-10
         ...    
495   2022-10-05
496   2019-05-20
497   2023-02-17
498   2024-10-24
499   2022-09-01
Name: last_air_date, Length: 498, dtype: datetime64[ns]


In [27]:
# Premiere year, read from first_air_date through the .dt accessor
# (the column must already hold datetime values)
df_new['premiere_year'] = df_new.loc[:, 'first_air_date'].dt.year

In [28]:
# Inspect the extracted premiere years
print(df_new.loc[:, 'premiere_year'])

0      2016
1      2026
2      2026
3      2026
4      2016
       ... 
495    2018
496    2018
497    2021
498    2024
499    2022
Name: premiere_year, Length: 498, dtype: int32


In [29]:
# Show the column dtypes of `df`.
# NOTE(review): this inspects `df`, not `df_new`; the datetime conversion and
# the engineered columns were applied to df_new and will not appear here —
# confirm which frame was intended.
print(df.dtypes)

id                           int64
name                        object
first_air_date              object
popularity                 float64
vote_average               float64
vote_count                   int64
status                      object
in_production                 bool
num_seasons                  int64
num_episodes                 int64
genres                      object
type                        object
original_language           object
origin_country              object
avg_episode_runtime        float64
show_age_days              float64
days_since_last_episode    float64
keywords                    object
last_air_date               object
us_content_rating           object
imdb_id                     object
created_by                  object
homepage                    object
is_canceled                  int64
dtype: object


In [31]:
# Persist the cleaned, feature-engineered frame. Saving `df` here would write
# the raw frame, which still contains avg_episode_runtime and the incomplete
# rows and has none of the engineered columns.
clean_df = df_new.copy()
if 'is_canceled' not in clean_df.columns:
    # The target is defined on `df`; assignment aligns on the shared row index,
    # so rows dropped during cleaning are simply not carried over.
    clean_df['is_canceled'] = df['is_canceled']
clean_df.to_csv('../data/netflix_shows_clean.csv', index=False)
print("Clean dataset saved!")

Clean dataset saved!
