In [None]:
"""Loads Netflix dataset and performs basic cleaning & summaries.
- Drops duplicates and strips column whitespace
- Fills missing text with 'Unknown' and rating with 'Not Rated'
- Normalizes date_added to YYYY-MM-DD
- Saves recent movies and year-with-most-releases subsets
- Computes basic stats and saves summary reports
"""

#make sure to save .csv file in the same directory as this script
# Load dataset
import pandas as pd

df = pd.read_csv("netflix_titles.csv")

print(df.head())


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [2]:
print(df.info()) 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB
None


In [3]:
print(df.describe())

       release_year
count   8807.000000
mean    2014.180198
std        8.819312
min     1925.000000
25%     2013.000000
50%     2017.000000
75%     2019.000000
max     2021.000000


In [4]:
# 1. Explore the dataset
print("Shape of the DataFrame:", df.shape)
print("Columns in the DataFrame:", df.columns)


Shape of the DataFrame: (8807, 12)
Columns in the DataFrame: Index(['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in', 'description'],
      dtype='object')


In [5]:
#2. how many unique countries are represented in the dataset?
unique_countries = df['country'].nunique()
print("Number of unique countries:", unique_countries)

Number of unique countries: 748


In [6]:
#3. average relaease year (igore null values(NaN))
average_release_year = df['release_year'].mean()
print("Average release year:", round(average_release_year,2))

Average release year: 2014.18


In [7]:
#4. how many rows have missing values in the 'release_year' column?
print("Number of rows with missing values in 'release_year':", df['release_year'].isnull().sum())

Number of rows with missing values in 'release_year': 0


In [8]:
#5.Save cleaned subset (only movies from 2010 onwards)
recent_movies = df[df['release_year'] >= 2010]
recent_movies.to_csv("recent_netflix_movies.csv", index=False) 
print("Recent movies saved to 'recent_netflix_movies.csv'",len(recent_movies),"rows")

Recent movies saved to 'recent_netflix_movies.csv' 7472 rows


In [9]:
#Find out which year had the most Netflix releases
release_per_year = df.groupby('release_year').size()
print( release_per_year)


release_year
1925       1
1942       2
1943       3
1944       3
1945       4
        ... 
2017    1032
2018    1147
2019    1030
2020     953
2021     592
Length: 74, dtype: int64


In [10]:
max_year = release_per_year.idxmax()
max_count = release_per_year.max()
print(f"The year with most releases is {max_year} with {max_count} titles.")

The year with most releases is 2018 with 1147 titles.


In [11]:
df_max_year = df[df["release_year"] == max_year]

In [12]:
output_file = "netflix_most_releases.csv"
df_max_year.to_csv(output_file, index=False)   
print(f"Data for the year {max_year} saved to '{output_file}' with {len(df_max_year)} rows.")

Data for the year 2018 saved to 'netflix_most_releases.csv' with 1147 rows.


In [None]:
# Replace blank/whitespace strings and NaN in object (text) columns with 'Unknown'
obj_cols = df.select_dtypes(include=['object']).columns
df[obj_cols] = df[obj_cols].apply(lambda s: s.fillna('').str.strip().replace('', 'Unknown'))
print(df[obj_cols].head())
 

In [13]:
df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce', infer_datetime_format=True).dt.strftime('%Y-%m-%d')

output_file = "netflix_date_added_clean.csv"
df.to_csv(output_file, index=False) 

  df['date_added'] = pd.to_datetime(df['date_added'].str.strip(), errors='coerce', infer_datetime_format=True).dt.strftime('%Y-%m-%d')
