# In [115]:
import pandas as pd

# Load the raw catalogue export into a DataFrame.
df = pd.read_csv("netflix_titles.csv")

# Checking Missing Values
# A bare expression is only echoed when it is the last line of a notebook
# cell, so print the per-column NaN counts to make them visible however this
# code is run.
print(df.isna().sum())



# `duration` is converted to a number by keeping the first run of digits in
# its text form; values without digits (including NaN, which astype(str)
# turns into "nan") become NaN.
# NOTE(review): presumably movies carry minutes and TV shows carry season
# counts, so the numbers are only comparable within one `type` - confirm.
df["duration"] = pd.to_numeric(
    df["duration"].astype(str).str.extract(r"(\d+)")[0], errors="coerce"
)
mean_duration_value = df["duration"].mean()

# Impute a missing duration from titles of the same `type` first, so a gap in
# one content type is not filled with an average pooled across all types.
# The global mean is kept below as a fallback for rows the per-type mean
# cannot cover (e.g. a missing `type`).
type_mean_duration = df.groupby("type")["duration"].transform("mean")
df["duration"] = df["duration"].fillna(type_mean_duration)

# Fill Missing Values
# `date_added` is intentionally left as NaN: it is parsed later with
# pd.to_datetime(errors="coerce"), which would turn an "Unknown" sentinel
# straight back into NaT, and a sentinel string can disturb date-format
# inference.
df = df.fillna({
    "director": "Unknown",
    "cast": "Unknown",
    "country": "Unknown",
    "rating": df["rating"].mode()[0],  # most frequent rating
    "duration": mean_duration_value,
})

# Remove Duplicates
df = df.drop_duplicates()

# Feature Engineering
# Strip surrounding whitespace before parsing: with errors="coerce" any entry
# that fails to parse silently becomes NaT, so a padded date string could be
# lost from the year/month features below. astype(str) keeps this safe for
# missing values ("nan" is parsed as NaT) and for non-string columns.
df["date_added"] = pd.to_datetime(
    df["date_added"].astype(str).str.strip(), errors="coerce"
)
df["year_added"] = df["date_added"].dt.year  # NaN where the date is missing
df["month_added"] = df["date_added"].dt.month_name()


# Each result is bound to a name and printed; as bare expressions they were
# evaluated and thrown away.

# Content Overview
type_counts = df["type"].value_counts()  # How many TV shows vs movies
# Top 10 countries producing content. "Unknown" is the placeholder used for
# missing countries, not a country, so it is left out of the ranking.
top_countries = (
    df.loc[df["country"] != "Unknown", "country"].value_counts().head(10)
)
added_per_year = df["year_added"].value_counts().sort_index()  # Content added per year

print("Titles per type:", type_counts, sep="\n")
print("Top 10 countries:", top_countries, sep="\n")
print("Titles added per year:", added_per_year, sep="\n")

# Rating Analysis
# value_counts() orders ratings by frequency inside each type; head(10) must
# then be taken per type, otherwise only the first type's rows are returned.
top_ratings_by_type = (
    df.groupby("type")["rating"].value_counts().groupby(level=0).head(10)
)
print("Most common ratings per type:", top_ratings_by_type, sep="\n")

# Duration Insights
movies = df[df["type"] == "Movie"]
avg_movie_duration = movies["duration"].mean()  # Avg movie duration
longest_movies = (
    movies.sort_values("duration", ascending=False)[["title", "duration"]].head(10)
)  # Top 10 longest movies
print("Average movie duration:", avg_movie_duration)
print("Top 10 longest movies:", longest_movies, sep="\n")

# Genre Analysis
# `listed_in` holds several ", "-separated genres per title; explode gives one
# row per genre so each is counted individually.
top_genres = df["listed_in"].str.split(", ").explode().value_counts().head(15)
print("Most common genres:", top_genres, sep="\n")

# Director and actor analysis
# Missing directors/cast are stored as "Unknown" (capital U), so the filter
# must use that exact spelling to exclude the placeholder.
top_directors = (
    df.loc[df["director"] != "Unknown", "director"].value_counts().head(10)
)  # Top 10 directors with most content

# Count individual actors rather than whole cast strings.
# NOTE(review): assumes `cast` is a comma-separated list of names, like
# `listed_in` - confirm. If it holds a single name the split is a no-op.
cast_members = (
    df.loc[df["cast"] != "Unknown", "cast"].str.split(",").explode().str.strip()
)
top_actors = cast_members.value_counts().head(10)  # Top 10 actors

print("Top 10 directors:", top_directors, sep="\n")
print("Top 10 actors:", top_actors, sep="\n")

# Write the cleaned frame to disk without the positional index column.
cleaned_csv_path = "netflix_titles_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)
