In [None]:
# ==============================================================
# 🎬 Netflix Dataset Analysis Project (Final Version)
# Author: Darshil Halvadia
# Type: Major Project - Data Analytics
# ==============================================================

In [None]:
# ===============================================================
# 1️⃣ Import Libraries
# ===============================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every Python warning for the rest of the session. Note that this also
# hides library deprecation notices, so re-enable warnings when debugging.
warnings.filterwarnings('ignore')
# Global plot appearance: matplotlib's seaborn-style white grid theme combined
# with seaborn's 'pastel' colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('pastel')




In [None]:
## 2️⃣ Load Dataset

# Location of the CSV, relative to the notebook's working directory, so the
# notebook is not tied to one user's machine. Edit this single value if the
# file lives somewhere else.
DATA_PATH = "Netflix Dataset.csv"

try:
    df = pd.read_csv(DATA_PATH, encoding='utf-8')
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() asks the kernel
    # to shut down, whereas an exception simply stops "Run All" at this cell
    # and leaves the message visible.
    raise FileNotFoundError(
        f"❌ ERROR: Please make sure '{DATA_PATH}' is in the same folder."
    ) from err

print("✅ Dataset Loaded Successfully!")

# Basic shape / schema summary of the raw frame.
print(f"\nTotal Records: {df.shape[0]}  |  Total Columns: {df.shape[1]}")
print("\nColumn Names:", list(df.columns))
print("\nPreview:\n", df.head(2))


In [None]:
# ---------------------------------------------------------------
# Data Overview
# ---------------------------------------------------------------
# Structural overview of the frame as loaded: column labels, per-column null
# counts, then pandas' own dtype / non-null summary.
column_names = df.columns.tolist()
print("\nColumns in Dataset:\n", column_names)

null_counts = df.isna().sum()
print("\nMissing Values:\n", null_counts)

df.info()


In [None]:
## 3️⃣ Data Cleaning & Preparation
# Normalise column labels to snake_case (trim, lower-case, spaces -> underscores)
# so the rest of the notebook can rely on a single spelling.
df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

# Replace missing values in the descriptive text columns with an explicit
# 'Unknown' placeholder. Columns absent from this dataset are skipped.
for col in ['director', 'cast', 'country', 'rating', 'listed_in']:
    if col in df.columns:
        df[col] = df[col].fillna('Unknown')

# Derive the year each title was added; unparseable dates are coerced to NaT,
# which yields a missing year.
if 'date_added' in df.columns:
    df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
    df['year_added'] = df['date_added'].dt.year
else:
    df['year_added'] = np.nan

# Fall back to the release year wherever the added-year is missing.
if 'release_year' in df.columns:
    df['release_year'] = pd.to_numeric(df['release_year'], errors='coerce')
    # Assign the result back rather than using
    # `df['year_added'].fillna(..., inplace=True)`: that form mutates a
    # temporary Series, is deprecated, and leaves `df` unchanged when pandas
    # Copy-on-Write is active.
    df['year_added'] = df['year_added'].fillna(df['release_year'])

print("\n✅ Data Cleaning Completed!")


In [None]:
## 4️⃣ Movies vs TV Shows
# Pie chart of how many titles fall under each value of the 'type' column.
if 'type' in df.columns:
    type_counts = df['type'].value_counts()

    fig, ax = plt.subplots(figsize=(6, 6))
    type_counts.plot.pie(
        ax=ax,
        autopct='%1.1f%%',
        startangle=140,
        textprops={'fontsize': 12},
    )
    ax.set_title("Distribution of Movies vs TV Shows", fontsize=14, weight='bold')
    ax.set_ylabel('')  # pandas labels the y-axis with the series name; hide it
    plt.show()

    print("\n🧠 Insight:")
    print("About 70% are Movies and 30% are TV Shows — Netflix is more movie-focused but series are increasing.")


In [None]:
## 5️⃣ Content Growth Over the Years
# Bar chart of the number of titles per 'year_added'. Skipped entirely when the
# column is missing or contains no usable year.
if 'year_added' in df.columns and df['year_added'].notna().any():
    # The column is float because missing years are NaN. Plot whole-number
    # years so the tick labels read "2019" rather than "2019.0".
    years_added = df['year_added'].dropna().astype(int)
    year_order = sorted(years_added.unique())

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(x=years_added, order=year_order, ax=ax)
    ax.set_title("Netflix Content Added Over the Years", fontsize=14, weight='bold')
    ax.set_xlabel("Year Added")
    ax.set_ylabel("Number of Titles")
    plt.xticks(rotation=45)
    plt.show()

    print("\n🧠 Insight:")
    print("Content growth accelerated post-2015, peaking between 2017–2020 — Netflix’s global expansion phase.")


In [None]:
## 6️⃣ Top 10 Genres
# Horizontal bar chart of the ten most frequent genres in 'listed_in'.
if 'listed_in' in df.columns:
    # One entry per (title, genre): a title listed under several
    # comma-separated genres is counted once for each of them.
    genres = df['listed_in'].str.split(',').explode().str.strip()
    # Drop the 'Unknown' placeholder written by the cleaning step and any blank
    # fragment produced by a stray comma — neither is a real genre.
    genres = genres[~genres.isin(['', 'Unknown'])]
    top_genres = genres.value_counts().head(10)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=top_genres.values, y=top_genres.index, ax=ax)
    ax.set_title("Top 10 Genres on Netflix", fontsize=14, weight='bold')
    ax.set_xlabel("Number of Titles")
    ax.set_ylabel("Genre")
    plt.show()

    print("\n🧠 Insight:")
    print("Most popular genres: Dramas, International Movies, Comedies, Documentaries, and Action & Adventure.")


In [None]:
## 7️⃣ Top 10 Countries
# Horizontal bar chart of the ten countries appearing on the most titles.
if 'country' in df.columns:
    # One entry per (title, country): a co-production listed under several
    # comma-separated countries is counted once for each of them.
    countries = df['country'].str.split(',').explode().str.strip()
    # Drop the 'Unknown' placeholder written by the cleaning step and any blank
    # fragment produced by a stray comma, so that missing data cannot rank as
    # a "country".
    countries = countries[~countries.isin(['', 'Unknown'])]
    top_countries = countries.value_counts().head(10)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=top_countries.values, y=top_countries.index, ax=ax)
    ax.set_title("Top 10 Countries with Most Titles on Netflix", fontsize=14, weight='bold')
    ax.set_xlabel("Number of Titles")
    ax.set_ylabel("Country")
    plt.show()

    print("\n🧠 Insight:")
    print("USA leads in content count, followed by India, UK, and Japan — indicating global reach.")


In [None]:
## 8️⃣ Ratings Distribution
# Count of titles per rating label, bars ordered from most to least frequent.
if 'rating' in df.columns:
    rating_order = df['rating'].value_counts().index

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=df, x='rating', order=rating_order, ax=ax)
    ax.set_title("Distribution of Ratings on Netflix", fontsize=14, weight='bold')
    ax.set_xlabel("Rating")
    ax.set_ylabel("Count")
    plt.xticks(rotation=45)
    plt.show()

    print("\n🧠 Insight:")
    print("Most shows are rated TV-MA or TV-14 — Netflix mainly targets teen and adult audiences.")


In [None]:
## 9️⃣ Ratings by Content Type
# Count of titles per rating label, split into one bar per value of 'type'.
# Ratings are ordered by their overall frequency.
has_rating_and_type = 'rating' in df.columns and 'type' in df.columns
if has_rating_and_type:
    rating_order = df['rating'].value_counts().index

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(data=df, x='rating', hue='type', order=rating_order, ax=ax)
    ax.set_title("Ratings by Content Type", fontsize=14, weight='bold')
    ax.set_xlabel("Rating")
    ax.set_ylabel("Count")
    plt.xticks(rotation=45)
    ax.legend(title="Type")
    plt.show()

    print("\n🧠 Insight:")
    print("Movies dominate most rating categories, but TV shows are more frequent in TV-14 and TV-MA ratings.")


In [None]:
## 🔟 Movie Duration Analysis
# Histogram (with KDE overlay) of movie running times in minutes.
if 'type' in df.columns and 'duration' in df.columns:
    # Keep only rows whose 'type' is "movie", compared case-insensitively.
    movies = df[df['type'].str.lower() == 'movie'].copy()

    # Strip the " min" suffix and convert to a number. `to_numeric` with
    # errors='coerce' turns anything that is not a plain number (missing or
    # malformed values) into NaN instead of raising, and `astype(str)` keeps
    # this working even if the column was parsed as numeric.
    duration_text = (
        movies['duration']
        .astype(str)
        .str.replace(' min', '', regex=False)
        .str.strip()
    )
    movies['duration_num'] = pd.to_numeric(duration_text, errors='coerce')

    fig, ax = plt.subplots(figsize=(10, 5))
    sns.histplot(movies['duration_num'].dropna(), bins=30, kde=True, ax=ax)
    ax.set_title("Movie Duration Distribution", fontsize=14, weight='bold')
    ax.set_xlabel("Duration (Minutes)")
    ax.set_ylabel("Frequency")
    plt.show()

    print("\n🧠 Insight:")
    print("Average movie duration ≈ 100 minutes. Most movies are between 90–120 minutes long.")


In [None]:
## 1️⃣1️⃣ Country vs Genre Heatmap
# Heatmap of title counts for the five most frequent countries against the five
# most frequent genres.
if 'country' in df.columns and 'listed_in' in df.columns:
    # Values that stand for "no data": the placeholder written by the cleaning
    # step and blank fragments produced by stray commas.
    placeholders = ['', 'Unknown']

    # Rank countries and genres on their individual (exploded) values.
    countries = df['country'].str.split(',').explode().str.strip()
    genres = df['listed_in'].str.split(',').explode().str.strip()
    countries = countries[~countries.isin(placeholders)]
    genres = genres[~genres.isin(placeholders)]

    top5_countries = countries.value_counts().head(5).index
    top5_genres = genres.value_counts().head(5).index

    # Build one row per (title, country, genre) combination. Exploding BOTH
    # columns matters: filtering the raw 'country' string with isin() would
    # silently drop every co-production ("A, B"), even though those titles
    # were counted when ranking the top countries.
    sub_df = (
        pd.DataFrame({
            'country': df['country'].str.split(','),
            'genre': df['listed_in'].str.split(','),
        })
        .explode('country', ignore_index=True)
        .explode('genre', ignore_index=True)
    )
    sub_df['country'] = sub_df['country'].str.strip()
    sub_df['genre'] = sub_df['genre'].str.strip()
    sub_df = sub_df[
        sub_df['country'].isin(top5_countries) & sub_df['genre'].isin(top5_genres)
    ]

    # reindex (rather than plain column selection) guarantees a full 5x5 grid:
    # a combination with no titles becomes 0 instead of raising KeyError, and
    # the integer dtype required by fmt='d' is preserved.
    heatmap_data = (
        pd.crosstab(sub_df['country'], sub_df['genre'])
        .reindex(index=top5_countries, columns=top5_genres, fill_value=0)
        .rename_axis(index='country', columns='genre')
    )

    fig, ax = plt.subplots(figsize=(8, 5))
    sns.heatmap(heatmap_data, annot=True, fmt='d', cmap='YlGnBu', ax=ax)
    ax.set_title("Top Countries vs Top Genres", fontsize=14, weight='bold')
    plt.show()

    print("\n🧠 Insight:")
    print("USA dominates Drama/Comedy; India strong in International & Romantic; UK has balanced genre mix.")


In [None]:
## 1️⃣2️⃣ Final Summary
# Closing recap: a fixed list of headline findings, printed one per line
# between two horizontal rules.
divider = "------------------------------------------------"
summary_lines = (
    "\n📈 FINAL INSIGHTS SUMMARY",
    divider,
    "✅ Movies dominate (~70%), TV Shows growing fast.",
    "✅ Strong content growth during 2017–2020 (global expansion).",
    "✅ Top Genres: Dramas, International Movies, Comedies, Documentaries.",
    "✅ Top Countries: USA, India, UK, Japan.",
    "✅ Audience Ratings: Mostly TV-MA and TV-14 (mature audiences).",
    "✅ Average Movie Duration: ~100 minutes.",
    "✅ Strategy: Focus more on regional & diverse original content to sustain growth.",
    divider,
    "\n🎯 Project Completed Successfully!",
)
for summary_line in summary_lines:
    print(summary_line)
