# 🎬 IMDb Movie Analytics – EDA Notebook

This notebook replicates a classic analyst workflow: **load → clean → explore → visualise**.

- Works with the Kaggle IMDb Movies dataset (5,000+ rows), **or** the included small `data/imdb_movies_sample.csv` so you can run it immediately.
- Charts are made with **matplotlib** (no seaborn), one chart per figure.
- Outputs are saved to the `visuals/` folder so they can be shown in your GitHub README.

> Tip: To use the full dataset, save your Kaggle file as `data/imdb_movies.csv` — it is tried first, before the sample CSV. If your file lives elsewhere, add its path to the `DATA_PATHS` list below.


In [None]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Show up to 50 columns when a DataFrame is printed
pd.options.display.max_columns = 50

# Candidate dataset locations, tried in order: the full Kaggle CSV first
# (recommended), then the bundled sample as a fallback.
DATA_PATHS = ["data/imdb_movies.csv", "data/imdb_movies_sample.csv"]

# Output folder for charts and tables; created up front if it is missing
VISUALS_DIR = "visuals"
os.makedirs(VISUALS_DIR, exist_ok=True)

# Helper
def to_number(val):
    """Convert a currency-like value to a float.

    Strings have every ``$`` and ``,`` removed and surrounding whitespace
    stripped before conversion, so ``'$120,000,000'`` becomes
    ``120000000.0``. Non-string values are passed to ``float()`` directly.

    Args:
        val: The raw cell value (string, number, ``None``, ...).

    Returns:
        The value as a ``float``, or ``np.nan`` when the value is an empty
        string or cannot be converted.
    """
    if isinstance(val, str):
        val = val.replace("$", "").replace(",", "").strip()
        if not val:
            return np.nan
    try:
        return float(val)
    # Only conversion failures map to NaN; a bare ``except`` would also
    # swallow KeyboardInterrupt / SystemExit.
    except (TypeError, ValueError, OverflowError):
        return np.nan


In [None]:
# Load data
import pathlib

# Take the first candidate in DATA_PATHS that exists on disk (None if none do)
used_path = next((p for p in DATA_PATHS if pathlib.Path(p).exists()), None)
df = None if used_path is None else pd.read_csv(used_path)

# Stop early with a clear message when no dataset could be located
if df is None:
    raise FileNotFoundError("No dataset found. Please add your Kaggle CSV to 'data/imdb_movies.csv'.")

# Report which file was used and preview the first rows
print(f"Loaded: {used_path}")
print(df.head())

In [None]:
# Basic cleaning
# Work on a copy so the raw `df` stays untouched.
df_clean = df.copy()

# Normalise column names: trim surrounding whitespace, spaces -> underscores
df_clean.columns = [c.strip().replace(" ", "_") for c in df_clean.columns]

# Identify probable columns (handles different Kaggle schemas).
# Matching is case-insensitive; the first candidate of each list is used.
possible_budget_cols = [c for c in df_clean.columns if "budget" in c.lower()]
possible_gross_cols = [
    c for c in df_clean.columns
    if "world" in c.lower() and "gross" in c.lower()
]

# Prefer well-known score column names over a loose "rating" substring match,
# so an unrelated column (e.g. a content/age rating) that merely appears
# earlier in the file is not picked ahead of the numeric score.
known_rating_names = ("imdb_rating", "average_rating", "imdbrating")
exact_rating_cols = [c for c in df_clean.columns if c.lower() in known_rating_names]
loose_rating_cols = [
    c for c in df_clean.columns
    if "rating" in c.lower() and c not in exact_rating_cols
]
possible_rating_cols = exact_rating_cols + loose_rating_cols
possible_meta_cols = [c for c in df_clean.columns if "meta" in c.lower()]

budget_col = possible_budget_cols[0] if possible_budget_cols else None
gross_col = possible_gross_cols[0] if possible_gross_cols else None
rating_col = possible_rating_cols[0] if possible_rating_cols else None
metas_col = possible_meta_cols[0] if possible_meta_cols else None

# Convert currency strings to numeric
if budget_col:
    df_clean["Budget_num"] = df_clean[budget_col].apply(to_number)
if gross_col:
    df_clean["Worldwide_num"] = df_clean[gross_col].apply(to_number)

# Parse year if available (first matching candidate wins; bad values -> NaN)
for cand in ["Year", "Release_Year", "year", "release_year"]:
    if cand in df_clean.columns:
        df_clean["Release_Year"] = pd.to_numeric(df_clean[cand], errors="coerce")
        break

# Ratings & Metascore to numeric (unparseable values become NaN)
if rating_col:
    df_clean["Average_Rating_num"] = pd.to_numeric(df_clean[rating_col], errors="coerce")
if metas_col:
    df_clean["Metascore_num"] = pd.to_numeric(df_clean[metas_col], errors="coerce")

# Fill missing values in every derived *_num column with that column's median.
# NOTE: a column with no valid values has a NaN median and so stays all-NaN.
for col in [c for c in df_clean.columns if c.endswith("_num")]:
    med = df_clean[col].median(skipna=True)
    df_clean[col] = df_clean[col].fillna(med)

print("\nNumeric summary (post-clean):\n", df_clean[[c for c in df_clean.columns if c.endswith('_num')]].describe())

In [None]:
# Correlation matrix on numeric analysis columns
# Only the derived columns that were actually created are included.
num_cols = [c for c in ["Budget_num","Worldwide_num","Average_Rating_num","Metascore_num"] if c in df_clean.columns]
corr = df_clean[num_cols].corr()
print("\nCorrelation matrix:\n", corr)

# Save to CSV (optional). Written under VISUALS_DIR, like the charts, so that
# changing the output folder in one place moves every artefact together.
corr.to_csv(f"{VISUALS_DIR}/correlation_matrix.csv", index=True)

In [None]:
# Plot 1: Budget vs Worldwide Gross
# Both derived numeric columns must be present; otherwise report and skip.
if not {"Budget_num", "Worldwide_num"} <= set(df_clean.columns):
    print("Budget or Worldwide columns not found for scatter plot.")
else:
    budget_values = df_clean["Budget_num"]
    gross_values = df_clean["Worldwide_num"]

    plt.figure()
    plt.scatter(budget_values, gross_values)
    plt.title("Budget vs Worldwide Gross")
    plt.xlabel("Budget (USD)")
    plt.ylabel("Worldwide Gross (USD)")
    # Save before show() so the figure is written while it is still current
    plt.savefig(f"{VISUALS_DIR}/budget_vs_revenue.png", bbox_inches="tight")
    plt.show()

In [None]:
# Plot 2: Rating distribution
# Use the first rating-like column that exists, in order of preference.
rating_candidates = ("Average_Rating_num", "IMDB_Rating", "imdbRating")
col = next((name for name in rating_candidates if name in df_clean.columns), None)

# Last resort: derive the numeric column from a raw "Average_Rating" column
if col is None and "Average_Rating" in df_clean.columns:
    df_clean["Average_Rating_num"] = pd.to_numeric(df_clean["Average_Rating"], errors="coerce")
    col = "Average_Rating_num"

if col is None:
    print("No rating column found for histogram.")
else:
    ratings = df_clean[col].dropna()

    plt.figure()
    plt.hist(ratings, bins=20)
    plt.title("Rating Distribution")
    plt.xlabel("IMDb Average Rating")
    plt.ylabel("Count of Movies")
    plt.savefig(f"{VISUALS_DIR}/rating_distribution.png", bbox_inches="tight")
    plt.show()

In [None]:
# Persist the cleaned table (without the index column) so that later runs
# and other tools can start from the same data.
out_path = "data/imdb_movies_cleaned.csv"
df_clean.to_csv(path_or_buf=out_path, index=False)
print(f"Saved cleaned dataset to: {out_path}")