# YouTube Success ML: Modeling Readiness

This notebook validates model-readiness assumptions before retraining.

## Objectives
- Verify feature/target contract integrity
- Validate train/test split behavior
- Review categorical cardinality
- Inspect target distributions
- Confirm the processed CSV export is available

## 1) Environment Setup

In [None]:
from pathlib import Path
import sys

import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split

# Resolve the project root: start from the working directory and step up one
# level when only the parent directory contains `src/` (i.e. the kernel was
# started one level below the project root).
ROOT = Path.cwd().resolve()
if not (ROOT / "src").exists() and (ROOT.parent / "src").exists():
    ROOT = ROOT.parent

# Put `<ROOT>/src` at the front of the import path exactly once. Dropping any
# existing entry first keeps this cell idempotent: re-running it no longer
# stacks duplicate entries onto sys.path.
SRC_DIR = str(ROOT / "src")
sys.path[:] = [entry for entry in sys.path if entry != SRC_DIR]
sys.path.insert(0, SRC_DIR)

from youtube_success_ml.config import DEFAULT_DATA_PATH
from youtube_success_ml.data.loader import load_dataset
from youtube_success_ml.models.supervised import FEATURE_COLUMNS, TARGET_COLUMNS

## 2) Load Dataset + Contract Check

In [None]:
# Load the dataset from the project's default path, then report its size
# alongside the feature/target column contract imported from the model module.
df = load_dataset(DEFAULT_DATA_PATH)
n_rows, n_cols = len(df), len(df.columns)

print(f"rows: {n_rows}")
print(f"columns: {n_cols}")
print("feature columns:", FEATURE_COLUMNS)
print("target columns :", TARGET_COLUMNS)

In [None]:
# Contract check: every declared feature column and every target column
# (the values of TARGET_COLUMNS) must be present in the loaded frame.
missing_feature_cols = [c for c in FEATURE_COLUMNS if c not in df.columns]
missing_target_cols = [c for c in TARGET_COLUMNS.values() if c not in df.columns]

print("missing features:", missing_feature_cols)
print("missing targets :", missing_target_cols)

# Fail fast with an explicit message: the cells below index `df` by these
# columns and would otherwise stop on a less descriptive pandas KeyError.
assert not missing_feature_cols and not missing_target_cols, (
    "dataset violates the feature/target contract; "
    f"missing features={missing_feature_cols}, missing targets={missing_target_cols}"
)

In [None]:
# Preview the first ten rows restricted to the contract columns
# (features first, then the target columns).
contract_columns = FEATURE_COLUMNS + list(TARGET_COLUMNS.values())
contract_snapshot = df[contract_columns].head(10)
contract_snapshot

## 3) Train/Test Split Sanity

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible across runs.
X, y = df[FEATURE_COLUMNS], df["growth_target"]
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"train size: {len(X_train)}")
print(f"test size : {len(X_test)}")

## 4) Categorical Coverage

In [None]:
# Count the distinct values in each categorical column, print them, and then
# show the same counts as a one-row table for rich display.
categorical_columns = ["category", "country"]
category_levels = df["category"].nunique()
country_levels = df["country"].nunique()

print("category cardinality:", category_levels)
print("country cardinality :", country_levels)

df[categorical_columns].agg(["nunique"])

## 5) Target Distribution Snapshot

In [None]:
# Reshape the three columns of interest to long form (one row per
# target/value pair) so a single box plot can show them side by side.
snapshot_columns = ["subscribers", "highest_yearly_earnings", "growth_target"]
target_snapshot = df[snapshot_columns].melt(var_name="target", value_name="value")

target_fig = px.box(
    target_snapshot,
    x="target",
    y="value",
    points=False,
    title="Target Distribution Snapshot",
)
target_fig

## 6) Processed CSV Availability Check

In [None]:
# Check whether the processed CSV is present under <ROOT>/data; when it is,
# load it and report its shape, otherwise point to the export script.
processed_csv = ROOT / "data" / "global_youtube_statistics_processed.csv"
if not processed_csv.exists():
    print("processed csv not found; run analysis/scripts/export_processed_dataset.py")
else:
    check_df = pd.read_csv(processed_csv)
    print("processed csv exists:", processed_csv)
    print("processed csv shape :", check_df.shape)