# Feature Engineering Audit

This notebook validates engineered fields (`age`, `growth_target`) and inspects feature coverage for model inputs.

## 1) Setup

In [None]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Resolve the project root: start from the current working directory and step
# up one level when the working directory has no src/ folder but its parent does.
ROOT = Path.cwd().resolve()
if not (ROOT / "src").exists() and (ROOT.parent / "src").exists():
    ROOT = ROOT.parent

# Put src/ at the front of the import path so the project package imported
# below resolves. Any existing entry is removed first, so re-running this cell
# keeps exactly one entry (still at the front) instead of accumulating
# duplicates in sys.path.
_src_dir = str(ROOT / "src")
sys.path[:] = [entry for entry in sys.path if entry != _src_dir]
sys.path.insert(0, _src_dir)

from youtube_success_ml.config import DEFAULT_DATA_PATH
from youtube_success_ml.data.loader import load_raw_dataset, load_dataset

# Raise pandas' display limits for this session: show up to 160 columns and
# use a display width of 200 characters.
for _option_name, _option_value in (
    ("display.max_columns", 160),
    ("display.width", 200),
):
    pd.set_option(_option_name, _option_value)


## 2) Load Processed Dataset

In [None]:
# Load the dataset through the project loader and verify that the columns this
# audit depends on are present.
df = load_dataset(DEFAULT_DATA_PATH)
feature_cols = ["uploads", "category", "country", "age"]
engineered_cols = ["age", "growth_target"]

# Collect the names of absent columns so a failure says exactly what is missing.
missing_feature_cols = [c for c in feature_cols if c not in df.columns]
missing_engineered_cols = [c for c in engineered_cols if c not in df.columns]

print("rows:", len(df))
print("feature cols present:", not missing_feature_cols)
print("engineered cols present:", not missing_engineered_cols)

# Fail here, with the missing names, rather than letting a later cell raise a
# bare KeyError when it indexes one of these columns.
assert not (missing_feature_cols or missing_engineered_cols), (
    f"dataset is missing required columns: "
    f"features={missing_feature_cols}, engineered={missing_engineered_cols}"
)

## 3) Engineered Field Diagnostics

In [None]:
# Min / max / median for each engineered field. The table below maps every
# (column, statistic) pair to the Python type its result is cast to: age
# extremes become int, every other statistic becomes float.
_diag_casts = {
    "age": {"min": int, "max": int, "median": float},
    "growth_target": {"min": float, "max": float, "median": float},
}

# Keys are "<column>_<statistic>", produced in the order the table lists them.
diag = {
    f"{column}_{statistic}": cast(getattr(df[column], statistic)())
    for column, statistics in _diag_casts.items()
    for statistic, cast in statistics.items()
}

# Show the diagnostics as a single "value" column indexed by diagnostic name.
pd.DataFrame([diag]).T.rename(columns={0: "value"})

## 4) Feature Cardinality and Coverage

In [None]:
# One summary row per categorical feature: number of distinct values, the most
# frequent value, and the percentage of rows that the most frequent value covers.
cardinality = pd.DataFrame(
    [
        {
            "feature": name,
            "nunique": df[name].nunique(),
            "top_value": df[name].mode().iloc[0],
            "top_share_pct": float(
                df[name].value_counts(normalize=True).iloc[0] * 100.0
            ),
        }
        for name in ("category", "country")
    ]
)
cardinality

## 5) Relationship View: Uploads vs Growth

In [None]:
# Draw a seeded sample of at most 500 rows (the whole frame when it is smaller)
# and plot uploads against the growth target, coloured by category.
sample = df.sample(n=min(500, len(df)), random_state=42)
px.scatter(
    sample,
    x="uploads",
    y="growth_target",
    color="category",
    hover_data=["country", "age"],
    title="Uploads vs Growth Target",
)

## 6) Relationship View: Age vs Subscribers

In [None]:
# Draw a separate seeded sample of at most 500 rows (the whole frame when it is
# smaller) and plot age against subscribers, coloured by country.
sample = df.sample(n=min(500, len(df)), random_state=7)
px.scatter(
    sample,
    x="age",
    y="subscribers",
    color="country",
    title="Age vs Subscribers (Sampled)",
)

## 7) Export Preview Contract

In [None]:
# Columns shown in the preview, in display order.
preview_cols = [
    "youtuber",
    "uploads",
    "category",
    "country",
    "age",
    "subscribers",
    "highest_yearly_earnings",
    "growth_target",
]
# Display the first 15 rows restricted to the preview columns.
df.loc[:, preview_cols].head(15)