# Clustering Insights Notebook

This notebook trains KMeans/DBSCAN clustering pipelines, inspects archetypes, and visualizes channel behavior segments.

## 1) Setup

In [None]:
from pathlib import Path
import sys

import pandas as pd
import plotly.express as px

# Start from the resolved current working directory.
ROOT = Path.cwd().resolve()
# When there is no src/ directory here but the parent has one, use the parent
# instead (presumably the notebook was launched from a subfolder of the repo).
if not ROOT.joinpath("src").exists():
    if ROOT.parent.joinpath("src").exists():
        ROOT = ROOT.parent
# Put <ROOT>/src at the front of the import search path so it is searched
# first by the package imports that follow.
sys.path.insert(0, str(ROOT.joinpath("src")))

from youtube_success_ml.config import TrainingConfig
from youtube_success_ml.data.loader import load_dataset
from youtube_success_ml.models.clustering import train_clustering_bundle


## 2) Train Clustering Bundle

In [None]:
# Load the dataset and build a training configuration with its defaults.
df = load_dataset()
cfg = TrainingConfig()

# The trainer returns a pair: the fitted bundle and an enriched frame.
bundle, enriched = train_clustering_bundle(df, config=cfg)

# Cell output: number of cluster profiles and the enriched frame's shape.
(len(bundle.cluster_profiles), enriched.shape)

## 3) KMeans Archetype Profiles

In [None]:
# Tabulate the bundle's cluster profiles for display and for later plotting.
profiles = pd.DataFrame(data=bundle.cluster_profiles)
profiles

## 4) Cluster Size and Growth

In [None]:
# One bar per archetype: height is the cluster size, color is its avg_growth.
px.bar(
    data_frame=profiles,
    x="archetype",
    y="size",
    color="avg_growth",
    title="Cluster Size by Archetype",
)

## 5) Uploads vs Growth by KMeans Cluster

In [None]:
# Plot at most 700 rows; the fixed seed makes the sample reproducible.
sample = enriched.sample(n=min(700, len(enriched)), random_state=42)
px.scatter(
    data_frame=sample,
    x="uploads",
    y="growth_target",
    color="kmeans_cluster",
    hover_data=["category", "country"],
    title="Uploads vs Growth by KMeans Cluster",
)

## 6) DBSCAN Membership Snapshot

In [None]:
# Per DBSCAN label: count of non-null "youtuber" values and mean growth_target,
# ordered from the largest group to the smallest.
dbscan_counts = (
    enriched.groupby(by="dbscan_cluster", as_index=False)
    .agg(
        size=pd.NamedAgg(column="youtuber", aggfunc="count"),
        avg_growth=pd.NamedAgg(column="growth_target", aggfunc="mean"),
    )
    .sort_values(by="size", ascending=False)
)
dbscan_counts

## 7) Archetype Dominant Category Matrix

In [None]:
# Row count for every (kmeans_cluster, category) pair, in long form.
# Within each cluster the most frequent categories come first.
matrix = (
    enriched.groupby(by=["kmeans_cluster", "category"], as_index=False)
    .size()
    .sort_values(by=["kmeans_cluster", "size"], ascending=[True, False])
)
# Show only the first 25 rows of the sorted table.
matrix.head(n=25)