# NBA-22-3: Clustering des Profils de Joueurs

**Objectif:** Identifier les profils de joueurs via clustering non-supervisé

**Approche:** K-Means sur des features standardisées (moyenne 0, écart-type 1)

**Résultat attendu:** 4-6 clusters interprétables (shooter, défenseur, all-around...)

## 1. Setup

In [None]:
import sys
sys.path.insert(0, '../src')

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Start (or reuse) the Spark session that the following cells rely on,
# then print its version as a quick sanity check.
session_builder = SparkSession.builder.appName("NBA-Clustering")
spark = session_builder.getOrCreate()
print(f"Spark version: {spark.version}")

## 2. Chargement Données Joueurs

In [None]:
# Load the player dataset (expected to include the advanced metrics) from parquet.
players_df = spark.read.parquet("../data/silver/players_with_metrics")

# Report the dataset dimensions as (rows, columns).
row_count = players_df.count()
column_names = players_df.columns
print(f"Shape: ({row_count}, {len(column_names)})")

# List every column alphabetically so the feature names used later can be checked.
print("\nColonnes disponibles:")
for column in sorted(column_names):
    print(f"  - {column}")

# Preview the first rows.
players_df.show(5)

## 3. Sélection des Features

In [None]:
# Candidate features for the clustering, grouped by theme.
clustering_features = [
    # Physical attributes
    'height_cm',
    'weight_kg',

    # Per-minute production
    'points_per_minute',
    'assists_per_minute',
    'rebounds_per_minute',

    # Shooting
    'fg_pct',
    'three_point_pct',
    'free_throw_pct',

    # Defense
    'steals_per_minute',
    'blocks_per_minute',

    # Advanced metrics
    'per',  # Player Efficiency Rating
    'ts_pct',  # True Shooting %
    'usg_pct',  # Usage Rate
]

# Keep only the candidates that actually exist as columns in the loaded
# dataset, preserving the order of the list above.
existing_columns = set(players_df.columns)
available_features = [
    feature for feature in clustering_features if feature in existing_columns
]

print(f"\nFeatures utilisées ({len(available_features)}):")
for feature in available_features:
    print(f"  - {feature}")

## 4. Préparation des Données

In [None]:
# Assemble the selected feature columns into a single vector column.
# handleInvalid="skip" filters out every row that has a null/NaN in any of
# the input columns, so the clustered population can be smaller than
# players_df (reported below).
assembler = VectorAssembler(
    inputCols=available_features,
    outputCol="features",
    handleInvalid="skip"
)

# Standardize each feature (center on the mean, scale to unit std) so that
# features with large units do not dominate the K-Means distance.
scaler = StandardScaler(
    inputCol="features",
    outputCol="scaled_features",
    withStd=True,
    withMean=True
)

# Apply both stages.
assembled = assembler.transform(players_df)
# `scaled` is reused by every K-Means fit and evaluation in the following
# cells; caching avoids re-reading and re-scaling the data each time.
scaled = scaler.fit(assembled).transform(assembled).cache()

# Make the effect of handleInvalid="skip" visible instead of silent.
# The count on `scaled` also materializes the cache.
total_rows = players_df.count()
kept_rows = scaled.count()
print(f"Joueurs conservés: {kept_rows}/{total_rows} "
      f"({total_rows - kept_rows} ignorés pour valeurs manquantes)")

print("✅ Données normalisées")
scaled.select('full_name', 'scaled_features').show(5, truncate=False)

## 5. Elbow Method - Déterminer k optimal

In [None]:
# Scan candidate cluster counts and record two quality measures for each:
# the K-Means training cost (inertia) and the silhouette score.
k_range = range(2, 11)
inertias = []
silhouettes = []

# The evaluator does not depend on k, so it is built once outside the loop.
elbow_evaluator = ClusteringEvaluator(featuresCol="scaled_features")

# Loop variables use a "candidate_" prefix on purpose: re-running this cell
# must not overwrite the notebook-level `k`, `model`, `predictions` and
# `silhouette` set by the final clustering cell, which later cells analyse
# and save.
for candidate_k in k_range:
    print(f"Test k={candidate_k}...")

    candidate_model = KMeans(
        k=candidate_k, seed=42, featuresCol="scaled_features"
    ).fit(scaled)

    # Inertia: training cost reported by the fitted model's summary.
    inertias.append(candidate_model.summary.trainingCost)

    # Silhouette score computed on the cluster assignments.
    candidate_predictions = candidate_model.transform(scaled)
    silhouettes.append(elbow_evaluator.evaluate(candidate_predictions))

    print(f"  Inertie: {inertias[-1]:.2f}, Silhouette: {silhouettes[-1]:.3f}")

# Plot both curves side by side.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Elbow curve
axes[0].plot(k_range, inertias, 'bo-')
axes[0].set_xlabel("Nombre de clusters (k)")
axes[0].set_ylabel("Inertie (WSSSE)")
axes[0].set_title("Elbow Method")
axes[0].grid(True)

# Silhouette score
axes[1].plot(k_range, silhouettes, 'ro-')
axes[1].set_xlabel("Nombre de clusters (k)")
axes[1].set_ylabel("Silhouette Score")
axes[1].set_title("Silhouette Score vs k")
axes[1].grid(True)

plt.tight_layout()
plt.show()

# Suggested k: the candidate with the highest silhouette score.
# int() converts the NumPy index into a plain Python int before indexing.
best_k = k_range[int(np.argmax(silhouettes))]
print(f"\nMeilleur k selon Silhouette: {best_k} (score: {max(silhouettes):.3f})")

## 6. Clustering Final

In [None]:
# Number of clusters for the final model; adjust it after reading the elbow
# and silhouette curves (best_k holds the silhouette-based suggestion).
k = 5  # or best_k

# Fit the final K-Means model on the standardized features.
kmeans = KMeans(featuresCol="scaled_features", seed=42, k=k)
model = kmeans.fit(scaled)

# Assign every player to a cluster.
predictions = model.transform(scaled)

# Quality measures for the final model.
evaluator = ClusteringEvaluator(featuresCol="scaled_features")
silhouette = evaluator.evaluate(predictions)
training_cost = model.summary.trainingCost

print("✅ Clustering terminé")
print(f"K={k}, Silhouette Score: {silhouette:.3f}")
print(f"Inertie: {training_cost:.2f}")

## 7. Analyse des Clusters

In [None]:
# Bring the cluster assignments back to pandas for analysis: identity
# columns first, then the remaining clustering features, then the label.
identity_columns = ['full_name', 'position', 'height_cm', 'weight_kg']
other_features = [
    name for name in available_features
    if name not in ('height_cm', 'weight_kg')
]
results_pd = predictions.select(
    *identity_columns, *other_features, 'prediction'
).toPandas()

# Expose the label under a clearer name (the 'prediction' column is kept).
results_pd['cluster'] = results_pd['prediction']

# Cluster sizes, ordered by cluster id.
print("Distribution des clusters:")
cluster_counts = results_pd['cluster'].value_counts()
print(cluster_counts.sort_index())

# These optional columns are present for every cluster or for none,
# so the check is done once on the full frame.
has_points = 'points_per_minute' in results_pd.columns
has_per = 'per' in results_pd.columns

# Per-cluster summary; groupby yields the clusters in ascending id order.
print("\n=== STATISTIQUES PAR CLUSTER ===")
for cluster_id, cluster_data in results_pd.groupby('cluster', sort=True):
    print(f"\n--- Cluster {cluster_id} (n={len(cluster_data)}) ---")
    position_counts = cluster_data['position'].value_counts().to_dict()
    print(f"Positions: {position_counts}")
    print(f"Taille moy: {cluster_data['height_cm'].mean():.1f} cm")
    print(f"Poids moy: {cluster_data['weight_kg'].mean():.1f} kg")
    if has_points:
        print(f"Points/min: {cluster_data['points_per_minute'].mean():.3f}")
    if has_per:
        print(f"PER moy: {cluster_data['per'].mean():.1f}")

## 8. Exemples par Cluster

In [None]:
# Show the top 5 players of each cluster, ranked by points per minute when
# that column is available and by height otherwise.
print("\n=== EXEMPLES DE JOUEURS PAR CLUSTER ===")

if 'points_per_minute' in results_pd.columns:
    ranking_column = 'points_per_minute'
else:
    ranking_column = 'height_cm'
# The PER suffix is printed only when the column exists in the results.
show_per = 'per' in results_pd.columns

for cluster_id in sorted(results_pd['cluster'].unique()):
    members = results_pd[results_pd['cluster'] == cluster_id]
    print(f"\n--- Cluster {cluster_id} ---")
    for _, player in members.nlargest(5, ranking_column).iterrows():
        line = f"  • {player['full_name']} ({player['position']}) - {player['height_cm']:.0f}cm"
        if show_per:
            line += f", PER: {player['per']:.1f}"
        print(line)

## 9. Visualisation (PCA)

In [None]:
from sklearn.decomposition import PCA

# 2D PCA projection of the clustered players, for visualisation only.
features_matrix = results_pd[available_features].astype(float).fillna(0)

# K-Means was fitted on standardized features, so the projection must use
# the same scaling; on raw values PCA would be dominated by the columns with
# the largest units. ddof=1 (sample std) is meant to mirror Spark's
# StandardScaler; constant columns get a divisor of 1 so they map to 0
# instead of NaN.
feature_std = features_matrix.std(ddof=1).replace(0, 1.0).fillna(1.0)
standardized_matrix = (features_matrix - features_matrix.mean()) / feature_std

pca = PCA(n_components=2)
pca_result = pca.fit_transform(standardized_matrix)

# Scatter plot of the two principal components, coloured by cluster id.
plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    pca_result[:, 0],
    pca_result[:, 1],
    c=results_pd['cluster'],
    cmap='viridis',
    alpha=0.6,
    s=50
)
plt.colorbar(scatter, label='Cluster')
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)")
plt.title(f"Clusters de Joueurs (PCA) - K={k}")
plt.grid(True, alpha=0.3)
plt.show()

# Share of the total variance retained by the two plotted components.
print(f"Variance expliquée: {pca.explained_variance_ratio_.sum():.1%}")

## 10. Interprétation et Nommage

In [None]:
# Human-readable label per cluster id; fill these in after analysing the
# clusters.
cluster_names = {
    0: "À définir",
    1: "À définir",
    2: "À définir",
    3: "À définir",
    4: "À définir",
}

# Align the mapping with the number of clusters actually used: ids below k
# that have no label yet get the placeholder, and ids >= k are dropped.
cluster_names = {
    cluster_id: cluster_names.get(cluster_id, "À définir")
    for cluster_id in range(k)
}

print("Proposition de noms (à ajuster selon l'analyse):")
for cluster_id, name in cluster_names.items():
    print(f"  Cluster {cluster_id}: {name}")

## 11. Sauvegarde

In [None]:
import json
from pathlib import Path

# Make sure the local output directory exists before the pandas/json writes.
Path("../models").mkdir(parents=True, exist_ok=True)

# Save the model. A plain save() raises when the target path already exists,
# which breaks every re-run of the notebook; overwrite() replaces it instead.
model.write().overwrite().save("../models/clustering_model")

# Save the per-player results.
results_pd.to_csv("../models/clustering_results.csv", index=False)

# Save the metrics. Values are cast to built-in types so json.dump never
# receives NumPy scalars.
metrics = {
    'k': int(k),
    'silhouette_score': float(silhouette),
    'inertia': float(model.summary.trainingCost),
    'cluster_sizes': {
        int(cluster_id): int(size)
        for cluster_id, size in results_pd['cluster'].value_counts().items()
    },
}

with open("../models/clustering_metrics.json", 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)

print("✅ Modèle et résultats sauvegardés")

## Résumé

**Clusters identifiés:**
- K = X
- Silhouette Score: X.XXX

**Profils:**
1. **Cluster 0**: [Description]
2. **Cluster 1**: [Description]
3. **Cluster 2**: [Description]
4. **Cluster 3**: [Description]
5. **Cluster 4**: [Description]

**Utilisation:**
- Identifier des "sleepers"
- Comparer des profils similaires
- Analyse de roster