# 🎯 Clustering - Segmentation Clients
## Identification de Groupes Homogènes de Clients

**Objectif :** Segmenter la base clients en groupes homogènes pour adapter les stratégies marketing

**Méthodes :**
1. **K-Means** (partition basée sur les centroïdes)
2. **Hierarchical Clustering** (dendrogramme)
3. **DBSCAN** (détection d'outliers)

---

## 📦 Phase 1 : Imports et Chargement

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Clustering
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import cdist

# Notebook-wide display settings: plot theme, colour palette, pandas output.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
for option_name, option_value in (
    ('display.max_columns', None),               # show every column
    ('display.float_format', '{:.2f}'.format),   # floats with two decimals
):
    pd.set_option(option_name, option_value)

# Confirm the setup ran and stamp the output with the current local time.
print("‚úÖ Biblioth√®ques import√©es")
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"üìÖ {started_at}")

In [None]:
# Load the dataset from the project's data folder.
df = pd.read_csv('../01_Data/ML_DataSet.csv')

# Banner followed by the frame's dimensions.
banner = "=" * 70
print(banner)
print("CHARGEMENT DES DONN√âES")
print(banner)
print(f"Shape : {df.shape}")
print("\nAper√ßu :")
# Last expression of the cell: the notebook renders the first three rows.
df.head(3)

## 📊 Phase 2 : Sélection des Features pour Clustering

In [None]:
# Choose the columns used for clustering.
# The goal is to segment customers by their BEHAVIOUR.
features_clustering = [
    # Demographics
    'Revenu', 'Age_Inscription', 'Total_Enfants',
    # Purchasing behaviour
    'Total_Depense', 'Total_Achats', 'Depense_Moy_Par_Achat',
    # Product preferences
    'Achat_Vins', 'Achat_Viandes', 'Achat_Poissons', 'Achat_Produits_Or',
    # Purchase channels
    'Achats_En_Ligne', 'Achats_Catalogue', 'Achats_En_Magasin',
    # Engagement
    'Visites_Web_Mois', 'Engagement_Web', 'Sensibilite_Promo',
    # Campaign history
    'Total_Campagnes_Acceptees', 'Taux_Reponse_Historique',
]

rule = "=" * 70
print(rule)
print("S√âLECTION DES FEATURES POUR CLUSTERING")
print(rule)
print(f"\nNombre de features : {len(features_clustering)}")
print("\nFeatures s√©lectionn√©es :")
for position, column in enumerate(features_clustering, start=1):
    print(f"  {position:2d}. {column}")

# Work on an independent copy of the selected columns.
X_cluster = df[features_clustering].copy()

# Replace missing values with the median of each column.
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(X_cluster)
X_cluster = pd.DataFrame(
    imputed_values,
    index=X_cluster.index,
    columns=features_clustering,
)

# Report the final shape and the number of remaining missing cells.
missing_total = X_cluster.isnull().sum().sum()
print(f"\n‚úÖ Dataset pr√©par√© : {X_cluster.shape}")
print(f"‚úÖ Valeurs manquantes : {missing_total}")

In [None]:
# Descriptive statistics of the clustering features, one row per feature.
print("\nüìä STATISTIQUES DES FEATURES")
print("=" * 70)
feature_stats = X_cluster.describe()
feature_stats.transpose()

## 🔧 Phase 3 : Normalisation (OBLIGATOIRE pour Clustering)

In [None]:
# Standardise the features with StandardScaler.
# Essential: clustering algorithms are sensitive to feature scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_cluster)

# Wrap the scaled array in a DataFrame to keep column names and row index.
X_scaled_df = pd.DataFrame(X_scaled, index=X_cluster.index, columns=features_clustering)

rule = "=" * 70
print(rule)
print("NORMALISATION DES DONN√âES")
print(rule)

# Show the range of two columns before and after scaling.
shown_columns = ('Revenu', 'Total_Depense')

print("\nAvant normalisation :")
for column in shown_columns:
    raw = X_cluster[column]
    print(f"  {column} : min={raw.min():.0f}, max={raw.max():.0f}")

print("\nApr√®s normalisation :")
for column in shown_columns:
    scaled = X_scaled_df[column]
    print(f"  {column} : min={scaled.min():.2f}, max={scaled.max():.2f}")

print("\n‚úÖ Donn√©es normalis√©es (moyenne=0, √©cart-type=1)")

---
## 🎯 Phase 4 : K-Means Clustering

### Étape 1 : Méthode du Coude (Elbow Method) pour trouver K optimal

In [None]:
# Candidate cluster counts, plus one history list per quality metric.
K_range = range(2, 11)
inertias = []
silhouette_scores = []
davies_bouldin_scores = []
calinski_harabasz_scores = []

print("=" * 70)
print("RECHERCHE DU NOMBRE OPTIMAL DE CLUSTERS (K)")
print("=" * 70)
print("\n‚è≥ Test de K=2 √† K=10...\n")

for k in K_range:
    # Fit K-Means for this K (fixed seed, 10 initialisations).
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)

    # Score the partition with the four criteria, in the same order as
    # the history lists below.
    scores = (
        kmeans.inertia_,
        silhouette_score(X_scaled, labels),
        davies_bouldin_score(X_scaled, labels),
        calinski_harabasz_score(X_scaled, labels),
    )
    histories = (inertias, silhouette_scores, davies_bouldin_scores, calinski_harabasz_scores)
    for history, value in zip(histories, scores):
        history.append(value)

    wcss, sil, dbi, chi = scores
    print(f"K={k:2d} | Inertia={wcss:8.0f} | Silhouette={sil:.3f} | DB={dbi:.3f} | CH={chi:.0f}")

print("\n‚úÖ Tests termin√©s")

In [None]:
# 2x2 grid with one panel per K-selection criterion.
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# K preferred by each score. Inertia has no single optimum: its panel is
# read visually (elbow), so it gets no marker.
best_k_silhouette = K_range[np.argmax(silhouette_scores)]   # higher is better
best_k_db = K_range[np.argmin(davies_bouldin_scores)]       # lower is better
best_k_ch = K_range[np.argmax(calinski_harabasz_scores)]    # higher is better

# (axis, values, line style, y label, title, K to highlight or None)
panels = [
    (axes[0, 0], inertias, 'bo-',
     'Inertia (Within-cluster sum of squares)',
     'Elbow Method - Recherche du K optimal', None),
    (axes[0, 1], silhouette_scores, 'go-',
     'Silhouette Score',
     'Silhouette Score (‚Üë meilleur)', best_k_silhouette),
    (axes[1, 0], davies_bouldin_scores, 'ro-',
     'Davies-Bouldin Score',
     'Davies-Bouldin Score (‚Üì meilleur)', best_k_db),
    (axes[1, 1], calinski_harabasz_scores, 'mo-',
     'Calinski-Harabasz Score',
     'Calinski-Harabasz Score (‚Üë meilleur)', best_k_ch),
]

for ax, values, line_style, y_label, title, best_k in panels:
    ax.plot(K_range, values, line_style, linewidth=2, markersize=8)
    ax.set_xlabel('Nombre de clusters (K)', fontsize=12)
    ax.set_ylabel(y_label, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.set_xticks(K_range)
    if best_k is not None:
        # Dashed red line at the K this metric prefers.
        ax.axvline(best_k, color='red', linestyle='--', label=f'Meilleur K={best_k}')
        ax.legend()

plt.tight_layout()
plt.show()

# Text summary of the K suggested by each score.
print("\n" + "=" * 70)
print("RECOMMANDATIONS POUR K OPTIMAL")
print("=" * 70)
print(f"  Silhouette Score      ‚Üí K={best_k_silhouette}")
print(f"  Davies-Bouldin Score  ‚Üí K={best_k_db}")
print(f"  Calinski-Harabasz     ‚Üí K={best_k_ch}")
# NOTE(review): this recommendation is a fixed message, not derived from the scores.
print("\nüí° K recommand√© : 3 ou 4 (bas√© sur le 'coude' et les m√©triques)")

### Étape 2 : Clustering Final avec K optimal

In [None]:
# Number of clusters for the final model (tune it from the plots above).
K_OPTIMAL = 4  # adjust to your own results

title_rule = "=" * 70
print(title_rule)
print(f"K-MEANS CLUSTERING FINAL (K={K_OPTIMAL})")
print(title_rule)

# Final K-Means fit: fixed seed, 20 initialisations.
kmeans_final = KMeans(n_clusters=K_OPTIMAL, random_state=42, n_init=20)
clusters_kmeans = kmeans_final.fit_predict(X_scaled)

# Store each row's cluster label on the main frame.
df['Cluster_KMeans'] = clusters_kmeans

# Quality metrics of the final partition.
silhouette_final = silhouette_score(X_scaled, clusters_kmeans)
davies_bouldin_final = davies_bouldin_score(X_scaled, clusters_kmeans)
calinski_harabasz_final = calinski_harabasz_score(X_scaled, clusters_kmeans)

print("\nüìä M√©triques du clustering final :")
print(f"   Silhouette Score      : {silhouette_final:.3f}")
print(f"   Davies-Bouldin Score  : {davies_bouldin_final:.3f}")
print(f"   Calinski-Harabasz     : {calinski_harabasz_final:.0f}")

# Cluster sizes, ordered by cluster label.
cluster_sizes = df['Cluster_KMeans'].value_counts().sort_index()
print("\nüìã Distribution des clusters :")
print(cluster_sizes)

print("\n‚úÖ Clustering K-Means termin√©")

---
## 📊 Phase 5 : Analyse des Clusters

### Profil de chaque segment

In [None]:
# Print a detailed profile (column means) for every K-Means cluster.
print("=" * 70)
print("PROFIL DES CLUSTERS")
print("=" * 70)

# Report layout: (section heading, rows), where each row is
# (label prefix, column to average, number format, unit suffix).
profile_sections = [
    ("\nüí∞ PROFIL FINANCIER :", [
        ("   Revenu moyen             : ", 'Revenu', '>10,.0f', " ‚Ç¨"),
        ("   D√©pense totale moyenne   : ", 'Total_Depense', '>10,.0f', " ‚Ç¨"),
        ("   D√©pense moy. par achat   : ", 'Depense_Moy_Par_Achat', '>10,.2f', " ‚Ç¨"),
        ("   Nombre d'achats moyen    : ", 'Total_Achats', '>10,.1f', ""),
    ]),
    ("\nüë§ PROFIL D√âMOGRAPHIQUE :", [
        ("   √Çge moyen                : ", 'Age_Inscription', '>10,.1f', " ans"),
        ("   Enfants moyens           : ", 'Total_Enfants', '>10,.2f', ""),
    ]),
    ("\nüõí PR√âF√âRENCES PRODUITS :", [
        ("   Achat Vins               : ", 'Achat_Vins', '>10,.0f', " ‚Ç¨"),
        ("   Achat Viandes            : ", 'Achat_Viandes', '>10,.0f', " ‚Ç¨"),
        ("   Achat Poissons           : ", 'Achat_Poissons', '>10,.0f', " ‚Ç¨"),
        ("   Achat Produits Or        : ", 'Achat_Produits_Or', '>10,.0f', " ‚Ç¨"),
    ]),
    ("\nüåê COMPORTEMENT DIGITAL :", [
        ("   Achats en ligne          : ", 'Achats_En_Ligne', '>10,.1f', ""),
        ("   Achats catalogue         : ", 'Achats_Catalogue', '>10,.1f', ""),
        ("   Achats en magasin        : ", 'Achats_En_Magasin', '>10,.1f', ""),
        ("   Visites web/mois         : ", 'Visites_Web_Mois', '>10,.1f', ""),
        ("   Engagement web           : ", 'Engagement_Web', '>10,.2%', ""),
    ]),
    ("\nüì¢ R√âACTIVIT√â MARKETING :", [
        ("   Taux de r√©ponse          : ", 'Reponse_Derniere_Campagne', '>10,.2%', ""),
        ("   Campagnes accept√©es      : ", 'Total_Campagnes_Acceptees', '>10,.2f', ""),
        ("   Sensibilit√© promo        : ", 'Sensibilite_Promo', '>10,.2f', ""),
    ]),
]

for cluster_id in range(K_OPTIMAL):
    # Rows assigned to this cluster and their share of the whole frame.
    cluster_data = df[df['Cluster_KMeans'] == cluster_id]
    n_clients = len(cluster_data)
    pct_clients = (n_clients / len(df)) * 100

    print(f"\n{'='*70}")
    print(f"üéØ CLUSTER {cluster_id}")
    print(f"{'='*70}")
    print(f"\nüìà Taille : {n_clients} clients ({pct_clients:.1f}% de la base)")

    for heading, rows in profile_sections:
        print(heading)
        for prefix, column, number_format, suffix in rows:
            average = cluster_data[column].mean()
            print(f"{prefix}{average:{number_format}}{suffix}")

print("\n" + "=" * 70)
print("‚úÖ Analyse des clusters termin√©e")

In [None]:
# Side-by-side comparison: mean of selected columns for each cluster.
features_compare = [
    'Revenu',
    'Total_Depense',
    'Age_Inscription',
    'Total_Achats',
    'Achat_Vins',
    'Achat_Viandes',
    'Engagement_Web',
    'Reponse_Derniere_Campagne',
]

grouped_by_cluster = df.groupby('Cluster_KMeans')
cluster_profiles = grouped_by_cluster[features_compare].mean()

print("\nüìä TABLEAU COMPARATIF DES CLUSTERS")
print("=" * 70)
# Transposed for display: features as rows, clusters as columns.
cluster_profiles.transpose()

### Visualisations des Clusters

In [None]:
# Project the scaled features onto two principal components for 2D plots.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# Share of variance captured by each of the two components.
var_explained = pca.explained_variance_ratio_
pc1_share, pc2_share = var_explained

header_rule = "=" * 70
print(header_rule)
print("PCA - R√âDUCTION DE DIMENSION")
print(header_rule)
print(f"\nVariance expliqu√©e par PC1 : {pc1_share:.2%}")
print(f"Variance expliqu√©e par PC2 : {pc2_share:.2%}")
print(f"Variance totale expliqu√©e  : {var_explained.sum():.2%}")
print("\n‚úÖ PCA calcul√©e")

In [None]:
# Two panels: clusters in PCA space (left), response rate per cluster (right).
fig, axes = plt.subplots(1, 2, figsize=(18, 7))
ax_clusters, ax_response = axes

# Left panel: every row in PCA space, coloured by its K-Means label.
scatter = ax_clusters.scatter(
    X_pca[:, 0], X_pca[:, 1],
    c=clusters_kmeans, cmap='viridis',
    s=50, alpha=0.6,
    edgecolors='black', linewidth=0.5,
)

# Project the K-Means centres with the same fitted PCA and overlay them.
centroids_pca = pca.transform(kmeans_final.cluster_centers_)
ax_clusters.scatter(
    centroids_pca[:, 0], centroids_pca[:, 1],
    c='red', s=300, marker='X',
    edgecolors='black', linewidth=2,
    label='Centro√Ødes',
)

ax_clusters.set_xlabel(f'PC1 ({var_explained[0]:.1%} variance)', fontsize=12)
ax_clusters.set_ylabel(f'PC2 ({var_explained[1]:.1%} variance)', fontsize=12)
ax_clusters.set_title(f'Visualisation des {K_OPTIMAL} Clusters (K-Means + PCA)', fontsize=14, fontweight='bold')
ax_clusters.legend()
ax_clusters.grid(True, alpha=0.3)
plt.colorbar(scatter, ax=ax_clusters, label='Cluster')

# Right panel: mean of the campaign-response column within each cluster.
response_by_cluster = df.groupby('Cluster_KMeans')['Reponse_Derniere_Campagne'].mean()
response_rates = response_by_cluster.values
colors_bar = plt.cm.viridis(np.linspace(0, 1, K_OPTIMAL))
cluster_positions = range(K_OPTIMAL)
ax_response.bar(cluster_positions, response_rates, color=colors_bar, edgecolor='black', linewidth=1.5)
ax_response.set_xlabel('Cluster', fontsize=12)
ax_response.set_ylabel('Taux de R√©ponse', fontsize=12)
ax_response.set_title('Taux de R√©ponse √† la Campagne par Cluster', fontsize=14, fontweight='bold')
ax_response.set_xticks(cluster_positions)
ax_response.set_xticklabels([f'Cluster {label}' for label in cluster_positions])
ax_response.grid(True, alpha=0.3, axis='y')

# Write each rate just above its bar.
for position, rate in enumerate(response_rates):
    ax_response.text(position, rate + 0.01, f'{rate:.1%}', ha='center', fontweight='bold', fontsize=11)

plt.tight_layout()
plt.show()

print("\n‚úÖ Visualisations cr√©√©es")

In [None]:
# Heatmap of the cluster profiles.
features_heatmap = [
    'Revenu', 'Total_Depense', 'Age_Inscription', 'Total_Achats',
    'Achat_Vins', 'Achat_Viandes', 'Achats_En_Ligne', 'Achats_En_Magasin',
    'Visites_Web_Mois', 'Engagement_Web', 'Total_Campagnes_Acceptees',
    'Reponse_Derniere_Campagne'
]

# Mean of each feature per cluster: rows = clusters, columns = features.
cluster_means = df.groupby('Cluster_KMeans')[features_heatmap].mean()

# Rescale every FEATURE to [0, 1] across the clusters.
# MinMaxScaler works column by column, so it must receive the frame with
# features as columns (no transpose). Fitting on the transposed frame would
# instead rescale each cluster across features measured in different units,
# so the largest-magnitude feature would always be 1.0 and clusters could
# not be compared.
from sklearn.preprocessing import MinMaxScaler
scaler_viz = MinMaxScaler()
cluster_means_norm = pd.DataFrame(
    scaler_viz.fit_transform(cluster_means),
    columns=cluster_means.columns,
    index=cluster_means.index
)

# Draw the heatmap with features as rows and clusters as columns.
plt.figure(figsize=(14, 6))
sns.heatmap(
    cluster_means_norm.T,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    cbar_kws={'label': 'Valeur Normalis√©e (0-1)'},
    linewidths=0.5,
    linecolor='gray'
)
plt.title('Heatmap des Profils de Clusters (Valeurs Normalis√©es)', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Cluster', fontsize=12)
plt.ylabel('Features', fontsize=12)
plt.tight_layout()
plt.show()

# Reading guide printed under the figure.
print("\nüí° Interpr√©tation de la heatmap :")
print("   - Rouge fonc√© = valeur √©lev√©e")
print("   - Jaune clair = valeur faible")
print("   - Permet de comparer les clusters visuellement")

---
## 🌳 Phase 6 : Hierarchical Clustering (BONUS)

In [None]:
# Hierarchical clustering: dendrogram on a sample of the scaled data.
print("=" * 70)
print("HIERARCHICAL CLUSTERING")
print("=" * 70)
print("\n‚è≥ Calcul du dendrogramme (peut prendre 30s-1min)...\n")

# Use at most 500 rows for the dendrogram (too heavy otherwise).
# The sample is drawn from a seeded generator so the figure is identical on
# every run, in line with the random_state=42 used elsewhere in the notebook.
sample_size = min(500, len(X_scaled))
sample_idx = np.random.default_rng(42).choice(len(X_scaled), size=sample_size, replace=False)
X_sample = X_scaled[sample_idx]

# Ward linkage on the sample.
linkage_matrix = linkage(X_sample, method='ward')

# Dendrogram truncated to the last 30 merged clusters.
plt.figure(figsize=(16, 8))
dendrogram(
    linkage_matrix,
    truncate_mode='lastp',
    p=30,
    leaf_font_size=10,
    show_contracted=True
)
plt.title(f'Dendrogramme - Hierarchical Clustering (√©chantillon de {sample_size} clients)', fontsize=14, fontweight='bold')
plt.xlabel('Index des clients (ou clusters)', fontsize=12)
plt.ylabel('Distance', fontsize=12)
# NOTE(review): the cut height is a fixed value (50), not computed from the
# linkage matrix; check it against the plotted distances.
plt.axhline(y=50, color='red', linestyle='--', label='Coupe sugg√©r√©e')
plt.legend()
plt.tight_layout()
plt.show()

print("\n‚úÖ Dendrogramme cr√©√©")
print("üí° Observation : Le dendrogramme confirme la structure en 3-4 clusters")

In [None]:
# Ward agglomerative clustering on the full scaled data, same K as K-Means.
hierarchical = AgglomerativeClustering(n_clusters=K_OPTIMAL, linkage='ward')
clusters_hierarchical = hierarchical.fit_predict(X_scaled)

# Store the labels next to the K-Means ones.
df['Cluster_Hierarchical'] = clusters_hierarchical

section_rule = "=" * 70
print(section_rule)
print("HIERARCHICAL CLUSTERING - R√âSULTATS")
print(section_rule)

hierarchical_sizes = df['Cluster_Hierarchical'].value_counts().sort_index()
print("\nDistribution des clusters :")
print(hierarchical_sizes)

# Agreement between the two partitions (adjusted Rand index).
from sklearn.metrics import adjusted_rand_score
ari = adjusted_rand_score(clusters_kmeans, clusters_hierarchical)
print(f"\nüìä Similarit√© K-Means vs Hierarchical (ARI) : {ari:.3f}")
print("   (1.0 = identique, 0.0 = al√©atoire)")

print("\n‚úÖ Hierarchical Clustering termin√©")

---
## 🎯 Phase 7 : DBSCAN (Détection d'Outliers)

In [None]:
# DBSCAN, used here to flag outliers (points labelled -1).
print("=" * 70)
print("DBSCAN - D√âTECTION D'OUTLIERS")
print("=" * 70)

# Neighbourhood radii to try, with a fixed min_samples.
eps_values = [1.5, 2.0, 2.5]
min_samples = 10

print("\n‚è≥ Test de diff√©rentes valeurs d'epsilon...\n")

for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    clusters_dbscan = dbscan.fit_predict(X_scaled)

    # Label -1 marks noise; every other distinct label is a cluster.
    noise_mask = clusters_dbscan == -1
    n_outliers = int(noise_mask.sum())
    n_clusters = len(np.unique(clusters_dbscan[~noise_mask]))
    outlier_pct = n_outliers/len(clusters_dbscan)*100

    print(f"eps={eps} | Clusters={n_clusters} | Outliers={n_outliers} ({outlier_pct:.1f}%)")

# Final run with the retained radius.
eps_optimal = 2.0
dbscan_final = DBSCAN(eps=eps_optimal, min_samples=min_samples)
clusters_dbscan = dbscan_final.fit_predict(X_scaled)

# Store the labels (including -1 for outliers) on the main frame.
df['Cluster_DBSCAN'] = clusters_dbscan

n_outliers = int((clusters_dbscan == -1).sum())
print(f"\nüìä DBSCAN avec eps={eps_optimal} :")
print(f"   Outliers d√©tect√©s : {n_outliers} ({n_outliers/len(df)*100:.1f}%)")
print("\n‚úÖ DBSCAN termin√©")

---
## 📝 Phase 8 : Naming des Segments et Export

In [None]:
# Give the clusters meaningful names (adapt them to your own results),
# based on the profile analysis done earlier in the notebook.
cluster_names = {
    0: "Segment A - √Ä d√©finir",
    1: "Segment B - √Ä d√©finir",
    2: "Segment C - √Ä d√©finir",
    3: "Segment D - √Ä d√©finir"
}

# Examples of possible names (to adapt):
# - "VIP - Big Spenders"
# - "Digital Natives - Young & Connected"
# - "Occasional - Low Engagement"
# - "Loyal Seniors - In-Store"

# K_OPTIMAL is meant to be tuned, so it may exceed the number of names
# written above. Give every cluster id a fallback name so that the mapping
# below never yields NaN and later lookups by cluster id always succeed.
for cluster_id in range(K_OPTIMAL):
    cluster_names.setdefault(cluster_id, f"Cluster {cluster_id}")

# Attach the segment name to each row through its K-Means label.
df['Segment_Name'] = df['Cluster_KMeans'].map(cluster_names)

print("=" * 70)
print("NAMING DES SEGMENTS")
print("=" * 70)
# List only the clusters that exist for the current K_OPTIMAL.
for cluster_id in range(K_OPTIMAL):
    name = cluster_names[cluster_id]
    n_clients = int((df['Cluster_KMeans'] == cluster_id).sum())
    print(f"\nCluster {cluster_id} ‚Üí '{name}' ({n_clients} clients)")

print("\nüí° Ajustez les noms dans la cellule ci-dessus selon vos analyses !")

In [None]:
# Export the cluster profiles and the labelled dataset.
features_export = [
    'Revenu',
    'Age_Inscription',
    'Total_Enfants',
    'Total_Depense',
    'Total_Achats',
    'Depense_Moy_Par_Achat',
    'Achat_Vins',
    'Achat_Viandes',
    'Achat_Poissons',
    'Achats_En_Ligne',
    'Achats_En_Magasin',
    'Visites_Web_Mois',
    'Engagement_Web',
    'Total_Campagnes_Acceptees',
    'Reponse_Derniere_Campagne',
]

# Mean, median and standard deviation per (cluster, segment name), rounded
# to two decimals.
summary_stats = ['mean', 'median', 'std']
segments = df.groupby(['Cluster_KMeans', 'Segment_Name'])
cluster_summary = segments[features_export].agg(summary_stats).round(2)

# Write the profile table to the working directory.
cluster_summary.to_csv('cluster_profiles.csv')
print("\nüíæ Profils des clusters sauvegard√©s : cluster_profiles.csv")

# Write the full frame, cluster columns included, without the row index.
df.to_csv('../01_Data/ML_DataSet_with_Clusters.csv', index=False)
print("üíæ Dataset avec clusters sauvegard√© : ML_DataSet_with_Clusters.csv")

print("\n‚úÖ Export termin√©")

---
## 📊 Phase 9 : Conclusions et Recommandations

In [None]:
# Final summary: methods used, headline metric and per-segment insights.
print("=" * 70)
print("CONCLUSIONS DU CLUSTERING")
print("=" * 70)

print(f"\n‚úÖ {K_OPTIMAL} segments de clients identifi√©s")
print("\nüìä M√©thodes utilis√©es :")
print("   - K-Means (partition)")
print("   - Hierarchical (dendrogramme)")
print("   - DBSCAN (outliers)")

print("\nüéØ K-Means retenu comme m√©thode principale")
print(f"   Silhouette Score : {silhouette_final:.3f}")

print("\nüí° INSIGHTS CL√âS :")
for cluster_id in range(K_OPTIMAL):
    cluster_data = df[df['Cluster_KMeans'] == cluster_id]
    # K_OPTIMAL can be set higher than the number of hand-written names;
    # fall back to a generic label instead of raising KeyError.
    name = cluster_names.get(cluster_id, f"Cluster {cluster_id}")
    taux_reponse = cluster_data['Reponse_Derniere_Campagne'].mean()
    depense_moy = cluster_data['Total_Depense'].mean()
    print(f"\n   {name}")
    print(f"      - Taux de r√©ponse : {taux_reponse:.1%}")
    print(f"      - D√©pense moyenne : {depense_moy:.0f}‚Ç¨")

print("\nüéØ PROCHAINES √âTAPES :")
print("   1. Affiner le naming des segments")
print("   2. Cr√©er des strat√©gies marketing par segment")
print("   3. Entra√Æner un mod√®le XGBoost par segment (optionnel)")
print("   4. Monitorer l'√©volution des segments dans le temps")

# Closing banner with the current local time.
print("\n‚úÖ Notebook de clustering termin√© avec succ√®s !")
print(f"üìÖ {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("=" * 70)