In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the per-user feature table
df = pd.read_parquet("../.data/processed/user_features.parquet")

# Step 1 – total number of events per user (views + cart additions + purchases)
df["total_events"] = (
    df["count_view"]
    .add(df["count_cart"])
    .add(df["count_purchase"])
)

# Step 2 – look at the distribution; the y axis is logarithmic so that the
# sparsely populated high-activity bins remain visible
ax = sns.histplot(df["total_events"], bins=100)
ax.set_yscale("log")
ax.set_title("Distribution du total d'événements par utilisateur")
ax.set_xlabel("Total événements (vues + paniers + achats)")
ax.set_ylabel("Nombre d'utilisateurs (log)")
plt.show()

# Step 3 – keep only users with at least 5 events
is_active = df["total_events"] >= 5
filtered_df = df[is_active]
print(f"{len(filtered_df):,} utilisateurs retenus sur {len(df):,} après filtrage")

In [2]:
# Keep only users with at least 5 recorded events and report how many remain
enough_events = df["total_events"].ge(5)
filtered_df = df.loc[enough_events]
print(f"{len(filtered_df):,} utilisateurs retenus sur {len(df):,} après filtrage")

7,841,232 utilisateurs retenus sur 15,639,803 après filtrage


In [4]:
# Persist the filtered users to Parquet; the DataFrame index is not written
filtered_df.to_parquet(
    path="../.data/processed/filtered_user_features.parquet",
    index=False,
)

In [2]:
# Load the filtered user features back from the processed-data Parquet file
filtered_df = pd.read_parquet(
    path="../.data/processed/filtered_user_features.parquet",
)

In [3]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Columns used as clustering inputs
features = [
    "count_view",
    "count_cart",
    "count_purchase",
    "unique_sessions",
    "active_days",
    "recency_days",
    "total_spent",
    "avg_purchase_price",
]

# filtered_df already contains only the retained (active) users; take an
# independent copy of the selected columns so later edits to X do not touch it
X = filtered_df.loc[:, features].copy()

In [6]:
# Print a structural summary of the feature matrix: index range, column
# dtypes and memory usage.
X.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7841232 entries, 2 to 15639783
Data columns (total 8 columns):
 #   Column              Dtype  
---  ------              -----  
 0   count_view          int64  
 1   count_cart          int64  
 2   count_purchase      int64  
 3   unique_sessions     int64  
 4   active_days         int64  
 5   recency_days        int64  
 6   total_spent         float64
 7   avg_purchase_price  float64
dtypes: float64(2), int64(6)
memory usage: 538.4 MB


In [7]:
# Show descriptive statistics (count, mean, std, min, quartiles, max) for
# every feature column.
X.describe()

Unnamed: 0,count_view,count_cart,count_purchase,unique_sessions,active_days,recency_days,total_spent,avg_purchase_price
count,7841232.0,7841232.0,7841232.0,7841232.0,7841232.0,7841232.0,7841232.0,7841232.0
mean,47.33847,2.404705,0.8621257,10.20169,6.472449,2.860016,258.607,36.35623
std,150.1978,7.862739,4.427712,79.2509,8.551692,3.678121,1937.913,118.1796
min,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
25%,8.0,0.0,0.0,3.0,2.0,0.0,0.0,0.0
50%,17.0,0.0,0.0,5.0,4.0,1.0,0.0,0.0
75%,44.0,2.0,1.0,11.0,7.0,4.0,9.13,1.651429
max,199175.0,2342.0,2120.0,130669.0,217.0,21.0,790120.9,2574.07


In [4]:
from sklearn.preprocessing import StandardScaler
import numpy as np

# Work on a copy so the original feature matrix X is left untouched
X_transformed = X.copy()

# Columns compressed with log(1 + x) before scaling
log_features = [
    "count_view",
    "count_cart",
    "count_purchase",
    "total_spent",
    "avg_purchase_price",
]
X_transformed[log_features] = np.log1p(X_transformed[log_features])

# Final standardisation: fit the scaler on all features, then apply it
scaler = StandardScaler().fit(X_transformed)
X_scaled = scaler.transform(X_transformed)

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import logging

# Configure logging so progress is reported for every value of k
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The exact silhouette score needs every pairwise distance (quadratic in the
# number of rows), which is intractable on the full matrix. It is therefore
# estimated on a fixed-size, seeded random subsample.
SILHOUETTE_SAMPLE_SIZE = 10_000
silhouette_sample_size = min(SILHOUETTE_SAMPLE_SIZE, len(X_scaled))

inertias = []
silhouette_scores = []
k_range = range(2, 11)

for k in k_range:
    logger.info(f"Training KMeans with k={k}")
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)

    inertia = kmeans.inertia_
    silhouette = silhouette_score(
        X_scaled,
        kmeans.labels_,
        sample_size=silhouette_sample_size,
        random_state=42,
    )
    inertias.append(inertia)
    silhouette_scores.append(silhouette)

    # Log the scalars for this k; applying a float format spec to the
    # accumulated lists would raise a TypeError.
    logger.info(
        f"k={k}: Inertia={inertia:.2f}, Silhouette={silhouette:.4f}")

# Inertia plot (elbow method)
plt.figure(figsize=(10, 5))
plt.plot(k_range, inertias, marker='o')
plt.title("Méthode du coude – Inertie en fonction du nombre de clusters")
plt.xlabel("Nombre de clusters (k)")
plt.ylabel("Inertie intra-cluster")
plt.grid(True)
plt.show()

# Mean silhouette score plot
plt.figure(figsize=(10, 5))
plt.plot(k_range, silhouette_scores, marker='s', color='green')
plt.title("Score de silhouette moyen selon k")
plt.xlabel("Nombre de clusters (k)")
plt.ylabel("Silhouette moyenne")
plt.grid(True)
plt.show()

INFO:__main__:Training KMeans with k=2


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# 1. Assign a cluster label to every user in the dataset.
# NOTE(review): `kmeans` is not fitted in this cell; it is whatever estimator
# the preceding k-search loop left bound (its last iteration, k=10) — confirm
# this is the intended number of clusters.
filtered_df["cluster"] = kmeans.predict(X_scaled)

# 2. Per-cluster mean of the behavioural variables (profiling)
profile_columns = [
    "count_view",
    "count_cart",
    "count_purchase",
    "total_spent",
    "avg_purchase_price",
    "active_days",
    "recency_days",
]
cluster_means = filtered_df.groupby("cluster")[profile_columns].mean()
cluster_profile = cluster_means.round(1)

# 3. Render the profile as an annotated heatmap
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(cluster_profile, annot=True, fmt=".1f", cmap="YlGnBu")
heatmap_ax.set_title("Profil moyen des utilisateurs par cluster")
heatmap_ax.set_xlabel("Variables comportementales")
heatmap_ax.set_ylabel("Clusters")
plt.tight_layout()
plt.show()