# Analyse des Comportements Clients - Exploration des Donnees

Ce notebook effectue une analyse exploratoire complete des donnees de comportement des clients.

## Objectifs
1. Charger et explorer les donnees clients
2. Effectuer une evaluation de la qualite des donnees
3. Creer des visualisations initiales
4. Generer des statistiques de synthese
5. Identifier des modeles et des insights

## Table des Matieres
1. [Configuration de l'Environnement](#configuration-de-lenvironnement)
2. [Chargement des Donnees](#chargement-des-donnees)
3. [Apercu des Donnees](#apercu-des-donnees)
4. [Evaluation de la Qualite des Donnees](#evaluation-de-la-qualite-des-donnees)
5. [Visualisations Exploratoires](#visualisations-exploraaires)
6. [Statistiques de Synthese](#statistiques-de-synthese)
7. [Insights Cles](#insights-cles)


## Configuration de l'Environnement

Commencons par importer toutes les bibliotheques necessaires pour l'analyse de donnees et la visualisation.


In [None]:
# Import the libraries needed for data analysis and visualisation.
# NOTE: the module paths and pandas option keys below are the real library
# names; the previous spellings (e.g. "matgraphiquelib", "display.max_colonnes")
# do not exist and made this cell fail on import.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime, timedelta
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import random

# Plot styling
plt.style.use('default')  # Switched from 'seaborn-v0_8' to 'default' for compatibility
sns.set_palette("husl")
warnings.filterwarnings('ignore')

# pandas display options: show every column, let pandas pick the terminal
# width, and cap each cell at 50 characters
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

print("✅ Bibliotheques importees avec succes!")
print(f"📊 Pandas version: {pd.__version__}")
print(f"🔢 NumPy version: {np.__version__}")
# pyplot re-exports the parent package as ``plt.matplotlib``
print(f"📈 Matplotlib version: {plt.matplotlib.__version__}")
print(f"🎨 Seaborn version: {sns.__version__}")

# Fonctions de generation de donnees (integrees pour eviter les problemes d'import)
def get_donnees_echantillon(n_clients: int = 1000) -> pd.DataFrame:
    """Generate a reproducible synthetic customer-behaviour dataset.

    Both ``numpy.random`` and the standard-library ``random`` module are
    re-seeded with 42 on every call, so the same ``n_clients`` always yields
    the same frame. The order of the random draws below is part of that
    contract: reordering them changes the generated data.

    Args:
        n_clients: Number of customer rows to generate. Expected to be at
            least 1.

    Returns:
        A DataFrame with one row per customer and the columns ``id_client``,
        ``age``, ``genre``, ``ville``, ``date_inscription``, ``atal_achats``,
        ``atal_depense``, ``duree_moyenne_session``, ``vues_page_par_session``,
        ``taux_rebond``, ``derniere_activite``, ``type_abonnement``,
        ``type_appareil``, ``score_satisfaction`` (about 5% NaN),
        ``tickets_support`` and ``source_reference``.
    """
    np.random.seed(42)  # Reproducible results
    random.seed(42)

    # Customer IDs: CUST_0001, CUST_0002, ...
    ids_clients = [f"CUST_{i:04d}" for i in range(1, n_clients + 1)]

    # Demographics
    ages = np.random.normal(45, 15, n_clients).astype(int)
    ages = np.clip(ages, 18, 80)  # Keep ages in a plausible range

    genres = np.random.choice(['M', 'F', 'Other'], n_clients, p=[0.4, 0.5, 0.1])

    villes = np.random.choice([
        'Zurich', 'Geneva', 'Basel', 'Bern', 'Lausanne', 'Lucerne'
    ], n_clients)

    # Registration dates, spread uniformly over 2023
    start_date = datetime(2023, 1, 1)
    end_date = datetime(2023, 12, 31)
    date_range = (end_date - start_date).days

    dates_inscription = [
        start_date + timedelta(days=random.randint(0, date_range))
        for _ in range(n_clients)
    ]

    # Behavioural data with mild, realistic correlations.
    # Standardised age: older customers purchase slightly more.
    # When every age is identical (always the case for a single customer) the
    # standard deviation is 0; dividing by it would give NaN, which then turns
    # into a garbage integer in the purchase count. Use a neutral factor instead.
    age_std = ages.std()
    if age_std > 0:
        age_facar = (ages - ages.mean()) / age_std
    else:
        age_facar = np.zeros(n_clients)

    # Total number of purchases (clipped to 1-20, correlated with age)
    atal_achats = np.random.poisson(5, n_clients) + np.random.poisson(2, n_clients) * (1 + 0.1 * age_facar)
    atal_achats = np.clip(atal_achats, 1, 20).astype(int)

    # Total amount spent (correlated with purchases and age)
    base_spending = 50 + 30 * atal_achats + 2 * ages + np.random.normal(0, 50, n_clients)
    atal_depense = np.maximum(base_spending, 0.5)  # Floor at CHF 0.50

    # Session data
    duree_moyenne_session = np.random.exponential(10, n_clients) + 5
    duree_moyenne_session = np.clip(duree_moyenne_session, 0.5, 35)

    vues_page_par_session = np.random.poisson(8, n_clients) + np.random.poisson(3, n_clients)
    vues_page_par_session = np.clip(vues_page_par_session, 2, 25).astype(int)

    # Bounce rate (inversely related to session duration)
    taux_rebond = np.random.beta(2, 5, n_clients) * (1 - 0.3 * (duree_moyenne_session / duree_moyenne_session.max()))
    taux_rebond = np.clip(taux_rebond, 0.01, 0.8)

    # Last-activity dates: somewhere between registration and the end of 2023
    dates_derniere_activite = []
    for reg_date in dates_inscription:
        days_after_reg = random.randint(0, (end_date - reg_date).days)
        dates_derniere_activite.append(reg_date + timedelta(days=days_after_reg))

    # Subscription tiers, driven by the spending quartiles
    q25 = np.percentile(atal_depense, 25)
    q50 = np.percentile(atal_depense, 50)
    q75 = np.percentile(atal_depense, 75)

    types_abonnement = []
    for spending in atal_depense:
        if spending <= q25:
            types_abonnement.append('Basique')
        elif spending <= q50:
            types_abonnement.append(np.random.choice(['Basique', 'Premium'], p=[0.4, 0.6]))
        elif spending <= q75:
            types_abonnement.append(np.random.choice(['Premium', 'Entreprise'], p=[0.7, 0.3]))
        else:  # Top 25%
            types_abonnement.append(np.random.choice(['Premium', 'Entreprise'], p=[0.3, 0.7]))

    # Device types
    types_appareil = np.random.choice(['Mobile', 'Bureau', 'Tablette'], n_clients, p=[0.6, 0.3, 0.1])

    # Satisfaction scores: about 5% missing, slightly higher for big spenders
    max_depense = atal_depense.max()  # loop-invariant, computed once
    score_satisfactions = []
    for i in range(n_clients):
        if random.random() < 0.05:  # 5% missing values
            score_satisfactions.append(np.nan)
        else:
            base_satisfaction = 5 + 0.5 * (atal_depense[i] / max_depense) + np.random.normal(0, 1.5)
            score_satisfactions.append(np.clip(base_satisfaction, 1, 10))

    # Support tickets: more purchases and lower satisfaction both add tickets
    tickets_support = []
    for i in range(n_clients):
        base_tickets = np.random.poisson(1)
        if not np.isnan(score_satisfactions[i]):
            satisfaction_facar = max(0, 6 - score_satisfactions[i]) / 5
        else:
            # Unknown satisfaction: assume a middling dissatisfaction factor
            satisfaction_facar = 0.5
        tickets = base_tickets + int(atal_achats[i] * 0.1) + int(satisfaction_facar * 2)
        tickets_support.append(max(0, min(tickets, 8)))  # Clamp to 0-8 tickets

    # Referral sources
    sources_reference = np.random.choice([
        'Organique', 'Social Media', 'Email', 'Paye Ads', 'Reference'
    ], n_clients, p=[0.3, 0.25, 0.2, 0.15, 0.1])

    # Assemble the DataFrame
    df = pd.DataFrame({
        'id_client': ids_clients,
        'age': ages,
        'genre': genres,
        'ville': villes,
        'date_inscription': dates_inscription,
        'atal_achats': atal_achats,
        'atal_depense': atal_depense,
        'duree_moyenne_session': duree_moyenne_session,
        'vues_page_par_session': vues_page_par_session,
        'taux_rebond': taux_rebond,
        'derniere_activite': dates_derniere_activite,
        'type_abonnement': types_abonnement,
        'type_appareil': types_appareil,
        'score_satisfaction': score_satisfactions,
        'tickets_support': tickets_support,
        'source_reference': sources_reference
    })

    return df

def validate_donnees_client(df: pd.DataFrame) -> dict:
    """Summarise basic quality metrics of the customer frame.

    Args:
        df: The customer DataFrame to inspect.

    Returns:
        A dict with the row count (``atal_lignes``), column count
        (``atal_colonnes``), total number of null cells
        (``manquantes_values``), number of fully duplicated rows
        (``duplicate_lignes``), a column-to-dtype mapping (``donnees_types``),
        the column names (``column_names``), a ``validation_passed`` flag and
        a list of human-readable ``issues``. The flag is set to False only
        when the ``id_client`` column exists and contains duplicates.
    """
    # ``DataFrame.columns`` / ``DataFrame.dtypes.to_dict()`` are the real
    # pandas accessors; the earlier ``colonnes`` / ``types.a_dict()`` raised
    # AttributeError.
    validation_resultats = {
        'atal_lignes': len(df),
        'atal_colonnes': len(df.columns),
        'manquantes_values': df.isnull().sum().sum(),
        'duplicate_lignes': df.duplicated().sum(),
        'donnees_types': df.dtypes.to_dict(),
        'column_names': list(df.columns),
        'validation_passed': True,
        'issues': []
    }

    # Customer IDs must be unique
    if 'id_client' in df.columns:
        duplicate_ids = df['id_client'].duplicated().sum()
        if duplicate_ids > 0:
            validation_resultats['issues'].append(f"Found {duplicate_ids} duplicate cusamer IDs")
            validation_resultats['validation_passed'] = False

    return validation_resultats

# Confirmation message printed once the two helper definitions above have run.
print("✅ Donnees generation functions defined successfully!")


## Chargement des Donnees

Creons des donnees d'exemple de comportement client pour travailler. Dans un scenario reel, vous chargeriez votre jeu de donnees reel ici.


In [None]:
# Build the sample customer-behaviour dataset with the generator defined earlier
n_clients = 1000
df = get_donnees_echantillon(n_clients)

# Run the quality validation on the freshly generated frame
validation_resultats = validate_donnees_client(df)

# Emit the creation report in a single write, one entry per output line
print("\n".join([
    "✅ Donnees d'exemple de comportement client creees avec succes!",
    f"📊 Donneesset shape: {df.shape}",
    f"📅 Plage de dates: {df['date_inscription'].min().strftime('%Y-%m-%d')} a {df['date_inscription'].max().strftime('%Y-%m-%d')}",
    f"🔍 Donnees validation: {validation_resultats['atal_lignes']} lignes, {validation_resultats['atal_colonnes']} colonnes",
    f"⚠️  Manquant values: {validation_resultats['manquantes_values']}",
    f"🔄 Duplique lignes: {validation_resultats['duplicate_lignes']}",
]))


## Apercu des Donnees

Obtenons un apercu complet de la structure et du contenu de notre jeu de donnees.


In [None]:
# Show basic information about the dataset
print("=" * 60)
print("📊 APERCU DU JEU DE DONNEES")
print("=" * 60)

print(f"Donneesset Shape: {df.shape[0]:,} lignes × {df.shape[1]} colonnes")
print(f"Utilisation Memoire: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
print()

# Column information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() emitted a stray "None" line.
print("📋 INFORMATIONS SUR LES COLONNES")
print("-" * 40)
df.info()
print()

# First rows
print("👀 PREMIERES 5 LIGNES")
print("-" * 40)
display(df.head())
print()

# Last rows
print("👀 DERNIERES 5 LIGNES")
print("-" * 40)
display(df.tail())


In [None]:
# Show column names, dtypes and null counts.
# Uses the real pandas accessors (``columns``, ``dtypes``, ``select_dtypes``
# and the ``'object'`` dtype name); the earlier spellings raised AttributeError.
print("📝 DETAILS DES COLONNES")
print("-" * 50)
column_info = pd.DataFrame({
    'Column': df.columns,
    'Donnees Type': df.dtypes,
    'Non-Null Nombre': df.count(),
    'Null Nombre': df.isnull().sum(),
    'Null Percentage': (df.isnull().sum() / len(df) * 100).round(2)
})
display(column_info)
print()

# Show the distinct values of every categorical (object-dtype) column
print("🔍 VARIABLES CATEGORIQUES")
print("-" * 50)
categorique_cols = df.select_dtypes(include=['object']).columns
for col in categorique_cols:
    unique_count = df[col].nunique()
    print(f"{col}: {unique_count} unique values")
    if unique_count <= 10:
        print(f"  Values: {list(df[col].unique())}")
    else:
        # High-cardinality column (e.g. IDs): only show a five-value sample
        print(f"  Sample values: {list(df[col].unique()[:5])}...")
    print()


## Evaluation de la Qualite des Donnees

Evaluation complete de la qualite des donnees incluant les valeurs manquantes, les doublons, les valeurs aberrantes et les verifications de coherence des donnees.


In [None]:
# 1. Missing-values analysis
print("=" * 60)
print("🔍 ANALYSE DES VALEURS MANQUANTES")
print("=" * 60)

# Null count and null percentage per column
manquantes_donnees = df.isnull().sum()
manquantes_percentage = (manquantes_donnees / len(df)) * 100

# Summary table, most-affected columns first
manquantes_resume = pd.DataFrame({
    'Column': manquantes_donnees.index,
    'Manquant Nombre': manquantes_donnees.values,
    'Manquant Percentage': manquantes_percentage.values
}).sort_values('Manquant Nombre', ascending=False)

# Keep only the columns that actually have missing values
manquantes_resume = manquantes_resume[manquantes_resume['Manquant Nombre'] > 0]

if len(manquantes_resume) > 0:
    print("⚠️  Colonnes avec des valeurs manquantes:")
    display(manquantes_resume)

    # Side-by-side bar charts: absolute counts (left) and percentages (right).
    # The matplotlib/pandas calls use their real names (figsize, subplot,
    # .plot(color=...), title, xlabel, ylabel).
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    manquantes_donnees[manquantes_donnees > 0].plot(kind='bar', color='salmon')
    plt.title('Manquant Values Nombre by Column')
    plt.xlabel('Columns')
    plt.ylabel('Manquant Nombre')
    plt.xticks(rotation=45)

    plt.subplot(1, 2, 2)
    manquantes_percentage[manquantes_percentage > 0].plot(kind='bar', color='lightcoral')
    plt.title('Manquant Values Percentage by Column')
    plt.xlabel('Columns')
    plt.ylabel('Manquant Percentage (%)')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()
else:
    print("✅ Aucune valeur manquante trouvee dans le jeu de donnees!")
    print("🎉 Donnees completeness: 100%")


In [None]:
# 2. Duplicate-records analysis
print(f"\n{'=' * 60}")
print("🔄 ANALYSE DES ENREGISTREMENTS DUPLIQUES")
print("=" * 60)

# Count rows that are exact copies of an earlier row
duplicate_lignes = df.duplicated().sum()
print(f"Total duplicate lignes: {duplicate_lignes}")

if not duplicate_lignes > 0:
    print("✅ Aucune ligne dupliquee trouvee!")
else:
    print(f"Percentage of duplicate lignes: {(duplicate_lignes / len(df)) * 100:.2f}%")
    print("\nSample duplicate lignes:")
    # keep=False marks every member of a duplicate group, not only the repeats
    display(df[df.duplicated(keep=False)].head())

# Customer IDs are expected to be unique, so count repeated IDs separately
duplicate_ids_clients = df['id_client'].duplicated().sum()
print(f"\nDuplique cusamer IDs: {duplicate_ids_clients}")
if duplicate_ids_clients != 0:
    print("⚠️  Trouve des IDs clients dupliques - cela necessite une investigation!")
else:
    print("✅ Tous les IDs clients sont uniques!")

In [None]:
# 3. Data-type validation
print("\n" + "=" * 60)
print("📊 VALIDATION DES TYPES DE DONNEES")
print("=" * 60)

# List the current dtype of every column together with a few sample values
print("Current donnees types:")
donnees_types = pd.DataFrame({
    'Column': df.dtypes.index,
    'Current Type': df.dtypes.values,
    'Sample Values': [str(list(df[col].dropna().head(3))) for col in df.columns]
})
display(donnees_types)

# Columns that are expected to hold numeric data
numerique_colonnes = ['age', 'atal_achats', 'atal_depense', 'duree_moyenne_session',
                   'vues_page_par_session', 'taux_rebond', 'score_satisfaction', 'tickets_support']

# Look for potential dtype problems
print("\n🔍 Donnees Type Issues Check:")

# Columns pandas already treats as numeric
numerique_cols = df.select_dtypes(include=[np.number]).columns

# Flag expected-numeric columns that were loaded as plain objects. The check
# runs over the expected list: iterating only over columns already selected as
# numeric could never find an object-dtype column.
for col in numerique_colonnes:
    if col in df.columns and df[col].dtype == object:
        print(f"⚠️  {col}: Should be numerique but is objet type")

# Range validation: report min/max and flag values outside the logical range
print("\n📈 Value Plage Validation:")

for col in numerique_colonnes:
    if col in df.columns:
        min_val = df[col].min()
        max_val = df[col].max()
        print(f"{col}: Plage [{min_val:.2f}, {max_val:.2f}]")

        # Check for logical issues
        if col == 'age' and (min_val < 0 or max_val > 120):
            print(f"  ⚠️  {col}: Unusual age values detected!")
        elif col == 'taux_rebond' and (min_val < 0 or max_val > 1):
            print(f"  ⚠️  {col}: Rebond rate should be between 0 and 1!")
        elif col == 'score_satisfaction' and (min_val < 1 or max_val > 10):
            print(f"  ⚠️  {col}: Satisfaction score should be between 1 and 10!")
        elif col in ['atal_achats', 'vues_page_par_session', 'tickets_support'] and min_val < 0:
            print(f"  ⚠️  {col}: Negative values detected!")


In [None]:
# 4. Outlier detection: print the section banner
print("\n" + "=" * 60)
print("🎯 DETECTION DES VALEURS ABERRANTES")
print("=" * 60)

# Helper that flags outliers with the interquartile-range (IQR) rule
def detect_valeurs_aberrantes_iqr(donnees, column):
    """Return the rows whose ``column`` value falls outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. Rows whose value
    is NaN compare false against both fences and are therefore never flagged.

    Args:
        donnees: DataFrame to scan.
        column: Name of the numeric column to test.

    Returns:
        A tuple ``(outlier_rows, lower_bound, upper_bound)``.
    """
    serie = donnees[column]
    premier_quartile = serie.quantile(0.25)
    troisieme_quartile = serie.quantile(0.75)
    ecart = troisieme_quartile - premier_quartile

    lower_bound = premier_quartile - 1.5 * ecart
    upper_bound = troisieme_quartile + 1.5 * ecart

    # Strict comparisons: values sitting exactly on a fence are not outliers
    hors_limites = serie.lt(lower_bound) | serie.gt(upper_bound)
    return donnees[hors_limites], lower_bound, upper_bound

# Check the numeric columns for outliers.
# Uses the real pandas/matplotlib names (``df.columns``, ``plt.subplots``,
# ``figsize``, ``df.boxplot``, ``set_title``); the earlier spellings raised
# AttributeError/TypeError.
numerique_colonnes = ['age', 'atal_achats', 'atal_depense', 'duree_moyenne_session',
                   'vues_page_par_session', 'taux_rebond', 'score_satisfaction', 'tickets_support']

outlier_resume = []

for col in numerique_colonnes:
    if col in df.columns:
        valeurs_aberrantes, lower_bound, upper_bound = detect_valeurs_aberrantes_iqr(df, col)
        outlier_count = len(valeurs_aberrantes)
        outlier_percentage = (outlier_count / len(df)) * 100

        # One summary row per column; numbers are pre-formatted for display
        outlier_resume.append({
            'Column': col,
            'Aberrant Nombre': outlier_count,
            'Aberrant Percentage': f"{outlier_percentage:.2f}%",
            'Faibleer Bound': f"{lower_bound:.2f}",
            'Limite Superieure': f"{upper_bound:.2f}",
            'Min Value': f"{df[col].min():.2f}",
            'Max Value': f"{df[col].max():.2f}"
        })

outlier_df = pd.DataFrame(outlier_resume)
display(outlier_df)

# Box plots of the key columns, one per subplot
key_colonnes = ['atal_depense', 'duree_moyenne_session', 'vues_page_par_session']
fig, axes = plt.subplots(1, 3, figsize=(18, 6))

for i, col in enumerate(key_colonnes):
    if col in df.columns:
        df.boxplot(column=col, ax=axes[i])
        axes[i].set_title(f'Aberrants in {col}')
        axes[i].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# 5. Data consistency checks
# The emitted labels previously contained mojibake ("CohÃ©rence", UTF-8 bytes
# decoded as Latin-1); they now print the intended "Cohérence".
print("\n" + "=" * 60)
print("✅ VERIFICATIONS DE COHERENCE DES DONNEES")
print("=" * 60)

# Date consistency
print("📅 Date Cohérence:")
print(f"Inscription date range: {df['date_inscription'].min()} a {df['date_inscription'].max()}")
print(f"Last activity range: {df['derniere_activite'].min()} a {df['derniere_activite'].max()}")

# Last activity must not precede registration
date_issues = df[df['derniere_activite'] < df['date_inscription']]
print(f"Clients with last activity before registration: {len(date_issues)}")

if len(date_issues) > 0:
    print("⚠️  Trouve des incoherences de dates!")
    display(date_issues[['id_client', 'date_inscription', 'derniere_activite']].head())
else:
    print("✅ Toutes les dates sont logiquement coherentes!")

# Categorical value consistency
print("\n🏷️  Categorical Value Cohérence:")
categorique_colonnes = ['genre', 'type_abonnement', 'type_appareil', 'source_reference']

for col in categorique_colonnes:
    if col in df.columns:
        unique_values = df[col].unique()
        print(f"{col}: {len(unique_values)} unique values - {list(unique_values)}")

        # Look for typos or inconsistent spellings in the gender column
        if col == 'genre':
            expected_values = ['M', 'F', 'Other', 'Male', 'Female']
            unexpected = [val for val in unique_values if val not in expected_values]
            if unexpected:
                print(f"  ⚠️  Unexpected genre values: {unexpected}")

# Business-logic consistency
print("\n💼 Commercial Logic Cohérence:")

# Compare the mean satisfaction of the top 10% spenders with the overall mean
high_spenders = df[df['atal_depense'] > df['atal_depense'].quantile(0.9)]
high_spender_satisfaction = high_spenders['score_satisfaction'].mean()
overall_satisfaction = df['score_satisfaction'].mean()

print(f"Eleve spender satisfaction (ap 10%): {high_spender_satisfaction:.2f}")
print(f"Overall satisfaction: {overall_satisfaction:.2f}")

# Relationship between bounce rate and session duration
print(f"\nRebond rate vs Session duration correlation: {df['taux_rebond'].corr(df['duree_moyenne_session']):.3f}")
print("(Expected: Negative correlation - higher bounce rate should mean shorter sessions)")


In [None]:
# 6. Data-quality summary report
# Uses the real pandas accessors (``df.columns``, ``select_dtypes`` and the
# ``'object'`` dtype name); the earlier spellings raised AttributeError.
print("\n" + "=" * 60)
print("📋 RAPPORT DE SYNTHESE DE LA QUALITE DES DONNEES")
print("=" * 60)

# Overall score: every category contributes a weighted penalty, and the
# weights (3 + 2 + 2 + 2 + 1) add up to max_possible_issues.
atal_issues = 0
max_possible_issues = 10

# Missing values (weight 3): share of non-null cells
manquantes_score = 1 - (df.isnull().sum().sum() / (len(df) * len(df.columns)))
atal_issues += (1 - manquantes_score) * 3

# Duplicates (weight 2): share of non-duplicated rows
duplicate_score = 1 - (df.duplicated().sum() / len(df))
atal_issues += (1 - duplicate_score) * 2

# Dtype consistency (weight 2): key numeric columns stored as objects
type_issues = 0
for col in df.select_dtypes(include=['object']).columns:
    if col in ['age', 'atal_achats', 'atal_depense']:  # Should be numeric
        type_issues += 1
type_score = 1 - (type_issues / len(df.columns))
atal_issues += (1 - type_score) * 2

# Outliers (weight 2): a column counts as an issue above 10% IQR outliers
outlier_issues = 0
for col in ['atal_depense', 'duree_moyenne_session', 'vues_page_par_session']:
    if col in df.columns:
        valeurs_aberrantes, _, _ = detect_valeurs_aberrantes_iqr(df, col)
        outlier_pct = len(valeurs_aberrantes) / len(df)
        if outlier_pct > 0.1:  # More than 10% outliers
            outlier_issues += 1
outlier_score = 1 - (outlier_issues / 3)
atal_issues += (1 - outlier_score) * 2

# Consistency (weight 1): any last-activity date before registration
coherence_issues = 0
if len(df[df['derniere_activite'] < df['date_inscription']]) > 0:
    coherence_issues += 1
coherence_score = 1 - (coherence_issues / 1)
atal_issues += (1 - coherence_score) * 1

# Final quality score, floored at 0
qualite_score = max(0, 1 - (atal_issues / max_possible_issues))

print(f"🎯 Overall Donnees Qualite Score: {qualite_score:.2%}")
print()

# Map the score to a rating
if qualite_score >= 0.9:
    qualite_rating = "🟢 EXCELLENT"
elif qualite_score >= 0.8:
    qualite_rating = "🟡 BON"
elif qualite_score >= 0.7:
    qualite_rating = "🟠 MOYEN"
else:
    qualite_rating = "🔴 FAIBLE"

print(f"Qualite Rating: {qualite_rating}")
print()

# Recommendations: one line per category that lost points
print("📝 RECOMMANDATIONS:")
if manquantes_score < 1:
    print("• Adresser les valeurs manquantes par imputation ou collecte de donnees")
if duplicate_score < 1:
    print("• Supprimer ou investiguer les enregistrements dupliques")
if type_score < 1:
    print("• Corriger les types de donnees pour une meilleure analyse")
if outlier_score < 1:
    print("• Investiguer et gerer les valeurs aberrantes de maniere appropriee")
if coherence_score < 1:
    print("• Corriger les problemes de coherence des donnees")

print(f"\n✅ Donnees is ready for analyse with {qualite_score:.1%} score de qualite!")


<a id="visualisations-exploraaires"></a>
## Visualisations Exploratoires

Creer des visualisations completes pour comprendre les distributions de donnees, les relations et les modeles dans le comportement des clients.


In [None]:
# 1. Distribution analysis - numeric variables
# Uses the real matplotlib/pandas names (subplots, figsize, color, edgecolor,
# linewidth, label, set_title, fontsize, set_xlabel, set_ylabel, legend).
print("=" * 60)
print("📊 ANALYSE DE DISTRIBUTION - VARIABLES NUMERIQUES")
print("=" * 60)

# Key numeric variables for the distribution analysis
numerique_vars = ['age', 'atal_achats', 'atal_depense', 'duree_moyenne_session',
                'vues_page_par_session', 'taux_rebond', 'score_satisfaction', 'tickets_support']

# Keep only the columns that exist in the frame
numerique_vars = [var for var in numerique_vars if var in df.columns]

# Grid layout: at most 4 plots per row
n_vars = len(numerique_vars)
n_cols = min(4, n_vars)
n_lignes = (n_vars + n_cols - 1) // n_cols  # Ceiling division

# squeeze=False always returns a 2-D array of Axes, so ravel() is safe for any
# grid size. Wrapping a single Axes in a list and then calling .ravel() on
# that list failed when only one variable was present.
fig, axes = plt.subplots(n_lignes, n_cols, figsize=(5*n_cols, 4*n_lignes), squeeze=False)
axes = axes.ravel()

for ax, var in zip(axes, numerique_vars):
    # Histogram
    df[var].hist(bins=30, alpha=0.7, ax=ax, color='skyblue', edgecolor='black')

    # Overlay mean and median markers
    mean_val = df[var].mean()
    median_val = df[var].median()
    ax.axvline(mean_val, color='green', linestyle='--', linewidth=2, label=f'Moyenne: {mean_val:.2f}')
    ax.axvline(median_val, color='orange', linestyle='--', linewidth=2, label=f'Mediane: {median_val:.2f}')

    ax.set_title(f'Distribution of {var}', fontsize=12, fontweight='bold')
    ax.set_xlabel(var)
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

# Hide the unused subplots of the grid
for ax in axes[n_vars:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Summary statistics for the plotted distributions
print("\n📈 DISTRIBUTION SUMMARY STATISTICS:")
print("-" * 50)
distribution_stats = df[numerique_vars].describe()
display(distribution_stats)


In [None]:
# 2. Categorical variables analysis
# Uses the real matplotlib names (subplots, figsize, color, get_width,
# get_height, va='bottom', fontsize, set_title, set_xlabel, set_ylabel,
# set_xticklabels).
print("\n" + "=" * 60)
print("🏷️  VARIABLES CATEGORIQUES ANALYSIS")
print("=" * 60)

# Categorical variables to analyse
categorique_vars = ['genre', 'ville', 'type_abonnement', 'type_appareil', 'source_reference']

# Keep only the columns that exist in the frame
categorique_vars = [var for var in categorique_vars if var in df.columns]

# Grid layout: at most 3 plots per row
n_vars = len(categorique_vars)
n_cols = min(3, n_vars)
n_lignes = (n_vars + n_cols - 1) // n_cols  # Ceiling division

# squeeze=False always returns a 2-D array of Axes, so ravel() is safe for any
# grid size. Wrapping a single Axes in a list and then calling .ravel() on
# that list failed when only one variable was present.
fig, axes = plt.subplots(n_lignes, n_cols, figsize=(6*n_cols, 5*n_lignes), squeeze=False)
axes = axes.ravel()

for ax, var in zip(axes, categorique_vars):
    # Count plot, one colour per category
    value_counts = df[var].value_counts()
    bars = ax.bar(range(len(value_counts)), value_counts.values,
                  color=plt.cm.Set3(np.linspace(0, 1, len(value_counts))))

    # Annotate each bar with its count and share of all rows
    for bar, count in zip(bars, value_counts.values):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
                f'{count}\n({count/len(df)*100:.1f}%)',
                ha='center', va='bottom', fontsize=10)

    ax.set_title(f'Distribution of {var}', fontsize=12, fontweight='bold')
    ax.set_xlabel(var)
    ax.set_ylabel('Nombre')
    ax.set_xticks(range(len(value_counts)))
    ax.set_xticklabels(value_counts.index, rotation=45, ha='right')
    ax.grid(True, alpha=0.3)

# Hide the unused subplots of the grid
for ax in axes[n_vars:]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

# Print counts and percentages per category
print("\n📊 VARIABLES CATEGORIQUES SUMMARY:")
print("-" * 50)
for var in categorique_vars:
    if var in df.columns:
        print(f"\n{var}:")
        value_counts = df[var].value_counts()
        for value, count in value_counts.items():
            percentage = (count / len(df)) * 100
            print(f"  {value}: {count} ({percentage:.1f}%)")


In [None]:
# 3. Correlation analysis
# Uses the real library names (figsize, linewidths, plt.title, fontsize,
# DataFrame.columns, DataFrame.iterrows, plt.subplots, color, set_xlabel,
# set_ylabel, set_title).
print("\n" + "=" * 60)
print("🔗 ANALYSE DE CORRELATION")
print("=" * 60)

# Numeric variables that enter the correlation matrix
correlation_vars = ['age', 'atal_achats', 'atal_depense', 'duree_moyenne_session',
                   'vues_page_par_session', 'taux_rebond', 'score_satisfaction', 'tickets_support']

# Keep only the columns that exist in the frame
correlation_vars = [var for var in correlation_vars if var in df.columns]

if len(correlation_vars) > 1:
    corr_matrix = df[correlation_vars].corr()
else:
    print("⚠️  Not enough numerique variables for correlation analyse")
    corr_matrix = pd.DataFrame()

# Correlation heat map
if not corr_matrix.empty:
    plt.figure(figsize=(12, 10))
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))  # Hide the upper triangle
    sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='RdYlBu_r', center=0,
                square=True, linewidths=0.5, cbar_kws={"shrink": 0.8}, fmt='.2f')
    plt.title('Correlation Matrix - Client Comportement Variables', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Find the strongest correlations
    print("\n🔍 STRONGEST CORRELATIONS:")
    print("-" * 40)

    # Collect each unordered pair once (upper triangle, diagonal excluded)
    corr_pairs = []
    for i in range(len(corr_matrix.columns)):
        for j in range(i+1, len(corr_matrix.columns)):
            corr_pairs.append({
                'Variable 1': corr_matrix.columns[i],
                'Variable 2': corr_matrix.columns[j],
                'Correlation': corr_matrix.iloc[i, j]
            })

    corr_df = pd.DataFrame(corr_pairs)
    corr_df['Abs_Correlation'] = corr_df['Correlation'].abs()
    strongest_corr = corr_df.nlargest(5, 'Abs_Correlation')

    for _, row in strongest_corr.iterrows():
        print(f"{row['Variable 1']} ↔ {row['Variable 2']}: {row['Correlation']:.3f}")

    # Scatter plots for the (up to three) strongest correlations
    if len(strongest_corr) > 0:
        n_graphiques = min(3, len(strongest_corr))
        # squeeze=False keeps the result indexable even for a single subplot
        fig, axes = plt.subplots(1, n_graphiques, figsize=(5*n_graphiques, 5), squeeze=False)
        axes = axes.ravel()

        for i, (_, row) in enumerate(strongest_corr.head(n_graphiques).iterrows()):
            var1, var2 = row['Variable 1'], row['Variable 2']
            axes[i].scatter(df[var1], df[var2], alpha=0.6, color='steelblue')
            axes[i].set_xlabel(var1)
            axes[i].set_ylabel(var2)
            axes[i].set_title(f'{var1} vs {var2}\nCorrelation: {row["Correlation"]:.3f}')
            axes[i].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()
else:
    print("⚠️  Cannot create correlation analyse - insufficient numerique variables")


In [None]:
# 4. Customer segmentation analysis
print("\n" + "=" * 60)
print("👥 ANALYSE DE SEGMENTATION DES CLIENTS")
print("=" * 60)

# Spending segments.
# pd.cut takes the bin names through ``labels=``. The earlier ``etiquettes=``
# keyword raised TypeError, which the broad handler swallowed, so every
# customer silently ended up in the 'Unknown' segment. The handler now only
# covers a missing column or invalid bins/labels.
try:
    df['spending_segment'] = pd.cut(df['atal_depense'],
                                   bins=[0, 200, 500, 1000, float('inf')],
                                   labels=['Faible', 'Moyen', 'Eleve', 'VIP'])
except (KeyError, TypeError, ValueError) as e:
    print(f"Warning: Could not create spending segments: {e}")
    df['spending_segment'] = 'Unknown'

# Age segments
try:
    df['age_segment'] = pd.cut(df['age'],
                              bins=[0, 25, 35, 50, 65, 100],
                              labels=['18-25', '26-35', '36-50', '51-65', '65+'])
except (KeyError, TypeError, ValueError) as e:
    print(f"Warning: Could not create age segments: {e}")
    df['age_segment'] = 'Unknown'

# Average spending per segment, one chart per grouping
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Spending by subscription type. Only the mean is plotted: the count column
# is in a different unit and does not belong on a "mean spent (CHF)" axis.
spending_by_subscription = df.groupby('type_abonnement')['atal_depense'].agg(['mean', 'count'])
spending_by_subscription['mean'].plot(kind='bar', ax=axes[0,0], color='lightcoral')
axes[0,0].set_title('Moyen Depenses by Abonnement Type')
axes[0,0].set_ylabel('Moyen Total Spent (CHF)')
axes[0,0].tick_params(axis='x', rotation=45)

# Spending by age segment. observed=False keeps empty categories in the
# result and states the grouping behaviour for categorical keys explicitly.
spending_by_age = df.groupby('age_segment', observed=False)['atal_depense'].mean()
spending_by_age.plot(kind='bar', ax=axes[0,1], color='lightblue')
axes[0,1].set_title('Moyen Depenses by Age Segment')
axes[0,1].set_ylabel('Moyen Total Spent (CHF)')
axes[0,1].tick_params(axis='x', rotation=45)

# Spending by device type
spending_by_device = df.groupby('type_appareil')['atal_depense'].mean()
spending_by_device.plot(kind='bar', ax=axes[1,0], color='lightgreen')
axes[1,0].set_title('Moyen Depenses by Appareil Type')
axes[1,0].set_ylabel('Moyen Total Spent (CHF)')
axes[1,0].tick_params(axis='x', rotation=45)

# Spending by city, highest first
spending_by_ville = df.groupby('ville')['atal_depense'].mean().sort_values(ascending=False)
spending_by_ville.plot(kind='bar', ax=axes[1,1], color='gold')
axes[1,1].set_title('Moyen Depenses by Ville')
axes[1,1].set_ylabel('Moyen Total Spent (CHF)')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Print the segmentation summary
print("\n📊 CUSTOMER SEGMENTATION SUMMARY:")
print("-" * 50)
print("Depenses Segments:")
spending_segment_resume = df['spending_segment'].value_counts()
for segment, count in spending_segment_resume.items():
    percentage = (count / len(df)) * 100
    avg_spending = df[df['spending_segment'] == segment]['atal_depense'].mean()
    print(f"  {segment}: {count} cusamers ({percentage:.1f}%) - Avg: CHF{avg_spending:.2f}")

print("\nAge Segments:")
age_segment_resume = df['age_segment'].value_counts()
for segment, count in age_segment_resume.items():
    percentage = (count / len(df)) * 100
    print(f"  {segment}: {count} cusamers ({percentage:.1f}%)")


In [None]:
# 5. Interactive visualisations with Plotly
#
# Shows three Plotly Express figures (scatter, box, histogram). If any of them
# fails, static matplotlib equivalents are drawn instead.
print("\n" + "=" * 60)
print("🎨 VISUALISATIONS INTERACTIVES")
print("=" * 60)

# Build the cleaned frame BEFORE the try block: the static fallback in the
# except branch reads df_clean too, so it must exist even when the Plotly code
# fails. A failure here now surfaces directly instead of as a NameError later.
df_clean = df.dropna(subset=['score_satisfaction']).copy()

# Cast the grouping columns to str so they have a uniform dtype for plotting.
for column in ('type_abonnement', 'ville', 'genre', 'type_appareil'):
    df_clean[column] = df_clean[column].astype(str)

print(f"📊 Using {len(df_clean)} cusamers with complete satisfaction donnees")

try:
    # Interactive scatter: total spend vs satisfaction score,
    # coloured by subscription type and sized by number of purchases.
    print("Creating scatter graphique...")
    fig_scatter = px.scatter(
        df_clean,
        x='atal_depense',
        y='score_satisfaction',
        color='type_abonnement',
        size='atal_achats',
        hover_data=['age', 'ville', 'type_appareil'],
        title='Client Depenses vs Satisfaction by Abonnement Type',
        labels={
            'atal_depense': 'Total Spent (CHF)',
            'score_satisfaction': 'Satisfaction Score',
            'type_abonnement': 'Abonnement Type',
            'atal_achats': 'Total Achats',
        },
    )
    fig_scatter.update_layout(
        width=800,
        height=600,
        showlegend=True,
        title_x=0.5,
    )
    fig_scatter.show()

    # Interactive box plot: spend distribution per city, split by subscription type.
    print("Creating box graphique...")
    fig_box = px.box(
        df_clean,
        x='ville',
        y='atal_depense',
        color='type_abonnement',
        title='Depenses Distribution by Ville and Abonnement Type',
        labels={
            'atal_depense': 'Total Spent (CHF)',
            'ville': 'Ville',
            'type_abonnement': 'Abonnement Type',
        },
    )
    fig_box.update_layout(
        width=1000,
        height=600,
        xaxis_tickangle=-45,
        title_x=0.5,
    )
    fig_box.show()

    # Interactive histogram: age distribution split by gender.
    print("Creating hisagram...")
    fig_hist = px.histogram(
        df_clean,
        x='age',
        color='genre',
        nbins=20,
        title='Age Distribution by Genre',
        labels={
            'age': 'Age',
            'count': 'Number of Clients',
            'genre': 'Genre',
        },
    )
    fig_hist.update_layout(
        width=800,
        height=500,
        showlegend=True,
        title_x=0.5,
    )
    fig_hist.show()

    print("✅ Interactive visualisations created successfully!")
    print("💡 Hover over the graphiques a see detailed information!")
    print("🔍 Utiliser the legende a filter by different categories!")

except Exception as e:
    print(f"❌ Error creating interactive visualisations: {e}")
    print("🔄 Creating fallback static visualisations...")

    # Fallback: static matplotlib versions of the same three charts.
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))

    # Scatter plot, one series per subscription type.
    for sub_type in df_clean['type_abonnement'].unique():
        mask = df_clean['type_abonnement'] == sub_type
        axes[0].scatter(
            df_clean.loc[mask, 'atal_depense'],
            df_clean.loc[mask, 'score_satisfaction'],
            label=sub_type,
            alpha=0.6,
        )
    axes[0].set_xlabel('Total Spent (CHF)')
    axes[0].set_ylabel('Satisfaction Score')
    axes[0].set_title('Depenses vs Satisfaction')
    axes[0].legend()
    axes[0].grid(True, alpha=0.3)

    # Box plot of spend per city.
    df_clean.boxplot(column='atal_depense', by='ville', ax=axes[1])
    axes[1].set_title('Depenses by Ville')
    axes[1].set_xlabel('Ville')
    axes[1].set_ylabel('Total Spent (CHF)')
    axes[1].tick_params(axis='x', rotation=45)

    # Overlaid age histograms, one per gender.
    for genre in df_clean['genre'].unique():
        mask = df_clean['genre'] == genre
        axes[2].hist(df_clean.loc[mask, 'age'], alpha=0.7, label=genre, bins=20)
    axes[2].set_xlabel('Age')
    axes[2].set_ylabel('Number of Clients')
    axes[2].set_title('Age Distribution by Genre')
    axes[2].legend()
    axes[2].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
    print("✅ Fallback static visualisations created successfully!")


In [None]:
# 6. Time-based analysis
#
# Adds month / weekday / lifetime columns to df, draws a 2x2 grid of temporal
# charts and prints the headline figures.
print("\n" + "=" * 60)
print("📅 ANALYSE TEMPORELLE")
print("=" * 60)

# Convert the date columns to datetime (a no-op if they already are).
df['date_inscription'] = pd.to_datetime(df['date_inscription'])
df['derniere_activite'] = pd.to_datetime(df['derniere_activite'])

# Registrations per calendar month, in chronological order.
df['registration_month'] = df['date_inscription'].dt.to_period('M')
monthly_registrations = df['registration_month'].value_counts().sort_index()

# Last-activity dates per calendar month, in chronological order.
df['activity_month'] = df['derniere_activite'].dt.to_period('M')
monthly_activity = df['activity_month'].value_counts().sort_index()

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Monthly registrations.
monthly_registrations.plot(kind='line', ax=axes[0, 0], marker='o', color='blue', linewidth=2)
axes[0, 0].set_title('Monthly Client Inscriptions', fontweight='bold')
axes[0, 0].set_xlabel('Month')
axes[0, 0].set_ylabel('Number of Inscriptions')
axes[0, 0].grid(True, alpha=0.3)
axes[0, 0].tick_params(axis='x', rotation=45)

# Monthly activity.
monthly_activity.plot(kind='line', ax=axes[0, 1], marker='s', color='green', linewidth=2)
axes[0, 1].set_title('Monthly Client Activite', fontweight='bold')
axes[0, 1].set_xlabel('Month')
axes[0, 1].set_ylabel('Number of Activities')
axes[0, 1].grid(True, alpha=0.3)
axes[0, 1].tick_params(axis='x', rotation=45)

# Customer lifetime: whole days between registration and last activity.
df['cusamer_lifetime_days'] = (df['derniere_activite'] - df['date_inscription']).dt.days
df['cusamer_lifetime_days'].hist(bins=30, ax=axes[1, 0], color='orange', alpha=0.7, edgecolor='black')
axes[1, 0].set_title('Client Lifetime Distribution', fontweight='bold')
axes[1, 0].set_xlabel('Days')
axes[1, 0].set_ylabel('Number of Clients')
axes[1, 0].grid(True, alpha=0.3)

# Registrations per weekday, forced into Monday..Sunday order. fill_value=0
# makes a weekday with no registrations show as 0 rather than NaN.
df['registration_dow'] = df['date_inscription'].dt.day_name()
dow_registrations = df['registration_dow'].value_counts()
dow_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_registrations = dow_registrations.reindex(dow_order, fill_value=0)
dow_registrations.plot(kind='bar', ax=axes[1, 1], color='purple', alpha=0.7)
axes[1, 1].set_title('Inscriptions by Day of Week', fontweight='bold')
axes[1, 1].set_xlabel('Day of Week')
axes[1, 1].set_ylabel('Number of Inscriptions')
axes[1, 1].tick_params(axis='x', rotation=45)
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print the temporal insights.
print("\n📊 TIME-BASED INSIGHTS:")
print("-" * 40)
print(f"Moyen cusamer lifetime: {df['cusamer_lifetime_days'].mean():.1f} days")
print(f"Mediane cusamer lifetime: {df['cusamer_lifetime_days'].median():.1f} days")
print(f"Peak registration month: {monthly_registrations.idxmax()}")
print(f"Peak activity month: {monthly_activity.idxmax()}")
print(f"Most popular registration day: {dow_registrations.idxmax()}")


## Statistiques de Synthese

Statistiques de synthese completes, metriques commerciales et insights cles de l'analyse du comportement des clients.


In [None]:
# 1. Executive summary statistics
#
# Computes the headline business metrics (kept as module-level names because
# later cells read them) and prints a demographic overview.
print("=" * 80)
print("📊 SYNTHESE EXECUTIVE - ANALYSE DES COMPORTEMENTS CLIENTS")
print("=" * 80)


def _column_stat(column, stat):
    """Return ``df[column].agg(stat)``, or 0 when df has no such column."""
    if column not in df.columns:
        return 0
    return df[column].agg(stat)


# Key business metrics.
atal_cusamers = len(df)
atal_revenue = _column_stat('atal_depense', 'sum')
avg_revenue_per_cusamer = _column_stat('atal_depense', 'mean')
median_revenue_per_cusamer = _column_stat('atal_depense', 'median')
atal_achats = _column_stat('atal_achats', 'sum')
avg_purchases_per_cusamer = _column_stat('atal_achats', 'mean')
avg_satisfaction = _column_stat('score_satisfaction', 'mean')
duree_moyenne_session = _column_stat('duree_moyenne_session', 'mean')

print(f"👥 Total Clients: {atal_cusamers:,}")
print(f"💰 Total Revenus: CHF{atal_revenue:,.2f}")
print(f"💵 Moyen Revenus per Client: CHF{avg_revenue_per_cusamer:.2f}")
print(f"📊 Mediane Revenus per Client: CHF{median_revenue_per_cusamer:.2f}")
print(f"🛒 Total Achats: {atal_achats:,}")
print(f"🛍️  Moyen Achats per Client: {avg_purchases_per_cusamer:.1f}")
print(f"😊 Moyen Satisfaction Score: {avg_satisfaction:.1f}/10")
print(f"⏱️  Moyen Session Duree: {duree_moyenne_session:.1f} minutes")
print()

# Customer demographics; each section is skipped with a notice when its
# column is absent.
print("👥 DEMOGRAPHIE DES CLIENTS:")
print("-" * 40)
if 'age' in df.columns:
    print(f"Moyen Age: {df['age'].mean():.1f} years")
    print(f"Plage d'Age: {df['age'].min()} - {df['age'].max()} years")
else:
    print("Age donnees not available")

if 'genre' in df.columns:
    print("Genre Distribution:")
    genre_dist = df['genre'].value_counts()
    for genre, count in genre_dist.items():
        percentage = (count / atal_cusamers) * 100
        print(f"  {genre}: {count} ({percentage:.1f}%)")
else:
    print("Genre donnees not available")

if 'ville' in df.columns:
    print("\nTop 3 Cities by Client Nombre:")
    ville_dist = df['ville'].value_counts().head(3)
    for ville, count in ville_dist.items():
        percentage = (count / atal_cusamers) * 100
        print(f"  {ville}: {count} ({percentage:.1f}%)")
else:
    print("Ville donnees not available")


In [None]:
# 2. Business performance metrics
#
# Relies on names created by earlier cells: atal_revenue (executive summary)
# and the 'age_segment' column (segmentation cell). Defines
# high_value_cusamers, which later cells read.
print("\n" + "=" * 80)
print("📈 METRIQUES DE PERFORMANCE COMMERCIALE")
print("=" * 80)

# Revenue analysis per subscription type.
print("💰 REVENUE ANALYSIS:")
print("-" * 30)
revenue_by_subscription = df.groupby('type_abonnement')['atal_depense'].agg(['sum', 'mean', 'count'])
# Rename via .columns; assigning any other attribute name does not relabel the
# frame and the 'Total Revenus' lookup below would fail.
revenue_by_subscription.columns = ['Total Revenus', 'Avg Revenus per Client', 'Client Nombre']
revenue_by_subscription['Revenus Share %'] = (
    revenue_by_subscription['Total Revenus'] / atal_revenue * 100
).round(1)
display(revenue_by_subscription)

# Top-performing segments by total revenue.
print("\n🏆 TOP PERFORMING SEGMENTS:")
print("-" * 40)
print("Eleveest Revenus Cities:")
ap_villes = df.groupby('ville')['atal_depense'].sum().sort_values(ascending=False).head(3)
for ville, revenue in ap_villes.items():
    print(f"  {ville}: CHF{revenue:,.2f}")

print("\nEleveest Revenus Age Segments:")
ap_age_segments = df.groupby('age_segment')['atal_depense'].sum().sort_values(ascending=False).head(3)
for segment, revenue in ap_age_segments.items():
    print(f"  {segment}: CHF{revenue:,.2f}")

print("\nEleveest Revenus Appareil Types:")
ap_devices = df.groupby('type_appareil')['atal_depense'].sum().sort_values(ascending=False)
for device, revenue in ap_devices.items():
    print(f"  {device}: CHF{revenue:,.2f}")

# Customer engagement metrics.
print("\n📱 CUSTOMER ENGAGEMENT METRICS:")
print("-" * 40)
mean_bounce_rate = df['taux_rebond'].mean()
print(f"Moyen Page Vues per Session: {df['vues_page_par_session'].mean():.1f}")
print(f"Moyen Rebond Rate: {mean_bounce_rate:.3f} ({mean_bounce_rate*100:.1f}%)")
print(f"Moyen Support Tickets per Client: {df['tickets_support'].mean():.2f}")

# High-value customers: spend at or above the 90th percentile.
high_value_threshold = df['atal_depense'].quantile(0.9)
high_value_cusamers = df[df['atal_depense'] >= high_value_threshold]
print("\n💎 HIGH-VALUE CUSTOMERS (Top 10%):")
print("-" * 40)
print(f"Nombre: {len(high_value_cusamers)} cusamers")
print(f"Moyen Depenses: CHF{high_value_cusamers['atal_depense'].mean():.2f}")
print(f"Moyen Satisfaction: {high_value_cusamers['score_satisfaction'].mean():.1f}/10")
print(f"Most Common Abonnement: {high_value_cusamers['type_abonnement'].mode().iloc[0]}")
print(f"Most Common Appareil: {high_value_cusamers['type_appareil'].mode().iloc[0]}")


In [None]:
# 3. Key insights and business recommendations
#
# Reads df and high_value_cusamers (defined by an earlier cell). Defines
# satisfied_cusamers, best_subscription and ap_ville.
print("\n" + "=" * 80)
print("💡 INSIGHTS CLES ET RECOMMANDATIONS COMMERCIALES")
print("=" * 80)

# --- Customer behaviour insights -------------------------------------------
print("🎯 CUSTOMER BEHAVIOR INSIGHTS:")
print("-" * 40)

# Spending behaviour: strictly above the 75th / strictly below the 25th percentile.
spend = df['atal_depense']
high_spenders = df[spend > spend.quantile(0.75)]
low_spenders = df[spend < spend.quantile(0.25)]

print("• Eleve Spenders (Top 25%) vs Faible Spenders (Botam 25%):")
high_avg = high_spenders['atal_depense'].mean()
print(f"  - Eleve Spenders: {len(high_spenders)} cusamers, Avg: CHF{high_avg:.2f}")
low_avg = low_spenders['atal_depense'].mean()
print(f"  - Faible Spenders: {len(low_spenders)} cusamers, Avg: CHF{low_avg:.2f}")
print(f"  - Depenses Ratio: {high_avg / low_avg:.1f}x")

# Engagement patterns, based on session-length quartiles and a 0.5 bounce-rate cut-off.
print("\n• Engagement Modeles:")
session = df['duree_moyenne_session']
long_session_count = len(df[session > session.quantile(0.75)])
print(f"  - Eleve Engagement (Long Sessions): {long_session_count} cusamers")
short_session_count = len(df[session < session.quantile(0.25)])
print(f"  - Faible Engagement (Short Sessions): {short_session_count} cusamers")
high_bounce_count = len(df[df['taux_rebond'] > 0.5])
high_bounce_pct = high_bounce_count / len(df) * 100
print(f"  - Eleve Rebond Rate (>50%): {high_bounce_count} cusamers ({high_bounce_pct:.1f}%)")

# Satisfaction analysis: score >= 8 counts as satisfied, <= 4 as dissatisfied.
print("\n• Satisfaction Analyse:")
satisfied_cusamers = df[df['score_satisfaction'] >= 8]
dissatisfied_cusamers = df[df['score_satisfaction'] <= 4]
satisfied_pct = len(satisfied_cusamers) / len(df) * 100
print(f"  - Satisfied Clients (8+): {len(satisfied_cusamers)} ({satisfied_pct:.1f}%)")
dissatisfied_pct = len(dissatisfied_cusamers) / len(df) * 100
print(f"  - Dissatisfied Clients (≤4): {len(dissatisfied_cusamers)} ({dissatisfied_pct:.1f}%)")

# --- Business recommendations ----------------------------------------------
print("\n📋 STRATEGIC RECOMMANDATIONS:")
print("-" * 40)

# Revenue optimisation, driven by the most common traits of high-value customers.
print("💰 REVENUE OPTIMIZATION:")
print("1. Focus on Eleve-Value Client Segments:")
top_subscription = high_value_cusamers['type_abonnement'].mode().iloc[0]
print(f"   • Target cusamers with {top_subscription} subscriptions")
top_device = high_value_cusamers['type_appareil'].mode().iloc[0]
print(f"   • Prioritize {top_device} users for premium features")
print(f"   • Implement VIP programs for ap {len(high_value_cusamers)} cusamers")

# Subscription strategy: the type with the highest mean spend.
mean_spend_by_subscription = df.groupby('type_abonnement')['atal_depense'].mean()
best_subscription = mean_spend_by_subscription.idxmax()
print("\n2. Abonnement Strategy:")
print(f"   • Promote {best_subscription} subscription as it shows highest average spending")
print("   • Create upgrade incentives for Basique → Premium cusamers")
print("   • Develop Entreprise features based on high-value cusamer needs")

# Geographic expansion: the city with the highest mean spend.
mean_spend_by_ville = df.groupby('ville')['atal_depense'].mean()
ap_ville = mean_spend_by_ville.idxmax()
print("\n3. Geographic Expansion:")
print(f"   • Expand marketing efforts in {ap_ville} (highest avg spending)")
print("   • Replicate successful strategies from ap-performing villes")
print("   • Consider local partnerships in high-value markets")

# Customer experience.
print("\n4. Client Experience Improvements:")
print(f"   • Address high bounce rate ({high_bounce_pct:.1f}% of cusamers)")
print("   • Improve session duration for better engagement")
print("   • Implement satisfaction surveys for cusamers scoring ≤4")

# Engagement optimisation.
# NOTE(review): the "60% mobile users" figure below is a literal in the
# message, not computed from df — confirm it matches the data.
print("\n5. Engagement Optimization:")
print("   • Develop mobile-first features (60% mobile users)")
print("   • Create personalized content based on age segments")
print("   • Implement loyalty programs for repeat cusamers")

print("\n📊 SUCCESS METRICS TO TRACK:")
print("-" * 40)
for metric_line in (
    "• Client Lifetime Value (CLV)",
    "• Monthly Recurring Revenus (MRR)",
    "• Client Acquisition Cost (CAC)",
    "• Churn Rate by segment",
    "• Net Promoter Score (NPS)",
    "• Moyen Revenus Per Utiliserr (ARPU)",
):
    print(metric_line)


In [None]:
# 4. Final summary dashboard
#
# Draws a 2x3 matplotlib dashboard. Relies on names created by earlier cells:
# atal_cusamers, atal_revenue, avg_revenue_per_cusamer, avg_satisfaction,
# duree_moyenne_session, best_subscription, ap_ville, high_value_cusamers,
# satisfied_cusamers, and the 'age_segment' column.
print("\n" + "=" * 80)
print("📊 TABLEAU DE BORD DE SYNTHESE FINALE")
print("=" * 80)

fig, axes = plt.subplots(2, 3, figsize=(20, 12))

# 1. Revenue share by subscription type.
revenue_by_sub = df.groupby('type_abonnement')['atal_depense'].sum()
revenue_by_sub.plot(kind='pie', ax=axes[0, 0], autopct='%1.1f%%', startangle=90)
axes[0, 0].set_title('Revenus Distribution by Abonnement Type', fontweight='bold')
axes[0, 0].set_ylabel('')  # drop the series name pandas puts on the pie's y-axis

# 2. Customer count for the five largest cities.
ville_counts = df['ville'].value_counts().head(5)
ville_counts.plot(kind='bar', ax=axes[0, 1], color='lightblue')
axes[0, 1].set_title('Top 5 Cities by Client Nombre', fontweight='bold')
axes[0, 1].set_xlabel('Ville')
axes[0, 1].set_ylabel('Number of Clients')
axes[0, 1].tick_params(axis='x', rotation=45)

# 3. Satisfaction score distribution with the mean marked as a dashed line.
mean_satisfaction = df['score_satisfaction'].mean()
df['score_satisfaction'].hist(bins=10, ax=axes[0, 2], color='lightgreen', alpha=0.7, edgecolor='black')
axes[0, 2].set_title('Satisfaction Score Distribution', fontweight='bold')
axes[0, 2].set_xlabel('Satisfaction Score')
axes[0, 2].set_ylabel('Number of Clients')
axes[0, 2].axvline(mean_satisfaction, color='red', linestyle='--',
                   label=f'Moyenne: {mean_satisfaction:.1f}')
axes[0, 2].legend()

# 4. Average spend by age segment.
age_spending = df.groupby('age_segment')['atal_depense'].mean()
age_spending.plot(kind='bar', ax=axes[1, 0], color='gold')
axes[1, 0].set_title('Moyen Depenses by Age Segment', fontweight='bold')
axes[1, 0].set_xlabel('Age Segment')
axes[1, 0].set_ylabel('Moyen Total Spent (CHF)')
axes[1, 0].tick_params(axis='x', rotation=45)

# 5. Customer count by device type.
device_counts = df['type_appareil'].value_counts()
device_counts.plot(kind='bar', ax=axes[1, 1], color='lightcoral')
axes[1, 1].set_title('Client Distribution by Appareil Type', fontweight='bold')
axes[1, 1].set_xlabel('Appareil Type')
axes[1, 1].set_ylabel('Number of Clients')
axes[1, 1].tick_params(axis='x', rotation=45)

# 6. Key metrics rendered as a text panel (axis frame hidden).
axes[1, 2].axis('off')
resume_text = f"""
KEY METRICS SUMMARY

👥 Total Clients: {atal_cusamers:,}
💰 Total Revenus: CHF{atal_revenue:,.0f}
💵 Avg Revenus/Client: CHF{avg_revenue_per_cusamer:.0f}
😊 Avg Satisfaction: {avg_satisfaction:.1f}/10
⏱️ Avg Session Duree: {duree_moyenne_session:.1f} min

TOP INSIGHTS:
• {best_subscription} subscription performs best
• {ap_ville} shows highest spending
• {len(high_value_cusamers)} high-value cusamers identified
• {len(satisfied_cusamers)} satisfied cusamers (8+ rating)
"""

# Coordinates are in axes fractions (transAxes); the text is anchored at its top edge.
axes[1, 2].text(0.1, 0.9, resume_text, transform=axes[1, 2].transAxes,
                fontsize=12, verticalalignment='top', fontfamily='monospace',
                bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray", alpha=0.5))

plt.tight_layout()
plt.show()

print("✅ EXPLORATION DES DONNEES TERMINEE!")
print("🎉 Votre analyse complete du comportement des clients est prete!")
print("📈 Utiliser these insights a drive donnees-driven business decisions!")
