In [8]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


DB_PATH = '../Data/Processed/hobart_database.db'
conn = sqlite3.connect(str(DB_PATH))

In [9]:
# ========================================================================
# CELLULE 1 : ANALYSE DES √âCARTS TYPES - PARTIE 1
# Analyse des √©carts types par Process, Desk, Activity et Cat√©gorie
# ========================================================================

import numpy as np
from scipy import stats
from scipy.stats import normaltest, shapiro, kstest, lognorm

print("="*80)
print("üìä ANALYSE DES √âCARTS TYPES PAR DIMENSION")
print("="*80)

# ============================================================================
# 1. √âCART TYPE PAR CAT√âGORIE
# ============================================================================
print("\n" + "="*80)
print("1Ô∏è‚É£ ANALYSE PAR CAT√âGORIE")
print("="*80)

# Requ√™te simplifi√©e sans quantiles (on les calculera apr√®s)
category_stats = pd.read_sql_query("""
SELECT 
    c.ID as category_id,
    c.NAME as category_name,
    COUNT(sr.ID) as total_srs,
    
    -- Temps de traitement en heures
    ROUND(AVG(CAST((julianday(sr.CLOSINGDATE) - julianday(sr.CREATIONDATE)) * 24 AS REAL)), 2) as mean_hours,
    ROUND(MIN(CAST((julianday(sr.CLOSINGDATE) - julianday(sr.CREATIONDATE)) * 24 AS REAL)), 2) as min_hours,
    ROUND(MAX(CAST((julianday(sr.CLOSINGDATE) - julianday(sr.CREATIONDATE)) * 24 AS REAL)), 2) as max_hours
    
FROM sr
LEFT JOIN category c ON sr.CATEGORY_ID = c.ID
WHERE sr.CLOSINGDATE IS NOT NULL
  AND sr.CREATIONDATE IS NOT NULL
  AND strftime('%Y-%m', sr.CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
GROUP BY c.ID, c.NAME
HAVING COUNT(sr.ID) >= 100
ORDER BY COUNT(sr.ID) DESC
""", conn)

# Calculer l'√©cart type et les quantiles pour chaque cat√©gorie
print("\n‚è≥ Calcul des √©carts types et quantiles par cat√©gorie (peut prendre quelques secondes)...")

std_devs = []
skewness_vals = []
kurtosis_vals = []
cv_vals = []
q25_vals = []
median_vals = []
q75_vals = []
p95_vals = []

for idx, row in category_stats.iterrows():
    cat_id = row['category_id']
    
    # R√©cup√©rer les temps de traitement pour cette cat√©gorie
    times_query = f"""
    SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
    FROM sr
    WHERE CATEGORY_ID = {cat_id}
      AND CLOSINGDATE IS NOT NULL
      AND CREATIONDATE IS NOT NULL
      AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    """
    
    times_df = pd.read_sql_query(times_query, conn)
    times = times_df['hours'].values
    
    if len(times) > 0:
        # √âcart type
        std_dev = np.std(times, ddof=1)  # √âcart type corrig√©
        std_devs.append(std_dev)
        
        # Coefficient de variation (CV = std/mean)
        cv = (std_dev / row['mean_hours']) * 100 if row['mean_hours'] > 0 else 0
        cv_vals.append(cv)
        
        # Quantiles
        q25 = np.percentile(times, 25)
        q50 = np.percentile(times, 50)  # m√©diane
        q75 = np.percentile(times, 75)
        p95 = np.percentile(times, 95)
        
        q25_vals.append(round(q25, 2))
        median_vals.append(round(q50, 2))
        q75_vals.append(round(q75, 2))
        p95_vals.append(round(p95, 2))
        
        # Skewness et Kurtosis
        if len(times) >= 3:
            skew = stats.skew(times)
            kurt = stats.kurtosis(times)
            skewness_vals.append(skew)
            kurtosis_vals.append(kurt)
        else:
            skewness_vals.append(np.nan)
            kurtosis_vals.append(np.nan)
    else:
        std_devs.append(np.nan)
        cv_vals.append(np.nan)
        q25_vals.append(np.nan)
        median_vals.append(np.nan)
        q75_vals.append(np.nan)
        p95_vals.append(np.nan)
        skewness_vals.append(np.nan)
        kurtosis_vals.append(np.nan)
    
    # Afficher progression tous les 50 cat√©gories
    if (idx + 1) % 50 == 0:
        print(f"   Trait√© {idx + 1}/{len(category_stats)} cat√©gories...")

category_stats['std_dev'] = std_devs
category_stats['cv_pct'] = cv_vals
category_stats['q25'] = q25_vals
category_stats['median'] = median_vals
category_stats['q75'] = q75_vals
category_stats['p95'] = p95_vals
category_stats['skewness'] = skewness_vals
category_stats['kurtosis'] = kurtosis_vals

# IQR (Interquartile Range)
category_stats['iqr'] = category_stats['q75'] - category_stats['q25']

# Outlier bounds (Tukey's fences)
category_stats['lower_fence'] = category_stats['q25'] - 1.5 * category_stats['iqr']
category_stats['upper_fence'] = category_stats['q75'] + 1.5 * category_stats['iqr']

print(f"\n‚úÖ Analyse termin√©e pour {len(category_stats)} cat√©gories\n")

# Afficher le top 20
print("\nüìä TOP 20 CAT√âGORIES PAR VOLUME (avec statistiques d√©taill√©es)\n")
display_cols = ['category_name', 'total_srs', 'mean_hours', 'std_dev', 'cv_pct', 
                'median', 'iqr', 'skewness', 'kurtosis']
display(category_stats.head(20)[display_cols])

# Identifier cat√©gories avec plus grande variabilit√©
print("\n" + "="*80)
print("üéØ CAT√âGORIES AVEC PLUS GRANDE VARIABILIT√â (CV > 100%)")
print("="*80)
print("\nCV (Coefficient de Variation) = (√âcart Type / Moyenne) √ó 100")
print("CV √©lev√© = Grande dispersion des temps de traitement\n")

high_variability = category_stats[category_stats['cv_pct'] > 100].sort_values('cv_pct', ascending=False)
if len(high_variability) > 0:
    display(high_variability.head(15)[display_cols])
else:
    print("‚úÖ Aucune cat√©gorie avec CV > 100% (toutes ont une variabilit√© relative < 100%)")

# ============================================================================
# 2. √âCART TYPE PAR DESK
# ============================================================================
print("\n" + "="*80)
print("2Ô∏è‚É£ ANALYSE PAR DESK")
print("="*80)

desk_stats = pd.read_sql_query("""
SELECT 
    sr.JUR_DESK_ID as desk_id,
    COUNT(sr.ID) as total_srs,
    ROUND(AVG(CAST((julianday(sr.CLOSINGDATE) - julianday(sr.CREATIONDATE)) * 24 AS REAL)), 2) as mean_hours,
    ROUND(MIN(CAST((julianday(sr.CLOSINGDATE) - julianday(sr.CREATIONDATE)) * 24 AS REAL)), 2) as min_hours,
    ROUND(MAX(CAST((julianday(sr.CLOSINGDATE) - julianday(sr.CREATIONDATE)) * 24 AS REAL)), 2) as max_hours
FROM sr
WHERE sr.CLOSINGDATE IS NOT NULL
  AND sr.CREATIONDATE IS NOT NULL
  AND sr.JUR_DESK_ID IS NOT NULL
  AND strftime('%Y-%m', sr.CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
GROUP BY sr.JUR_DESK_ID
HAVING COUNT(sr.ID) >= 100
ORDER BY COUNT(sr.ID) DESC
LIMIT 50
""", conn)

print("\n‚è≥ Calcul des √©carts types par desk...")

desk_std_devs = []
desk_cv_vals = []

for idx, row in desk_stats.iterrows():
    desk_id = row['desk_id']
    
    times_query = f"""
    SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
    FROM sr
    WHERE JUR_DESK_ID = {desk_id}
      AND CLOSINGDATE IS NOT NULL
      AND CREATIONDATE IS NOT NULL
      AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    LIMIT 10000
    """
    
    times_df = pd.read_sql_query(times_query, conn)
    times = times_df['hours'].values
    
    if len(times) > 0:
        std_dev = np.std(times, ddof=1)
        desk_std_devs.append(std_dev)
        cv = (std_dev / row['mean_hours']) * 100 if row['mean_hours'] > 0 else 0
        desk_cv_vals.append(cv)
    else:
        desk_std_devs.append(np.nan)
        desk_cv_vals.append(np.nan)

desk_stats['std_dev'] = desk_std_devs
desk_stats['cv_pct'] = desk_cv_vals

print(f"\n‚úÖ Analyse termin√©e pour {len(desk_stats)} desks\n")

print("\nüìä TOP 20 DESKS PAR VOLUME (avec √©carts types)\n")
display(desk_stats.head(20))

# ============================================================================
# 3. VISUALISATIONS
# ============================================================================
print("\n" + "="*80)
print("üìà VISUALISATIONS")
print("="*80)

# Graphique 1 : Distribution des CV par cat√©gorie
fig1 = px.histogram(
    category_stats,
    x='cv_pct',
    nbins=50,
    title='Distribution du Coefficient de Variation (CV) par Cat√©gorie',
    labels={'cv_pct': 'Coefficient de Variation (%)', 'count': 'Nombre de Cat√©gories'},
    color_discrete_sequence=['steelblue']
)
fig1.add_vline(x=100, line_dash="dash", line_color="red", 
               annotation_text="CV = 100%", annotation_position="top right")
fig1.update_layout(height=500, showlegend=False)
fig1.show()

# Graphique 2 : Moyenne vs √âcart Type (Top 30 cat√©gories)
top30_cat = category_stats.head(30)
fig2 = px.scatter(
    top30_cat,
    x='mean_hours',
    y='std_dev',
    size='total_srs',
    color='cv_pct',
    hover_name='category_name',
    title='Moyenne vs √âcart Type (Top 30 Cat√©gories)',
    labels={
        'mean_hours': 'Temps Moyen (heures)',
        'std_dev': '√âcart Type (heures)',
        'total_srs': 'Volume SRs',
        'cv_pct': 'CV (%)'
    },
    color_continuous_scale='Viridis',
    size_max=60
)
fig2.update_layout(height=600)
fig2.show()

# Graphique 3 : Box plots des top 15 cat√©gories
print("\n‚è≥ G√©n√©ration des box plots (peut prendre quelques secondes)...")

# R√©cup√©rer les donn√©es pour les top 15 cat√©gories
top15_ids = category_stats.head(15)['category_id'].tolist()
boxplot_data = []

for cat_id in top15_ids:
    cat_name = category_stats[category_stats['category_id'] == cat_id]['category_name'].values[0]
    
    times_query = f"""
    SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
    FROM sr
    WHERE CATEGORY_ID = {cat_id}
      AND CLOSINGDATE IS NOT NULL
      AND CREATIONDATE IS NOT NULL
      AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    LIMIT 5000
    """
    
    times_df = pd.read_sql_query(times_query, conn)
    times_df['category'] = cat_name
    boxplot_data.append(times_df)

boxplot_df = pd.concat(boxplot_data, ignore_index=True)

fig3 = px.box(
    boxplot_df,
    x='category',
    y='hours',
    title='Distribution des Temps de Traitement (Top 15 Cat√©gories)',
    labels={'category': 'Cat√©gorie', 'hours': 'Temps de Traitement (heures)'},
    color='category'
)
fig3.update_layout(
    height=600,
    xaxis_tickangle=-45,
    showlegend=False,
    yaxis_type="log"  # √âchelle log pour mieux voir la distribution
)
fig3.show()

print("\n‚úÖ Cellule 1 termin√©e : Statistiques descriptives et √©carts types calcul√©s")

üìä ANALYSE DES √âCARTS TYPES PAR DIMENSION

1Ô∏è‚É£ ANALYSE PAR CAT√âGORIE

‚è≥ Calcul des √©carts types et quantiles par cat√©gorie (peut prendre quelques secondes)...
   Trait√© 50/892 cat√©gories...
   Trait√© 100/892 cat√©gories...
   Trait√© 150/892 cat√©gories...
   Trait√© 200/892 cat√©gories...
   Trait√© 250/892 cat√©gories...
   Trait√© 300/892 cat√©gories...
   Trait√© 350/892 cat√©gories...
   Trait√© 400/892 cat√©gories...
   Trait√© 450/892 cat√©gories...
   Trait√© 500/892 cat√©gories...
   Trait√© 550/892 cat√©gories...
   Trait√© 600/892 cat√©gories...
   Trait√© 650/892 cat√©gories...
   Trait√© 700/892 cat√©gories...
   Trait√© 750/892 cat√©gories...
   Trait√© 800/892 cat√©gories...
   Trait√© 850/892 cat√©gories...

‚úÖ Analyse termin√©e pour 892 cat√©gories


üìä TOP 20 CAT√âGORIES PAR VOLUME (avec statistiques d√©taill√©es)



Unnamed: 0,category_name,total_srs,mean_hours,std_dev,cv_pct,median,iqr,skewness,kurtosis
0,Tax,100680,457.93,1145.112049,250.062684,69.47,327.21,4.906952,31.569204
1,Cash instruction,91822,53.72,268.400249,499.628163,1.01,17.34,14.620191,337.679278
2,BAU Asset Creation,82171,21.78,199.664662,916.733984,1.58,4.05,26.383041,1027.993237
3,CREST,76847,28.73,232.244315,808.368657,0.47,1.94,24.989981,935.724844
4,Others,75748,72.0,355.119635,493.221716,1.01,17.82,12.287035,213.297626
5,Investigation Level 1,71499,65.01,400.030033,615.336152,1.26,17.87,15.186523,320.20742
6,Settlement,64965,116.3,432.661663,372.022066,17.88,72.95,10.676841,168.738158
7,FRPP,63229,65.41,353.605892,540.599132,1.07,17.63,14.252569,282.844295
8,ESMX,60837,26.19,210.811614,804.931705,0.49,2.64,21.815915,653.413083
9,DEFF,58446,38.88,260.947376,671.160946,0.73,5.61,21.18322,651.889707



üéØ CAT√âGORIES AVEC PLUS GRANDE VARIABILIT√â (CV > 100%)

CV (Coefficient de Variation) = (√âcart Type / Moyenne) √ó 100
CV √©lev√© = Grande dispersion des temps de traitement



Unnamed: 0,category_name,total_srs,mean_hours,std_dev,cv_pct,median,iqr,skewness,kurtosis
115,Signoff,8084,0.62,23.298059,3757.751448,0.15,0.23,89.035707,7974.160334
80,KPI Dublin,13214,0.97,34.259295,3531.886039,0.19,0.21,69.033681,5020.744834
93,Extraction,10708,0.86,23.578312,2741.664163,0.0,0.0,95.65289,9615.675983
287,Variation,2246,2.24,59.548997,2658.437388,0.02,0.09,43.046891,1939.451681
184,KPI Luxembourg AM,4278,2.25,54.856512,2438.067201,0.31,0.35,44.194066,2003.137452
10,KPI PARIS,56618,4.24,83.940667,1979.732708,0.24,0.41,51.728448,3473.086267
11,Lux Non-IP,55602,7.39,140.011542,1894.608142,0.29,0.53,62.476486,5254.368631
127,Singapore,7200,11.9,224.857253,1889.556749,0.21,0.28,36.917784,1556.25529
61,Lux IP,17659,6.86,125.689692,1832.211248,0.13,0.32,40.442283,2041.546035
71,MILAN,15306,6.12,99.609442,1627.605263,0.25,0.32,30.897867,1147.55784



2Ô∏è‚É£ ANALYSE PAR DESK

‚è≥ Calcul des √©carts types par desk...

‚úÖ Analyse termin√©e pour 50 desks


üìä TOP 20 DESKS PAR VOLUME (avec √©carts types)



Unnamed: 0,desk_id,total_srs,mean_hours,min_hours,max_hours,std_dev,cv_pct
0,73044,131442,9.14,0.0,14691.62,212.932621,2329.678571
1,73049,82505,24.05,0.0,14335.55,332.975077,1384.511753
2,73246,73101,27.64,0.0,14731.21,321.087268,1161.67608
3,73046,64025,3.83,0.0,8110.86,111.34414,2907.157705
4,73059,60202,24.98,0.0,8799.27,180.772378,723.668449
5,72956,50130,121.34,0.0,13272.4,591.688147,487.628274
6,83807,49647,102.49,0.0,9175.55,519.281173,506.665209
7,77933,48009,447.59,0.0,11563.65,900.683758,201.229643
8,73203,45747,70.84,0.0,14806.01,472.389585,666.840182
9,73195,45634,36.29,0.0,14034.69,424.917244,1170.893481



üìà VISUALISATIONS



‚è≥ G√©n√©ration des box plots (peut prendre quelques secondes)...



‚úÖ Cellule 1 termin√©e : Statistiques descriptives et √©carts types calcul√©s


In [10]:
# ========================================================================
# CELLULE 2 : ANALYSE DES LOIS DE PROBABILIT√â ET TESTS STATISTIQUES
# Tests de normalit√©, ajustement de distributions, analyse d'outliers
# ========================================================================

from scipy.stats import norm, lognorm, expon, gamma, weibull_min, anderson, jarque_bera
from scipy.optimize import curve_fit
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("üìä ANALYSE DES LOIS DE PROBABILIT√â ET TESTS STATISTIQUES")
print("="*80)

# ============================================================================
# 1. TESTS DE NORMALIT√â SUR LES TOP CAT√âGORIES
# ============================================================================
print("\n" + "="*80)
print("1Ô∏è‚É£ TESTS DE NORMALIT√â (Top 15 Cat√©gories)")
print("="*80)
print("\nTests effectu√©s : Shapiro-Wilk, Anderson-Darling, Jarque-Bera")
print("H0 (hypoth√®se nulle) : Les donn√©es suivent une distribution normale\n")

normality_results = []

for idx, row in category_stats.head(15).iterrows():
    cat_id = row['category_id']
    cat_name = row['category_name']
    
    # R√©cup√©rer √©chantillon de temps
    times_query = f"""
    SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
    FROM sr
    WHERE CATEGORY_ID = {cat_id}
      AND CLOSINGDATE IS NOT NULL
      AND CREATIONDATE IS NOT NULL
      AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    LIMIT 5000
    """
    
    times_df = pd.read_sql_query(times_query, conn)
    times = times_df['hours'].values
    
    if len(times) >= 20:  # Minimum pour tests statistiques
        # Test de Shapiro-Wilk (max 5000 √©chantillons)
        sample_size = min(5000, len(times))
        sample = np.random.choice(times, sample_size, replace=False)
        
        shapiro_stat, shapiro_p = shapiro(sample)
        
        # Test de Anderson-Darling
        anderson_result = anderson(sample, dist='norm')
        anderson_stat = anderson_result.statistic
        
        # Test de Jarque-Bera
        jb_stat, jb_p = jarque_bera(sample)
        
        # Conclusion
        is_normal = shapiro_p > 0.05 and jb_p > 0.05
        
        normality_results.append({
            'category': cat_name[:30],
            'sample_size': len(times),
            'shapiro_stat': shapiro_stat,
            'shapiro_p': shapiro_p,
            'anderson_stat': anderson_stat,
            'jb_stat': jb_stat,
            'jb_p': jb_p,
            'is_normal': is_normal,
            'mean': np.mean(times),
            'std': np.std(times),
            'skewness': stats.skew(times),
            'kurtosis': stats.kurtosis(times)
        })

normality_df = pd.DataFrame(normality_results)
display(normality_df[['category', 'sample_size', 'shapiro_p', 'jb_p', 'is_normal', 
                       'skewness', 'kurtosis']])

normal_count = normality_df['is_normal'].sum()
print(f"\nüìä R√©sultat : {normal_count}/{len(normality_df)} cat√©gories suivent une distribution normale (Œ±=0.05)")
print(f"   ‚Üí {len(normality_df) - normal_count} cat√©gories ont des distributions non-normales")

# ============================================================================
# 2. AJUSTEMENT DE DISTRIBUTIONS (LOG-NORMALE, EXPONENTIELLE, GAMMA)
# ============================================================================
print("\n" + "="*80)
print("2Ô∏è‚É£ AJUSTEMENT DE DISTRIBUTIONS ALTERNATIVES")
print("="*80)
print("\nPour les cat√©gories non-normales, test des distributions :")
print("  ‚Ä¢ Log-normale (temps avec longue tra√Æne)")
print("  ‚Ä¢ Exponentielle (processus memoryless)")
print("  ‚Ä¢ Gamma (somme de processus exponentiels)\n")

# S√©lectionner 5 cat√©gories non-normales pour analyse d√©taill√©e
non_normal_cats = normality_df[~normality_df['is_normal']].head(5)

distribution_fits = []

for idx, row in non_normal_cats.iterrows():
    cat_name = row['category']
    cat_id = category_stats[category_stats['category_name'].str[:30] == cat_name]['category_id'].values[0]
    
    # R√©cup√©rer donn√©es
    times_query = f"""
    SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
    FROM sr
    WHERE CATEGORY_ID = {cat_id}
      AND CLOSINGDATE IS NOT NULL
      AND CREATIONDATE IS NOT NULL
      AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    LIMIT 5000
    """
    
    times_df = pd.read_sql_query(times_query, conn)
    times = times_df['hours'].values
    times = times[times > 0]  # Enlever z√©ros pour log-normale
    
    if len(times) > 50:
        # Ajuster log-normale
        shape_ln, loc_ln, scale_ln = lognorm.fit(times, floc=0)
        
        # Ajuster exponentielle
        loc_exp, scale_exp = expon.fit(times)
        
        # Ajuster gamma
        shape_gamma, loc_gamma, scale_gamma = gamma.fit(times)
        
        # Calculer KS test pour chaque distribution
        ks_lognorm = kstest(times, lambda x: lognorm.cdf(x, shape_ln, loc_ln, scale_ln))
        ks_expon = kstest(times, lambda x: expon.cdf(x, loc_exp, scale_exp))
        ks_gamma = kstest(times, lambda x: gamma.cdf(x, shape_gamma, loc_gamma, scale_gamma))
        
        # Meilleur fit (KS stat le plus bas)
        ks_stats = {
            'Log-normale': ks_lognorm.statistic,
            'Exponentielle': ks_expon.statistic,
            'Gamma': ks_gamma.statistic
        }
        best_fit = min(ks_stats, key=ks_stats.get)
        
        distribution_fits.append({
            'category': cat_name,
            'best_fit': best_fit,
            'ks_lognorm': ks_lognorm.statistic,
            'ks_expon': ks_expon.statistic,
            'ks_gamma': ks_gamma.statistic,
            'lognorm_params': f"œÉ={shape_ln:.2f}, Œº={scale_ln:.2f}",
            'expon_params': f"Œª={1/scale_exp:.4f}",
            'gamma_params': f"k={shape_gamma:.2f}, Œ∏={scale_gamma:.2f}"
        })

if len(distribution_fits) > 0:
    fit_df = pd.DataFrame(distribution_fits)
    display(fit_df)
    
    print("\nüìä KS Statistic : Plus faible = Meilleur ajustement")
    print(f"   Distributions majoritaires : {fit_df['best_fit'].value_counts().to_dict()}")

# ============================================================================
# 3. ANALYSE DES OUTLIERS (VALEURS ABERRANTES)
# ============================================================================
print("\n" + "="*80)
print("3Ô∏è‚É£ D√âTECTION DES OUTLIERS (Top 10 Cat√©gories)")
print("="*80)
print("\nM√©thode : Tukey's Fences (IQR √ó 1.5)")
print("Outliers = Valeurs < Q1 - 1.5√óIQR  OU  > Q3 + 1.5√óIQR\n")

outlier_analysis = []

for idx, row in category_stats.head(10).iterrows():
    cat_id = row['category_id']
    cat_name = row['category_name']
    q25, q75 = row['q25'], row['q75']
    iqr = row['iqr']
    lower_fence = row['lower_fence']
    upper_fence = row['upper_fence']
    
    # Compter les outliers
    outlier_query = f"""
    SELECT 
        COUNT(*) as total,
        COUNT(CASE WHEN hours < {lower_fence} THEN 1 END) as lower_outliers,
        COUNT(CASE WHEN hours > {upper_fence} THEN 1 END) as upper_outliers,
        ROUND(AVG(CASE WHEN hours > {upper_fence} THEN hours END), 2) as avg_upper_outlier
    FROM (
        SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
        FROM sr
        WHERE CATEGORY_ID = {cat_id}
          AND CLOSINGDATE IS NOT NULL
          AND CREATIONDATE IS NOT NULL
          AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    )
    """
    
    outlier_df = pd.read_sql_query(outlier_query, conn)
    
    total_outliers = outlier_df['lower_outliers'].values[0] + outlier_df['upper_outliers'].values[0]
    outlier_pct = (total_outliers / outlier_df['total'].values[0]) * 100
    
    outlier_analysis.append({
        'category': cat_name[:30],
        'total_srs': outlier_df['total'].values[0],
        'outliers': total_outliers,
        'outlier_pct': round(outlier_pct, 2),
        'lower_outliers': outlier_df['lower_outliers'].values[0],
        'upper_outliers': outlier_df['upper_outliers'].values[0],
        'upper_fence_hours': round(upper_fence, 2),
        'avg_upper_outlier': outlier_df['avg_upper_outlier'].values[0]
    })

outlier_df_final = pd.DataFrame(outlier_analysis)
display(outlier_df_final)

print(f"\nüìä Moyenne d'outliers : {outlier_df_final['outlier_pct'].mean():.2f}%")
print(f"   Cat√©gorie avec plus d'outliers : {outlier_df_final.loc[outlier_df_final['outlier_pct'].idxmax(), 'category']}")

# ============================================================================
# 4. VISUALISATIONS AVANC√âES
# ============================================================================
print("\n" + "="*80)
print("üìà VISUALISATIONS STATISTIQUES")
print("="*80)

# Graphique 1 : Q-Q Plot pour v√©rifier normalit√© (Top 6 cat√©gories)
print("\n‚è≥ G√©n√©ration des Q-Q plots...")

fig_qq = make_subplots(
    rows=2, cols=3,
    subplot_titles=[cat[:20] for cat in category_stats.head(6)['category_name'].tolist()]
)

positions = [(1,1), (1,2), (1,3), (2,1), (2,2), (2,3)]

for idx, (row, pos) in enumerate(zip(category_stats.head(6).iterrows(), positions)):
    _, cat_row = row
    cat_id = cat_row['category_id']
    
    # R√©cup√©rer √©chantillon
    times_query = f"""
    SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
    FROM sr
    WHERE CATEGORY_ID = {cat_id}
      AND CLOSINGDATE IS NOT NULL
      AND CREATIONDATE IS NOT NULL
      AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    LIMIT 1000
    """
    
    times_df = pd.read_sql_query(times_query, conn)
    times = times_df['hours'].values
    
    # Q-Q plot data
    (osm, osr), (slope, intercept, r) = stats.probplot(times, dist="norm")
    
    fig_qq.add_trace(
        go.Scatter(x=osm, y=osr, mode='markers', marker=dict(size=3, color='steelblue'),
                   name=f'Data', showlegend=False),
        row=pos[0], col=pos[1]
    )
    
    # Ligne th√©orique
    fig_qq.add_trace(
        go.Scatter(x=osm, y=slope*osm + intercept, mode='lines', 
                   line=dict(color='red', dash='dash'), name='Normal', showlegend=False),
        row=pos[0], col=pos[1]
    )

fig_qq.update_layout(height=700, title_text="Q-Q Plots : Test de Normalit√© (Top 6 Cat√©gories)")
fig_qq.show()

# Graphique 2 : Histogrammes avec ajustement de distributions
print("\n‚è≥ G√©n√©ration des histogrammes avec ajustements...")

# Prendre 3 cat√©gories pour analyse d√©taill√©e
selected_cats = category_stats.head(3)

for _, cat_row in selected_cats.iterrows():
    cat_id = cat_row['category_id']
    cat_name = cat_row['category_name']
    
    # R√©cup√©rer donn√©es
    times_query = f"""
    SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
    FROM sr
    WHERE CATEGORY_ID = {cat_id}
      AND CLOSINGDATE IS NOT NULL
      AND CREATIONDATE IS NOT NULL
      AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    LIMIT 5000
    """
    
    times_df = pd.read_sql_query(times_query, conn)
    times = times_df['hours'].values
    times = times[times > 0]
    
    # Cr√©er histogramme
    fig_hist = go.Figure()
    
    # Histogram
    fig_hist.add_trace(go.Histogram(
        x=times,
        nbinsx=50,
        name='Donn√©es r√©elles',
        opacity=0.7,
        histnorm='probability density'
    ))
    
    # Ajuster log-normale
    if len(times) > 50:
        shape, loc, scale = lognorm.fit(times, floc=0)
        x_range = np.linspace(times.min(), np.percentile(times, 95), 1000)
        pdf = lognorm.pdf(x_range, shape, loc, scale)
        
        fig_hist.add_trace(go.Scatter(
            x=x_range,
            y=pdf,
            mode='lines',
            name='Log-normale',
            line=dict(color='red', width=2)
        ))
    
    fig_hist.update_layout(
        title=f'Distribution des Temps de Traitement : {cat_name}',
        xaxis_title='Temps (heures)',
        yaxis_title='Densit√© de Probabilit√©',
        height=500,
        showlegend=True
    )
    
    fig_hist.show()

# Graphique 3 : Coefficient de Variation par Desk (Top 30)
fig_cv_desk = px.bar(
    desk_stats.head(30).sort_values('cv_pct', ascending=True),
    x='cv_pct',
    y='desk_id',
    orientation='h',
    title='Coefficient de Variation par Desk (Top 30)',
    labels={'cv_pct': 'CV (%)', 'desk_id': 'Desk ID'},
    color='cv_pct',
    color_continuous_scale='RdYlGn_r',
    text='cv_pct'
)
fig_cv_desk.update_traces(texttemplate='%{text:.1f}%', textposition='outside')
fig_cv_desk.update_layout(height=800)
fig_cv_desk.show()

print("\n‚úÖ Cellule 2 termin√©e : Tests statistiques et ajustements de distributions effectu√©s")

üìä ANALYSE DES LOIS DE PROBABILIT√â ET TESTS STATISTIQUES

1Ô∏è‚É£ TESTS DE NORMALIT√â (Top 15 Cat√©gories)

Tests effectu√©s : Shapiro-Wilk, Anderson-Darling, Jarque-Bera
H0 (hypoth√®se nulle) : Les donn√©es suivent une distribution normale



Unnamed: 0,category,sample_size,shapiro_p,jb_p,is_normal,skewness,kurtosis
0,Tax,5000,4.380095e-83,0.0,False,4.770758,28.964111
1,Cash instruction,5000,1.264809e-91,0.0,False,14.566427,290.776927
2,BAU Asset Creation,5000,5.033353e-94,0.0,False,23.722934,739.597827
3,CREST,5000,1.870358e-92,0.0,False,14.261996,259.195253
4,Others,5000,1.378058e-90,0.0,False,11.399812,179.598483
5,Investigation Level 1,5000,1.523229e-92,0.0,False,14.201594,266.162205
6,Settlement,5000,8.328545e-89,0.0,False,9.719193,136.968419
7,FRPP,5000,1.28453e-91,0.0,False,12.312443,195.321961
8,ESMX,5000,2.09347e-93,0.0,False,21.750038,634.70032
9,DEFF,5000,2.586004e-92,0.0,False,16.662075,398.351304



üìä R√©sultat : 0/15 cat√©gories suivent une distribution normale (Œ±=0.05)
   ‚Üí 15 cat√©gories ont des distributions non-normales

2Ô∏è‚É£ AJUSTEMENT DE DISTRIBUTIONS ALTERNATIVES

Pour les cat√©gories non-normales, test des distributions :
  ‚Ä¢ Log-normale (temps avec longue tra√Æne)
  ‚Ä¢ Exponentielle (processus memoryless)
  ‚Ä¢ Gamma (somme de processus exponentiels)



Unnamed: 0,category,best_fit,ks_lognorm,ks_expon,ks_gamma,lognorm_params,expon_params,gamma_params
0,Tax,Log-normale,0.128532,0.388422,0.130546,"œÉ=3.53, Œº=23.85",Œª=0.0022,"k=0.22, Œ∏=3867.19"
1,Cash instruction,Log-normale,0.058932,0.603119,0.981689,"œÉ=3.15, Œº=1.38",Œª=0.0162,"k=0.00, Œ∏=4.06"
2,BAU Asset Creation,Log-normale,0.09549,0.61048,0.98186,"œÉ=2.52, Œº=1.10",Œª=0.0394,"k=0.00, Œ∏=3.79"
3,CREST,Log-normale,0.068225,0.659368,0.964021,"œÉ=2.47, Œº=0.63",Œª=0.0463,"k=0.01, Œ∏=3.70"
4,Others,Log-normale,0.059159,0.603596,0.972357,"œÉ=3.28, Œº=1.29",Œª=0.0132,"k=0.00, Œ∏=4.11"



üìä KS Statistic : Plus faible = Meilleur ajustement
   Distributions majoritaires : {'Log-normale': 5}

3Ô∏è‚É£ D√âTECTION DES OUTLIERS (Top 10 Cat√©gories)

M√©thode : Tukey's Fences (IQR √ó 1.5)
Outliers = Valeurs < Q1 - 1.5√óIQR  OU  > Q3 + 1.5√óIQR



Unnamed: 0,category,total_srs,outliers,outlier_pct,lower_outliers,upper_outliers,upper_fence_hours,avg_upper_outlier
0,Tax,100680,13902,13.81,0,13902,821.07,2556.6
1,Cash instruction,91822,15055,16.4,0,15055,43.47,308.19
2,BAU Asset Creation,82171,7876,9.58,0,7876,10.45,207.38
3,CREST,76847,13798,17.96,0,13798,4.96,156.91
4,Others,75748,12917,17.05,0,12917,44.63,402.72
5,Investigation Level 1,71499,10854,15.18,0,10854,44.79,403.11
6,Settlement,64965,7603,11.7,0,7603,183.23,770.24
7,FRPP,63229,10055,15.9,0,10055,44.25,390.14
8,ESMX,60837,10125,16.64,0,10125,6.68,152.81
9,DEFF,58446,12851,21.99,0,12851,14.15,172.98



üìä Moyenne d'outliers : 15.62%
   Cat√©gorie avec plus d'outliers : DEFF

üìà VISUALISATIONS STATISTIQUES

‚è≥ G√©n√©ration des Q-Q plots...



‚è≥ G√©n√©ration des histogrammes avec ajustements...



‚úÖ Cellule 2 termin√©e : Tests statistiques et ajustements de distributions effectu√©s


In [11]:
# ========================================================================
# CELLULE 3 : M√âDIANE DU TEMPS DE TRAITEMENT DES SR
# Calcul de la m√©diane globale et par dimension (Cat√©gorie, Desk, Activity)
# ========================================================================

print("="*80)
print("üìä ANALYSE DE LA M√âDIANE DU TEMPS DE TRAITEMENT")
print("="*80)

# ============================================================================
# 1. M√âDIANE GLOBALE
# ============================================================================
print("\n" + "="*80)
print("1Ô∏è‚É£ M√âDIANE GLOBALE DU TEMPS DE TRAITEMENT")
print("="*80)

# R√©cup√©rer tous les temps de traitement pour calcul des percentiles
print("‚è≥ Calcul des percentiles globaux...")

global_times = pd.read_sql_query("""
SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
FROM sr
WHERE CLOSINGDATE IS NOT NULL
  AND CREATIONDATE IS NOT NULL
  AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
""", conn)

times_array = global_times['hours'].values

# Calculer les statistiques
global_stats = {
    'total_srs': len(times_array),
    'mean_hours': round(np.mean(times_array), 2),
    'median_hours': round(np.percentile(times_array, 50), 2),
    'q25_hours': round(np.percentile(times_array, 25), 2),
    'q75_hours': round(np.percentile(times_array, 75), 2),
    'p90_hours': round(np.percentile(times_array, 90), 2),
    'p95_hours': round(np.percentile(times_array, 95), 2),
    'p99_hours': round(np.percentile(times_array, 99), 2)
}

global_median = pd.DataFrame([global_stats])

print("\nüìä STATISTIQUES GLOBALES (Tous SRs ferm√©s)\n")
print(f"Total SRs analys√©s : {global_stats['total_srs']:,}")
print(f"\nüéØ TEMPS DE TRAITEMENT :")
print(f"   ‚Ä¢ Moyenne        : {global_stats['mean_hours']:.2f}h ({global_stats['mean_hours']/24:.2f}j)")
print(f"   ‚Ä¢ M√©diane (P50)  : {global_stats['median_hours']:.2f}h ({global_stats['median_hours']/24:.2f}j)")
print(f"\nüìà QUARTILES :")
print(f"   ‚Ä¢ Q1 (P25)       : {global_stats['q25_hours']:.2f}h ({global_stats['q25_hours']/24:.2f}j)")
print(f"   ‚Ä¢ Q3 (P75)       : {global_stats['q75_hours']:.2f}h ({global_stats['q75_hours']/24:.2f}j)")
print(f"   ‚Ä¢ IQR            : {global_stats['q75_hours'] - global_stats['q25_hours']:.2f}h")
print(f"\nüîù PERCENTILES √âLEV√âS :")
print(f"   ‚Ä¢ P90            : {global_stats['p90_hours']:.2f}h ({global_stats['p90_hours']/24:.2f}j)")
print(f"   ‚Ä¢ P95            : {global_stats['p95_hours']:.2f}h ({global_stats['p95_hours']/24:.2f}j)")
print(f"   ‚Ä¢ P99            : {global_stats['p99_hours']:.2f}h ({global_stats['p99_hours']/24:.2f}j)")

display(global_median)

# Diff√©rence moyenne vs m√©diane
diff_pct = ((global_stats['mean_hours'] - global_stats['median_hours']) / 
            global_stats['median_hours']) * 100

print(f"\nüí° INSIGHT : La moyenne est {diff_pct:.1f}% {'sup√©rieure' if diff_pct > 0 else 'inf√©rieure'} √† la m√©diane")
if diff_pct > 20:
    print("   ‚Üí Distribution asym√©trique positive (skewed right) : pr√©sence de valeurs extr√™mes √©lev√©es")
elif diff_pct < -20:
    print("   ‚Üí Distribution asym√©trique n√©gative (skewed left) : pr√©sence de valeurs extr√™mes faibles")
else:
    print("   ‚Üí Distribution relativement sym√©trique")

# ============================================================================
# 2. M√âDIANE PAR CAT√âGORIE (Top 20)
# ============================================================================
print("\n" + "="*80)
print("2Ô∏è‚É£ M√âDIANE PAR CAT√âGORIE (Top 20)")
print("="*80)

# Utiliser les donn√©es d√©j√† calcul√©es dans category_stats
median_by_category = category_stats[['category_name', 'total_srs', 'mean_hours', 'median', 
                                      'q25', 'q75', 'iqr']].head(20).copy()
median_by_category['mean_median_diff'] = median_by_category['mean_hours'] - median_by_category['median']
median_by_category['mean_median_ratio'] = (median_by_category['mean_hours'] / 
                                           median_by_category['median']).round(2)

print("\nüìä Top 20 Cat√©gories par Volume\n")
display(median_by_category)

# Cat√©gories avec la plus grande diff√©rence moyenne-m√©diane
print("\nüéØ Cat√©gories avec plus grande asym√©trie (Moyenne >> M√©diane) :")
print("   ‚Üí Indique pr√©sence de cas extr√™mement longs\n")

high_skew = median_by_category.sort_values('mean_median_ratio', ascending=False).head(5)
for _, row in high_skew.iterrows():
    print(f"   ‚Ä¢ {row['category_name'][:40]:<40} : Ratio = {row['mean_median_ratio']:.2f}x")
    print(f"     M√©diane = {row['median']:.1f}h | Moyenne = {row['mean_hours']:.1f}h")

# ============================================================================
# 3. M√âDIANE PAR DESK (Top 20)
# ============================================================================
print("\n" + "="*80)
print("3Ô∏è‚É£ M√âDIANE PAR DESK (Top 20)")
print("="*80)

# R√©cup√©rer les desks avec volume > 100
desk_list = pd.read_sql_query("""
SELECT 
    JUR_DESK_ID as desk_id,
    COUNT(*) as total_srs,
    ROUND(AVG(CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL)), 2) as mean_hours
FROM sr
WHERE CLOSINGDATE IS NOT NULL
  AND CREATIONDATE IS NOT NULL
  AND JUR_DESK_ID IS NOT NULL
  AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
GROUP BY JUR_DESK_ID
HAVING COUNT(*) >= 100
ORDER BY COUNT(*) DESC
LIMIT 20
""", conn)

print("\n‚è≥ Calcul des m√©dianes par desk...")

median_by_desk_data = []

for idx, row in desk_list.iterrows():
    desk_id = row['desk_id']
    
    # R√©cup√©rer les temps pour ce desk
    times_query = f"""
    SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
    FROM sr
    WHERE JUR_DESK_ID = {desk_id}
      AND CLOSINGDATE IS NOT NULL
      AND CREATIONDATE IS NOT NULL
      AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
    """
    
    times_df = pd.read_sql_query(times_query, conn)
    times = times_df['hours'].values
    
    if len(times) > 0:
        median_by_desk_data.append({
            'desk_id': desk_id,
            'total_srs': len(times),
            'mean_hours': round(np.mean(times), 2),
            'median_hours': round(np.percentile(times, 50), 2),
            'q25_hours': round(np.percentile(times, 25), 2),
            'q75_hours': round(np.percentile(times, 75), 2)
        })

median_by_desk = pd.DataFrame(median_by_desk_data)
median_by_desk['mean_median_ratio'] = (median_by_desk['mean_hours'] / 
                                       median_by_desk['median_hours']).round(2)
median_by_desk['iqr'] = median_by_desk['q75_hours'] - median_by_desk['q25_hours']

print("\nüìä Top 20 Desks par Volume\n")
display(median_by_desk)

# ============================================================================
# 4. M√âDIANE PAR ACTIVIT√â (si activit√©s pr√©sentes)
# ============================================================================
print("\n" + "="*80)
print("4Ô∏è‚É£ M√âDIANE DES ACTIVIT√âS")
print("="*80)

print("\n‚è≥ Calcul des m√©dianes pour les activit√©s...")

activity_times = pd.read_sql_query("""
SELECT CAST((julianday(CLOSINGDATE) - julianday(CREATIONDATE)) * 24 AS REAL) as hours
FROM activity
WHERE CLOSINGDATE IS NOT NULL
  AND CREATIONDATE IS NOT NULL
  AND strftime('%Y-%m', CREATIONDATE) BETWEEN '2024-01' AND '2025-09'
""", conn)

act_times = activity_times['hours'].values

median_activity = pd.DataFrame([{
    'total_activities': len(act_times),
    'mean_hours': round(np.mean(act_times), 2),
    'median_hours': round(np.percentile(act_times, 50), 2),
    'p95_hours': round(np.percentile(act_times, 95), 2)
}])

print("\nüìä TEMPS DE TRAITEMENT DES ACTIVIT√âS\n")
print(f"Total activit√©s : {median_activity['total_activities'].values[0]:,}")
print(f"   ‚Ä¢ Moyenne       : {median_activity['mean_hours'].values[0]:.2f}h ({median_activity['mean_hours'].values[0]/24:.2f}j)")
print(f"   ‚Ä¢ M√©diane (P50) : {median_activity['median_hours'].values[0]:.2f}h ({median_activity['median_hours'].values[0]/24:.2f}j)")
print(f"   ‚Ä¢ P95           : {median_activity['p95_hours'].values[0]:.2f}h ({median_activity['p95_hours'].values[0]/24:.2f}j)")

display(median_activity)

# ============================================================================
# 5. VISUALISATIONS
# ============================================================================
print("\n" + "="*80)
print("üìà VISUALISATIONS - COMPARAISON MOYENNE VS M√âDIANE")
print("="*80)

# Graphique 1 : Comparaison Moyenne vs M√©diane par Cat√©gorie
fig1 = go.Figure()

top20_cat = median_by_category.head(20)

fig1.add_trace(go.Bar(
    y=top20_cat['category_name'],
    x=top20_cat['mean_hours'],
    name='Moyenne',
    orientation='h',
    marker_color='lightblue',
    text=top20_cat['mean_hours'].apply(lambda x: f"{x:.1f}h"),
    textposition='outside'
))

fig1.add_trace(go.Bar(
    y=top20_cat['category_name'],
    x=top20_cat['median'],
    name='M√©diane',
    orientation='h',
    marker_color='coral',
    text=top20_cat['median'].apply(lambda x: f"{x:.1f}h"),
    textposition='inside'
))

fig1.update_layout(
    title='Moyenne vs M√©diane du Temps de Traitement (Top 20 Cat√©gories)',
    xaxis_title='Temps (heures)',
    yaxis_title='Cat√©gorie',
    height=800,
    barmode='overlay',
    legend=dict(x=0.7, y=0.95)
)

fig1.show()

# Graphique 2 : Box plot global montrant la distribution
fig2 = go.Figure()

# Utiliser un √©chantillon pour visualisation
sample_times = np.random.choice(times_array, min(10000, len(times_array)), replace=False)

fig2.add_trace(go.Box(
    y=sample_times,
    name='Temps de traitement',
    marker_color='steelblue',
    boxmean='sd'  # Afficher moyenne et √©cart type
))

fig2.update_layout(
    title='Distribution Globale du Temps de Traitement (√âchantillon 10K SRs)',
    yaxis_title='Temps (heures)',
    yaxis_type='log',  # √âchelle log pour mieux voir
    height=600,
    showlegend=False
)

# Ajouter annotations pour m√©diane et moyenne
fig2.add_hline(y=global_stats['median_hours'], 
               line_dash="dash", line_color="red",
               annotation_text=f"M√©diane = {global_stats['median_hours']:.1f}h",
               annotation_position="right")

fig2.add_hline(y=global_stats['mean_hours'], 
               line_dash="dot", line_color="blue",
               annotation_text=f"Moyenne = {global_stats['mean_hours']:.1f}h",
               annotation_position="right")

fig2.show()

# Graphique 3 : Ratio Moyenne/M√©diane par cat√©gorie
fig3 = px.bar(
    median_by_category.sort_values('mean_median_ratio', ascending=True).head(20),
    x='mean_median_ratio',
    y='category_name',
    orientation='h',
    title='Ratio Moyenne/M√©diane par Cat√©gorie (Top 20)',
    labels={'mean_median_ratio': 'Ratio Moyenne/M√©diane', 'category_name': 'Cat√©gorie'},
    color='mean_median_ratio',
    color_continuous_scale='RdYlGn_r',
    text='mean_median_ratio'
)

fig3.add_vline(x=1, line_dash="dash", line_color="black",
               annotation_text="Ratio = 1 (sym√©trique)", annotation_position="top")

fig3.update_traces(texttemplate='%{text:.2f}x', textposition='outside')
fig3.update_layout(height=700)
fig3.show()

print("\n‚úÖ Cellule 3 termin√©e : Analyses de m√©diane compl√©t√©es")
print("\n" + "="*80)
print("üéâ ANALYSE COMPL√àTE TERMIN√âE")
print("="*80)
print("\nR√©sum√© des 3 cellules :")
print("  1Ô∏è‚É£ √âcarts types et statistiques descriptives par cat√©gorie/desk")
print("  2Ô∏è‚É£ Tests de normalit√© et ajustement de distributions")
print("  3Ô∏è‚É£ M√©dianes et percentiles par dimension")
print("\nüìä Toutes les analyses statistiques ont √©t√© effectu√©es avec succ√®s !")

üìä ANALYSE DE LA M√âDIANE DU TEMPS DE TRAITEMENT

1Ô∏è‚É£ M√âDIANE GLOBALE DU TEMPS DE TRAITEMENT
‚è≥ Calcul des percentiles globaux...

üìä STATISTIQUES GLOBALES (Tous SRs ferm√©s)

Total SRs analys√©s : 4,116,982

üéØ TEMPS DE TRAITEMENT :
   ‚Ä¢ Moyenne        : 143.45h (5.98j)
   ‚Ä¢ M√©diane (P50)  : 2.01h (0.08j)

üìà QUARTILES :
   ‚Ä¢ Q1 (P25)       : 0.20h (0.01j)
   ‚Ä¢ Q3 (P75)       : 44.52h (1.86j)
   ‚Ä¢ IQR            : 44.32h

üîù PERCENTILES √âLEV√âS :
   ‚Ä¢ P90            : 247.88h (10.33j)
   ‚Ä¢ P95            : 667.91h (27.83j)
   ‚Ä¢ P99            : 2879.53h (119.98j)


Unnamed: 0,total_srs,mean_hours,median_hours,q25_hours,q75_hours,p90_hours,p95_hours,p99_hours
0,4116982,143.45,2.01,0.2,44.52,247.88,667.91,2879.53



üí° INSIGHT : La moyenne est 7036.8% sup√©rieure √† la m√©diane
   ‚Üí Distribution asym√©trique positive (skewed right) : pr√©sence de valeurs extr√™mes √©lev√©es

2Ô∏è‚É£ M√âDIANE PAR CAT√âGORIE (Top 20)

üìä Top 20 Cat√©gories par Volume



Unnamed: 0,category_name,total_srs,mean_hours,median,q25,q75,iqr,mean_median_diff,mean_median_ratio
0,Tax,100680,457.93,69.47,3.05,330.26,327.21,388.46,6.59
1,Cash instruction,91822,53.72,1.01,0.12,17.46,17.34,52.71,53.19
2,BAU Asset Creation,82171,21.78,1.58,0.33,4.38,4.05,20.2,13.78
3,CREST,76847,28.73,0.47,0.11,2.05,1.94,28.26,61.13
4,Others,75748,72.0,1.01,0.08,17.9,17.82,70.99,71.29
5,Investigation Level 1,71499,65.01,1.26,0.12,17.99,17.87,63.75,51.6
6,Settlement,64965,116.3,17.88,0.85,73.8,72.95,98.42,6.5
7,FRPP,63229,65.41,1.07,0.17,17.8,17.63,64.34,61.13
8,ESMX,60837,26.19,0.49,0.08,2.72,2.64,25.7,53.45
9,DEFF,58446,38.88,0.73,0.12,5.73,5.61,38.15,53.26



üéØ Cat√©gories avec plus grande asym√©trie (Moyenne >> M√©diane) :
   ‚Üí Indique pr√©sence de cas extr√™mement longs

   ‚Ä¢ Others                                   : Ratio = 71.29x
     M√©diane = 1.0h | Moyenne = 72.0h
   ‚Ä¢ CREST                                    : Ratio = 61.13x
     M√©diane = 0.5h | Moyenne = 28.7h
   ‚Ä¢ FRPP                                     : Ratio = 61.13x
     M√©diane = 1.1h | Moyenne = 65.4h
   ‚Ä¢ ESMX                                     : Ratio = 53.45x
     M√©diane = 0.5h | Moyenne = 26.2h
   ‚Ä¢ DEFF                                     : Ratio = 53.26x
     M√©diane = 0.7h | Moyenne = 38.9h

3Ô∏è‚É£ M√âDIANE PAR DESK (Top 20)

‚è≥ Calcul des m√©dianes par desk...

üìä Top 20 Desks par Volume



Unnamed: 0,desk_id,total_srs,mean_hours,median_hours,q25_hours,q75_hours,mean_median_ratio,iqr
0,73044.0,131442,9.14,0.35,0.13,0.85,26.11,0.72
1,73049.0,82505,24.05,1.61,0.34,4.44,14.94,4.1
2,73246.0,73101,27.64,0.48,0.12,2.05,57.58,1.93
3,73046.0,64025,3.83,0.22,0.09,0.47,17.41,0.38
4,73059.0,60202,24.98,0.36,0.06,2.96,69.39,2.9
5,72956.0,50130,121.34,6.38,0.78,71.53,19.02,70.75
6,83807.0,49647,102.49,4.87,0.66,65.68,21.05,65.02
7,77933.0,48009,447.59,48.72,2.25,332.72,9.19,330.47
8,73203.0,45747,70.84,5.35,1.39,25.31,13.24,23.92
9,73195.0,45634,36.29,0.76,0.07,4.36,47.75,4.29



4Ô∏è‚É£ M√âDIANE DES ACTIVIT√âS

‚è≥ Calcul des m√©dianes pour les activit√©s...

üìä TEMPS DE TRAITEMENT DES ACTIVIT√âS

Total activit√©s : 338,158
   ‚Ä¢ Moyenne       : 42.11h (1.75j)
   ‚Ä¢ M√©diane (P50) : 0.41h (0.02j)
   ‚Ä¢ P95           : 168.37h (7.02j)


Unnamed: 0,total_activities,mean_hours,median_hours,p95_hours
0,338158,42.11,0.41,168.37



üìà VISUALISATIONS - COMPARAISON MOYENNE VS M√âDIANE



‚úÖ Cellule 3 termin√©e : Analyses de m√©diane compl√©t√©es

üéâ ANALYSE COMPL√àTE TERMIN√âE

R√©sum√© des 3 cellules :
  1Ô∏è‚É£ √âcarts types et statistiques descriptives par cat√©gorie/desk
  2Ô∏è‚É£ Tests de normalit√© et ajustement de distributions
  3Ô∏è‚É£ M√©dianes et percentiles par dimension

üìä Toutes les analyses statistiques ont √©t√© effectu√©es avec succ√®s !
