In [8]:
import pandas as pd
import numpy as np
import sqlite3
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path

# Location of the SQLite database, resolved under the current user's home directory.
DB_PATH = Path.home() / 'Desktop/BNP Paribas/Data/Processed/hobart_database.db'

# Load every `sr` row that has both a creation and a closing date, together
# with the name of its category (inner join on `category`).
# The connection is released in a `finally` clause so that a failing query
# (missing table, locked file, ...) does not leave the database handle open.
conn = sqlite3.connect(str(DB_PATH))
try:
    df = pd.read_sql_query("""
SELECT 
    c.NAME as category,
    sr.CREATIONDATE,
    sr.CLOSINGDATE,
    sr.EXPIRATION_DATE
FROM sr
JOIN category c ON sr.CATEGORY_ID = c.ID
WHERE sr.CLOSINGDATE IS NOT NULL 
  AND sr.CREATIONDATE IS NOT NULL
""", conn)
finally:
    conn.close()

# Parse the timestamp columns. Creation/closing dates must parse (an invalid
# value raises); an unparseable expiration date becomes NaT instead.
for _date_col in ('CREATIONDATE', 'CLOSINGDATE'):
    df[_date_col] = pd.to_datetime(df[_date_col], format='mixed')
df['EXPIRATION_DATE'] = pd.to_datetime(df['EXPIRATION_DATE'], format="mixed", errors='coerce')

# Durations measured from the creation date, expressed in days
# (86400 seconds per day).
_created = df['CREATIONDATE']
df['resolution_days'] = (df['CLOSINGDATE'] - _created).dt.total_seconds() / 86400
df['deadline_days'] = (df['EXPIRATION_DATE'] - _created).dt.total_seconds() / 86400

# Drop outliers: keep resolutions between 0 and 365 days, bounds included.
_plausible = df['resolution_days'].between(0, 365)
df = df[_plausible]

# Per-category statistics: number of rows, mean resolution time and mean
# deadline (both in days).
_by_category = df.groupby('category')
cat_stats = _by_category.agg(
    count=('resolution_days', 'size'),
    avg_resolution=('resolution_days', 'mean'),
    avg_deadline=('deadline_days', 'mean')
)

# Speed rankings only consider categories with at least 1000 rows.
cat_stats_1k = cat_stats[cat_stats['count'] >= 1000]
_fastest_first = cat_stats_1k.sort_values('avg_resolution')

# Ten fastest categories (ascending mean resolution).
top10_speed = _fastest_first.head(10)
# Ten slowest categories, re-sorted so that the slowest comes first.
flop10 = _fastest_first.tail(10).sort_values('avg_resolution', ascending=False)
# Ten largest categories by volume, among those with at least 100 rows.
_min_100 = cat_stats[cat_stats['count'] >= 100]
top10_volume = _min_100.sort_values('count', ascending=False).head(10)

# Histogram bin edges (in days) and the label of each bin; there is one label
# per pair of consecutive edges.
all_bin_edges = np.array([0, 0.5, 1, 2, 3, 5, 7, 14, 30, 60, 90, 180, 365])
all_bin_labels = [
    '<12h', '12h-1j', '1-2j', '2-3j', '3-5j', '5-7j',
    '7-14j', '14-30j', '30-60j', '60-90j', '90-180j', '180-365j',
]


def make_category_grid(cat_list, title, color_bars, color_deadline, max_days=365, min_days=0):
    """Build a 2x5 grid of resolution-time histograms, one subplot per category.

    The function reads the module-level ``df`` (columns ``category`` and
    ``resolution_days``) as well as ``all_bin_edges`` / ``all_bin_labels``.

    Args:
        cat_list: DataFrame indexed by category name, with the columns
            ``count``, ``avg_resolution`` and ``avg_deadline``. Only the
            first 10 rows are drawn (the grid has 10 cells).
        title: Figure title.
        color_bars: Colour of the histogram bars.
        color_deadline: Colour of the dashed line and label that mark the
            mean deadline.
        max_days: Upper bound of the histogram range, in days. When it is not
            one of the predefined edges, a final bin ending at ``max_days``
            is appended and labelled accordingly.
        min_days: Lower bound of the histogram range, in days. The first bin
            starts at the last predefined edge that is <= ``min_days``.

    Returns:
        The figure created by ``make_subplots``, with all traces, deadline
        markers and annotations added.

    Raises:
        ValueError: If the requested range does not contain at least one bin.
    """
    # Select the predefined edges covering [min_days, max_days]: start at the
    # last edge <= min_days and stop at the last edge <= max_days.
    start = np.searchsorted(all_bin_edges, min_days, side='right')
    if start > 0:
        start -= 1
    cut = np.searchsorted(all_bin_edges, max_days, side='right')
    edges = all_bin_edges[start:cut]
    if len(edges) == 0 or max_days <= edges[0]:
        raise ValueError(
            f"max_days={max_days!r} leaves no histogram bin above min_days={min_days!r}"
        )

    labels = list(all_bin_labels[start:start + len(edges) - 1])
    if edges[-1] < max_days:
        # max_days falls between (or beyond) the predefined edges: add a
        # closing bin and give it a label that matches its real bounds, so
        # that labels and counts always have the same length.
        labels.append(f"{edges[-1]:g}-{max_days:g}j")
        edges = np.append(edges, max_days)

    rows, cols = 2, 5
    # The grid only has rows * cols cells; drawing more categories would
    # address a subplot row that does not exist.
    cat_list = cat_list.head(rows * cols)

    # Text for the lower threshold shown in the per-subplot note
    # (hours below one day, days otherwise), e.g. 0.5 -> "12h".
    threshold_label = f"{min_days * 24:g}h" if min_days < 1 else f"{min_days:g}j"

    fig = make_subplots(
        rows=rows, cols=cols,
        subplot_titles=[f"<b>{cat}</b>" for cat in cat_list.index],
        horizontal_spacing=0.05,
        vertical_spacing=0.12
    )

    for idx, (cat, row) in enumerate(cat_list.iterrows()):
        # Row-major placement in the grid (1-based indices).
        r = idx // cols + 1
        c = idx % cols + 1

        cat_data = df.loc[df['category'] == cat, 'resolution_days']
        counts, _ = np.histogram(cat_data, bins=edges)

        # Bars: distribution of the resolution time.
        fig.add_trace(go.Bar(
            x=labels,
            y=counts,
            marker_color=color_bars,
            opacity=0.85,
            name='SRs',
            showlegend=(idx == 0),
            hovertemplate='<b>%{x}</b><br>SRs: %{y:,}<extra></extra>'
        ), row=r, col=c)

        # Vertical dashed line on the bin that contains the mean deadline,
        # drawn only when that deadline lies inside the plotted range.
        deadline = row['avg_deadline']
        if not np.isnan(deadline) and min_days < deadline <= max_days:
            deadline_bin_idx = np.searchsorted(edges, deadline, side='right') - 1
            # A deadline equal to the last edge belongs to the last bin.
            deadline_bin_idx = min(deadline_bin_idx, len(labels) - 1)
            deadline_label = labels[deadline_bin_idx]

            # Avoid a zero height when every bin is empty.
            y_max = counts.max() if counts.max() > 0 else 1

            fig.add_vline(
                x=deadline_label, row=r, col=c,
                line=dict(color=color_deadline, width=2, dash='dash'),
            )
            fig.add_annotation(
                x=deadline_label, y=y_max * 0.95,
                text=f"Deadline<br>{deadline:.1f}j",
                showarrow=False,
                font=dict(size=8, color=color_deadline),
                row=r, col=c
            )

        # Statistics note in the top-right corner of the subplot. The axis
        # references rely on the subplot axes being named x, x2, x3, ... in
        # the same order as idx.
        axis_suffix = "" if idx == 0 else str(idx + 1)
        pct_filtered = (cat_data >= min_days).sum() / len(cat_data) * 100 if len(cat_data) > 0 else 0
        # Share of rows below the lower threshold, shown only when a
        # threshold is applied.
        note_filter = f"<br>({100 - pct_filtered:.0f}% < {threshold_label})" if min_days > 0 else ""
        fig.add_annotation(
            x=0.95, y=0.85,
            xref=f"x{axis_suffix} domain", yref=f"y{axis_suffix} domain",
            text=f"n={row['count']:,.0f}<br>Moy: {row['avg_resolution']:.1f}j{note_filter}",
            showarrow=False,
            font=dict(size=8, color='#555'),
            align='right',
            xanchor='right'
        )

        # Axes
        fig.update_xaxes(tickangle=45, tickfont=dict(size=7), row=r, col=c)
        fig.update_yaxes(tickfont=dict(size=7), row=r, col=c)

    fig.update_layout(
        title=dict(text=title, font=dict(size=18)),
        height=600,
        width=1400,
        showlegend=False,
        plot_bgcolor='white',
        margin=dict(t=100, b=50, l=50, r=30)
    )
    fig.update_xaxes(showgrid=False)
    fig.update_yaxes(showgrid=True, gridcolor='#ecf0f1')

    return fig


# --- TOP 10 by speed: histogram restricted to the 0.5-30 day range ---
# NOTE(review): the title strings below look mis-encoded (accented letters and
# emoji appear garbled) -- confirm the file encoding before changing them.
fig_top = make_category_grid(
    cat_list=top10_speed,
    title=(
        "üèÜ <b>TOP 10 ‚Äî Cat√©gories les plus rapides (‚â• 1 000 SRs)</b><br>"
        "<sup>Distribution hors &lt;12h (‚â§ 30j) | Ligne pointill√©e = deadline moyenne</sup>"
    ),
    color_bars='#2ecc71',
    color_deadline='#e74c3c',
    min_days=0.5,
    max_days=30,
)
fig_top.show()

# --- TOP 10 by volume: full 0-365 day range ---
fig_vol = make_category_grid(
    cat_list=top10_volume,
    title=(
        "üìä <b>TOP 10 ‚Äî Cat√©gories avec le plus de SRs</b><br>"
        "<sup>Distribution du temps de r√©solution | Ligne pointill√©e = deadline moyenne</sup>"
    ),
    color_bars='#3498db',
    color_deadline='#e74c3c',
    max_days=365,
)
fig_vol.show()

# --- FLOP 10 (slowest categories): full 0-365 day range ---
fig_flop = make_category_grid(
    cat_list=flop10,
    title=(
        "‚ö†Ô∏è <b>FLOP 10 ‚Äî Cat√©gories les plus lentes (‚â• 1 000 SRs)</b><br>"
        "<sup>Distribution du temps de r√©solution | Ligne pointill√©e = deadline moyenne</sup>"
    ),
    color_bars='#e74c3c',
    color_deadline='#2ecc71',
    max_days=365,
)
fig_flop.show()

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path

# --- Connection & query ---
# Location of the SQLite database, resolved under the current user's home directory.
DB_PATH = Path.home() / 'Desktop/BNP Paribas/Data/Processed/hobart_database.db'

# One row per `sr` record that has both a creation and a closing date, with
# the number of matching `activity` rows (0 when there is none, thanks to the
# LEFT JOIN combined with COUNT on the activity id).
# The connection is released in a `finally` clause so that a failing query
# does not leave the database handle open.
conn = sqlite3.connect(str(DB_PATH))
try:
    df = pd.read_sql_query("""
SELECT 
    sr.ID as sr_id,
    sr.CREATIONDATE,
    sr.CLOSINGDATE,
    COUNT(a.ID) as nb_activities
FROM sr
LEFT JOIN activity a ON a.SR_ID = sr.ID
WHERE sr.CLOSINGDATE IS NOT NULL 
  AND sr.CREATIONDATE IS NOT NULL
GROUP BY sr.ID, sr.CREATIONDATE, sr.CLOSINGDATE
""", conn)
finally:
    conn.close()

# Parse the timestamps and compute the resolution time in hours
# (3600 seconds per hour).
df['CREATIONDATE'] = pd.to_datetime(df['CREATIONDATE'], format='mixed')
df['CLOSINGDATE'] = pd.to_datetime(df['CLOSINGDATE'], format='mixed')
df['resolution_hours'] = (df['CLOSINGDATE'] - df['CREATIONDATE']).dt.total_seconds() / 3600

# Drop outliers: keep resolutions between 0 and 365 days, bounds included.
# The explicit copy makes the filtered frame independent from the original,
# so that adding columns to it later does not raise SettingWithCopyWarning.
df = df[(df['resolution_hours'] >= 0) & (df['resolution_hours'] <= 365 * 24)].copy()

# ‚îÄ‚îÄ‚îÄ Cat√©gorisation par nombre d'activities ‚îÄ‚îÄ‚îÄ
def categorize_activities(n):
    """Return the activity-count bucket label for *n*.

    ``0`` maps to ``'0 activity'``, ``1`` to ``'1 activity'`` and any other
    value to ``'2+ activities'``.
    """
    if n == 0:
        return '0 activity'
    return '1 activity' if n == 1 else '2+ activities'

# Bucket every row by its number of activities.
df['activity_group'] = df['nb_activities'].apply(categorize_activities)

# --- Statistics per group ---
# Row count, summed / mean / median resolution time (hours) for each bucket,
# presented in a fixed order; a bucket with no rows shows up as NaN.
group_order = ['0 activity', '1 activity', '2+ activities']
_grouped = df.groupby('activity_group')
stats = _grouped.agg(
    nb_sr=('sr_id', 'count'),
    total_hours=('resolution_hours', 'sum'),
    avg_hours=('resolution_hours', 'mean'),
    median_hours=('resolution_hours', 'median')
)
stats = stats.reindex(group_order)

# Share of each bucket in the total volume and in the total resolution time.
for _source, _share in (('nb_sr', 'pct_sr'), ('total_hours', 'pct_hours')):
    stats[_share] = stats[_source] / stats[_source].sum() * 100

# Text report.
# NOTE(review): accented letters in the printed strings look mis-encoded --
# confirm the file encoding before changing them.
_banner = "=" * 70
print(_banner, "ANALYSE DES SR PAR NOMBRE D'ACTIVITIES", _banner, sep="\n")
print(f"\nTotal SRs analys√©s : {stats['nb_sr'].sum():,.0f}")
print(f"Total heures de r√©solution : {stats['total_hours'].sum():,.0f} h")
print()

for group, row in stats.iterrows():
    print(f"  {group:15s} | {row['nb_sr']:>10,.0f} SRs ({row['pct_sr']:5.1f}%) "
          f"| {row['total_hours']:>12,.0f} h ({row['pct_hours']:5.1f}%) "
          f"| Moy: {row['avg_hours']:>8,.1f} h | M√©d: {row['median_hours']:>8,.1f} h")

# --- Charts ---
# One colour per activity bucket, in the same order as `group_order`.
colors = ['#3498db', '#2ecc71', '#e74c3c']

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "pie"}, {"type": "pie"}]],
    subplot_titles=[
        "<b>R√©partition des SRs par nombre d'activities</b>",
        "<b>R√©partition du temps de r√©solution</b>"
    ]
)

# Two donuts side by side: share of rows (left) and share of the summed
# resolution time (right). Only the values and the text templates differ.
_donuts = [
    dict(
        values=stats['nb_sr'].values,
        texttemplate='<b>%{label}</b><br>%{value:,.0f} SRs<br>(%{percent})',
        hovertemplate='<b>%{label}</b><br>SRs: %{value:,.0f}<br>%{percent}<extra></extra>',
        name='SRs',
    ),
    dict(
        values=stats['total_hours'].values,
        texttemplate='<b>%{label}</b><br>%{value:,.0f} h<br>(%{percent})',
        hovertemplate='<b>%{label}</b><br>Heures: %{value:,.0f}<br>%{percent}<extra></extra>',
        name='Heures',
    ),
]
for _col, _donut in enumerate(_donuts, start=1):
    fig.add_trace(
        go.Pie(
            labels=group_order,
            hole=0.5,
            marker_colors=colors,
            textinfo='label+percent',
            textposition='outside',
            **_donut,
        ),
        row=1, col=_col,
    )

_donut_title = (
    "üìä <b>Analyse des SRs par nombre d'activities</b><br>"
    "<sup>R√©partition en volume (SRs) et en temps de r√©solution (heures)</sup>"
)
fig.update_layout(
    title={'text': _donut_title, 'font': {'size': 18}},
    height=500,
    width=1100,
    showlegend=False,
    plot_bgcolor='white',
    margin={'t': 120, 'b': 50},
)
fig.show()

# --- Bar chart: mean and median resolution time per activity bucket ---
fig2 = go.Figure()

# (column of `stats`, legend name, bar colour, hover template) for each series.
_series = (
    ('avg_hours', 'Moyenne', '#3498db',
     '<b>%{x}</b><br>Temps moyen: %{y:,.1f} h<extra></extra>'),
    ('median_hours', 'M√©diane', '#2ecc71',
     '<b>%{x}</b><br>Temps m√©dian: %{y:,.1f} h<extra></extra>'),
)
for _column, _name, _color, _hover in _series:
    _hours = stats[_column].values
    fig2.add_trace(go.Bar(
        x=group_order,
        y=_hours,
        name=_name,
        marker_color=_color,
        # Value printed above each bar, e.g. "145.4 h".
        text=[f"{v:,.1f} h" for v in _hours],
        textposition='outside',
        hovertemplate=_hover,
    ))

_bar_title = (
    "‚è±Ô∏è <b>Temps de r√©solution moyen et m√©dian par groupe d'activities</b><br>"
    "<sup>Comparaison entre SRs avec 0, 1 ou 2+ activities</sup>"
)
fig2.update_layout(
    title={'text': _bar_title, 'font': {'size': 16}},
    xaxis_title="Nombre d'activities",
    yaxis_title="Heures",
    barmode='group',
    height=450,
    width=800,
    plot_bgcolor='white',
    # Horizontal legend placed just above the plot area, right-aligned.
    legend={'orientation': 'h', 'yanchor': 'bottom', 'y': 1.02, 'xanchor': 'right', 'x': 1},
    margin={'t': 120},
)
fig2.update_yaxes(showgrid=True, gridcolor='#ecf0f1')
fig2.show()

ANALYSE DES SR PAR NOMBRE D'ACTIVITIES

Total SRs analysés : 2,311,676
Total heures de résolution : 356,982,235 h

  0 activity      |  2,189,170 SRs ( 94.7%) |  318,275,354 h ( 89.2%) | Moy:    145.4 h | Méd:      2.1 h
  1 activity      |     99,742 SRs (  4.3%) |   21,533,406 h (  6.0%) | Moy:    215.9 h | Méd:      4.5 h
  2+ activities   |     22,764 SRs (  1.0%) |   17,173,475 h (  4.8%) | Moy:    754.4 h | Méd:    118.9 h
