# BNP Paribas Securities Services - Exploratory Data Analysis

**Période** : Janvier 2024 - Septembre 2025

---
## 1. Configuration & Connexion à la Base de Données

In [1]:
import pandas as pd
import numpy as np
import sqlite3
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from pathlib import Path
from datetime import datetime, timedelta
import warnings
# NOTE(review): this silences every warning for the whole kernel session,
# including deprecation notices from pandas/plotly — consider narrowing it.
warnings.filterwarnings('ignore')

# Plotly configuration: use the white template for every figure in the notebook
import plotly.io as pio
pio.templates.default = "plotly_white"

# Path configuration (database location relative to the user's home directory)
BASE_DIR = Path.home() / 'Desktop/BNP Paribas'
DB_PATH = BASE_DIR / 'Data/Processed/hobart_database.db'

# sqlite3.connect() creates a brand-new empty database when the target file
# does not exist, which would make this cell report a "successful" connection
# with zero tables. Fail early with an explicit error instead.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"Fichier SQLite introuvable : {DB_PATH}")

# Open the database connection shared by the rest of the notebook
conn = sqlite3.connect(str(DB_PATH))

# Sanity-check the connection by counting the objects registered in the catalog
tables_count = pd.read_sql_query(
    "SELECT COUNT(*) as count FROM sqlite_master WHERE type='table';", conn
)['count'].iloc[0]
views_count = pd.read_sql_query(
    "SELECT COUNT(*) as count FROM sqlite_master WHERE type='view';", conn
)['count'].iloc[0]

print("="*70)
print("üìä CONNEXION √Ä LA BASE DE DONN√âES HOBART")
print("="*70)
print("\n‚úÖ Connexion r√©ussie")
print(f"üìÇ Emplacement: {DB_PATH}")
print(f"üìã {tables_count} tables disponibles")
print(f"üëÅÔ∏è  {views_count} vues cr√©√©es\n")

üìä CONNEXION √Ä LA BASE DE DONN√âES HOBART

‚úÖ Connexion r√©ussie
üìÇ Emplacement: /Users/jo/Desktop/BNP Paribas/Data/Processed/hobart_database.db
üìã 14 tables disponibles
üëÅÔ∏è  8 vues cr√©√©es



---
## 2. Structure de la Base de Données

### Tables et Vues Disponibles

In [2]:
# Inventory of tables, alphabetically, straight from the SQLite catalog
tables = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;", conn)

print("="*70, "üìã TABLES PRINCIPALES", "="*70, sep="\n")
for table in tables['name'].tolist():
    # sqlite_stat1 is a system table, not business data: leave it out
    if table == 'sqlite_stat1':
        continue
    print(f"  ‚Ä¢ {table}")

print(f"\n{'='*70}")

# Inventory of views, alphabetically
views = pd.read_sql_query("SELECT name FROM sqlite_master WHERE type='view' ORDER BY name;", conn)

print("üëÅÔ∏è  VUES OPTIMIS√âES", "="*70, sep="\n")
for view in views['name'].tolist():
    print(f"  ‚Ä¢ {view}")

# Two trailing blank lines to separate this output from the next section
print("\n")

üìã TABLES PRINCIPALES
  ‚Ä¢ activity
  ‚Ä¢ businessline
  ‚Ä¢ businesslineactivity
  ‚Ä¢ businesslineprocess
  ‚Ä¢ category
  ‚Ä¢ deskbusinesslinelink
  ‚Ä¢ history_activity
  ‚Ä¢ history_communication
  ‚Ä¢ history_sr
  ‚Ä¢ jur_user
  ‚Ä¢ label
  ‚Ä¢ sr
  ‚Ä¢ srcontact

üëÅÔ∏è  VUES OPTIMIS√âES
  ‚Ä¢ activity_history_view
  ‚Ä¢ complete_view
  ‚Ä¢ contact_effectiveness_metrics
  ‚Ä¢ monthly_desk_metrics
  ‚Ä¢ sr_activity_view
  ‚Ä¢ sr_history_view
  ‚Ä¢ sr_lifecycle_summary
  ‚Ä¢ weekly_sr_distribution




---
## 3. Vue d'Ensemble des Données

### Statistiques Globales

In [12]:
# Database-wide counters, gathered in a single one-row query
stats = pd.read_sql_query("""
SELECT 
    (SELECT COUNT(*) FROM sr) as total_srs,
    (SELECT COUNT(*) FROM activity) as total_activities,
    (SELECT COUNT(*) FROM srcontact) as total_contacts,
    (SELECT COUNT(*) FROM history_sr) as total_sr_history,
    (SELECT COUNT(DISTINCT CATEGORY_ID) FROM sr) as unique_categories,
    (SELECT COUNT(DISTINCT JUR_DESK_ID) FROM sr) as unique_desks
""", conn)

print("="*70, "üìä STATISTIQUES GLOBALES DE LA BASE", "="*70, sep="\n")

# Row volumes of the core tables
print(f"\nüìã Service Requests (SRs) : {stats.at[0, 'total_srs']:,}")
print(f"üéØ Activit√©s : {stats.at[0, 'total_activities']:,}")
print(f"üìû Communications (Contacts) : {stats.at[0, 'total_contacts']:,}")
print(f"üìú Historique SR (lignes) : {stats.at[0, 'total_sr_history']:,}")

# Distinct category and desk identifiers referenced by the sr table
print(f"\nüìÇ Cat√©gories uniques : {stats.at[0, 'unique_categories']:,}")
print(f"üè¢ Desks actifs : {stats.at[0, 'unique_desks']:,}")
print("\n")

üìä STATISTIQUES GLOBALES DE LA BASE

üìã Service Requests (SRs) : 4,233,963
üéØ Activit√©s : 348,101
üìû Communications (Contacts) : 11,903,551
üìú Historique SR (lignes) : 25,587,996

üìÇ Cat√©gories uniques : 1,548
üè¢ Desks actifs : 761




---
## 4. Période Temporelle Couverte

In [13]:
# Time span covered by the SR creation dates (rows without a date are excluded)
date_range = pd.read_sql_query("""
SELECT 
    MIN(CREATIONDATE) as first_sr,
    MAX(CREATIONDATE) as last_sr,
    COUNT(DISTINCT strftime('%Y-%m', CREATIONDATE)) as unique_months
FROM sr
WHERE CREATIONDATE IS NOT NULL
""", conn)

# The query returns a single row; convert the two bounds to pandas timestamps
first_date = pd.to_datetime(date_range.at[0, 'first_sr'])
last_date = pd.to_datetime(date_range.at[0, 'last_sr'])
months_span = date_range.at[0, 'unique_months']

# Whole days elapsed between the first and the last SR
duration_days = (last_date - first_date).days

print("="*70, "üìÖ P√âRIODE TEMPORELLE", "="*70, sep="\n")
print(f"\nüóìÔ∏è  Premier SR : {first_date.strftime('%d %B %Y')}")
print(f"üóìÔ∏è  Dernier SR : {last_date.strftime('%d %B %Y')}")
print(f"üìä Mois couverts : {months_span} mois")
print(f"‚è±Ô∏è  Dur√©e totale : {duration_days} jours\n")

üìÖ P√âRIODE TEMPORELLE

üóìÔ∏è  Premier SR : 01 January 2024
üóìÔ∏è  Dernier SR : 12 October 2025
üìä Mois couverts : 22 mois
‚è±Ô∏è  Dur√©e totale : 650 jours



---
## 5. Distribution des Volumes par Table

In [14]:
# Row volume of each main table
table_volumes = []
main_tables = ['sr', 'activity', 'srcontact', 'history_sr', 'history_activity', 'history_communication']

for table in main_tables:
    try:
        count = pd.read_sql_query(f"SELECT COUNT(*) as count FROM {table}", conn)['count'].iloc[0]
    except Exception as exc:
        # Stay best-effort (a missing table must not abort the overview), but
        # report the failure instead of hiding it. Catching Exception rather
        # than using a bare except lets KeyboardInterrupt stop a long COUNT.
        print(f"Lecture impossible pour la table '{table}' : {exc}")
        continue
    table_volumes.append({'Table': table.upper(), 'Lignes': count})

# Explicit columns keep sort_values() working even if every table was skipped
df_volumes = (
    pd.DataFrame(table_volumes, columns=['Table', 'Lignes'])
    .sort_values('Lignes', ascending=False)
)

print("="*70)
print("üìä VOLUME DE DONN√âES PAR TABLE")
print("="*70)
print()
display(df_volumes)

# Bar chart of the same volumes, shaded by row count
fig = px.bar(
    df_volumes,
    x='Table',
    y='Lignes',
    title='Volume de Lignes par Table Principale',
    text='Lignes',
    color='Lignes',
    color_continuous_scale='Blues'
)

# Thousands separators on the bar labels, placed above each bar
fig.update_traces(texttemplate='%{text:,.0f}', textposition='outside')
fig.update_layout(height=500, showlegend=False)
fig.show()

üìä VOLUME DE DONN√âES PAR TABLE



Unnamed: 0,Table,Lignes
5,HISTORY_COMMUNICATION,29115672
3,HISTORY_SR,25587996
2,SRCONTACT,11903551
0,SR,4233963
4,HISTORY_ACTIVITY,1810203
1,ACTIVITY,348101


---
## 6. Structure des Tables Principales

### Inspection des Colonnes et Types de Données

In [15]:
# Helper that prints the structure of a table
def show_table_structure(table_name, connection=None):
    """Print the columns, types, constraints and row count of a table.

    Args:
        table_name: Name of the table to inspect. It is interpolated directly
            into the SQL text, so it must be a trusted identifier.
        connection: Optional database connection to query. When omitted, the
            notebook-level ``conn`` is used, as before.

    Returns:
        None. The report is written to standard output.
    """
    db = conn if connection is None else connection
    schema = pd.read_sql_query(f"PRAGMA table_info({table_name})", db)

    print(f"\n{'='*70}")
    print(f"üìã TABLE: {table_name.upper()}")
    print(f"{'='*70}\n")

    # One line per column: name, declared type, then constraint markers
    columns = zip(schema['name'], schema['type'], schema['pk'], schema['notnull'])
    for col_name, col_type, pk_position, notnull_flag in columns:
        # PRAGMA table_info reports pk as the 1-based position of the column
        # inside the primary key (0 when not part of it), so every non-zero
        # value belongs to the key — not only the value 1.
        pk = " üîë PRIMARY KEY" if pk_position > 0 else ""
        notnull = " ‚ö†Ô∏è NOT NULL" if notnull_flag == 1 else ""

        print(f"  ‚Ä¢ {col_name:<30} {col_type:<15} {pk}{notnull}")

    # Total number of rows in the table
    count = pd.read_sql_query(f"SELECT COUNT(*) as count FROM {table_name}", db)['count'].iloc[0]
    print(f"\n  üìä Total de lignes : {count:,}\n")

# Print the structure report for each of the three core tables, in order
for table in ('sr', 'activity', 'srcontact'):
    show_table_structure(table)


üìã TABLE: SR

  ‚Ä¢ ID                             INTEGER         
  ‚Ä¢ SRNUMBER                       TEXT            
  ‚Ä¢ CATEGORY_ID                    INTEGER         
  ‚Ä¢ SUBCATEGORY_ID                 INTEGER         
  ‚Ä¢ JUR_ASSIGNEE_ID                REAL            
  ‚Ä¢ STATUS_ID                      INTEGER         
  ‚Ä¢ EXPECTED_FIRST_RESPONSE_DATE   TIMESTAMP       
  ‚Ä¢ EXPECTED_ACKNOWLEDGEMENT_DATE  TIMESTAMP       
  ‚Ä¢ EXPIRATION_DATE                TIMESTAMP       
  ‚Ä¢ CLOSINGDATE                    TIMESTAMP       
  ‚Ä¢ JUR_DESK_ID                    INTEGER         
  ‚Ä¢ CREATIONDATE                   TIMESTAMP       
  ‚Ä¢ PRIORITY_ID                    INTEGER         
  ‚Ä¢ ROOTSR_ID                      REAL            
  ‚Ä¢ TYPE_ID                        INTEGER         
  ‚Ä¢ QUICK_FULFILLMENT_ID           REAL            
  ‚Ä¢ QUICK_ANSWER                   INTEGER         
  ‚Ä¢ DEMAND_DATE                    TIMESTAMP       
  ‚Ä¢ ACKNO

---
## 7. Échantillons de Données

### Aperçu des Premières Lignes

In [16]:
# Preview of the SR table: banner first, then five rows of selected columns
print(
    "="*70,
    "üìã √âCHANTILLON: TABLE SR (5 premi√®res lignes)",
    "="*70,
    "",
    sep="\n",
)

# No ORDER BY is given, so the five rows are whichever SQLite returns first
sample_sr = pd.read_sql_query("""
SELECT 
    ID,
    SRNUMBER,
    CATEGORY_ID,
    STATUS_ID,
    PRIORITY_ID,
    CREATIONDATE,
    CLOSINGDATE,
    JUR_DESK_ID
FROM sr 
LIMIT 5
""", conn)

display(sample_sr)

üìã √âCHANTILLON: TABLE SR (5 premi√®res lignes)



Unnamed: 0,ID,SRNUMBER,CATEGORY_ID,STATUS_ID,PRIORITY_ID,CREATIONDATE,CLOSINGDATE,JUR_DESK_ID
0,15441799,[PROXYHUB-CES-9419],158264,7,1719,2024-01-02 17:42:56.267000,2024-01-11 11:12:40.252000,71958
1,15468648,[PROXYHUB-VPR-9427],154238,7,1753,2024-01-04 10:18:49.287000,2024-01-09 09:43:16.011000,71958
2,15572526,[PROXYHUB-CES-9460],158236,7,1719,2024-01-11 18:17:15.359000,2024-02-05 17:02:47.877000,71958
3,15603967,[PROXYHUB-VDA-9474],462499,7,1719,2024-01-15 14:22:56.895000,2024-01-16 12:41:44.333000,71958
4,15631929,[PROXYHUB-VPR-9482],462499,7,1719,2024-01-17 08:31:20.368000,2024-01-18 09:15:05.764000,71958


In [17]:
# Preview of the ACTIVITY table: banner first, then five rows of selected columns
print(
    f"\n{'='*70}",
    "üéØ √âCHANTILLON: TABLE ACTIVITY (5 premi√®res lignes)",
    "="*70,
    "",
    sep="\n",
)

# No ORDER BY is given, so the five rows are whichever SQLite returns first
sample_activity = pd.read_sql_query("""
SELECT 
    ID,
    SR_ID,
    TYPE_ID,
    STATUS_ID,
    CREATIONDATE,
    CLOSINGDATE
FROM activity 
LIMIT 5
""", conn)

display(sample_activity)


üéØ √âCHANTILLON: TABLE ACTIVITY (5 premi√®res lignes)



Unnamed: 0,ID,SR_ID,TYPE_ID,STATUS_ID,CREATIONDATE,CLOSINGDATE
0,1910138,15619980,1579,31,2024-01-16 17:59:19.998000,2024-01-16 17:59:27.399000
1,1912489,15584731,1579,31,2024-01-17 17:15:52.764000,2024-01-17 17:20:35.362000
2,1921705,15702601,1579,31,2024-01-23 15:31:16.632000,2024-01-23 15:34:15.466000
3,1925377,15751583,1579,31,2024-01-25 11:01:03.873000,2024-01-31 08:18:24.108000
4,1925164,15748396,1579,31,2024-01-25 10:16:43.158000,2024-01-25 11:19:36.523000


In [18]:
# Close the database connection opened at the top of the notebook.
# Any cell run after this one must reconnect before issuing a query.
conn.close()
print("‚úÖ Connexion ferm√©e")

‚úÖ Connexion ferm√©e
