## 1. Configuration initiale

In [0]:
import os
from pyspark.sql.functions import year, month, to_date, col

# Section banner for the initial configuration step.
print("=" * 60, "üîß CONFIGURATION INITIALE", "=" * 60, sep="\n")

# Storage credentials are read from the environment; either may be absent.
storage_account_name = os.getenv("AZURE_STORAGE_ACCOUNT_NAME")
storage_account_key = os.getenv("AZURE_STORAGE_ACCOUNT_KEY")

if not (storage_account_name and storage_account_key):
    # Missing credentials are reported but do not stop the notebook.
    print("‚ö†Ô∏è  Variables d'environnement non configur√©es")
else:
    # Register the account key on the Spark session under the
    # account-specific dfs.core.windows.net setting.
    account_key_setting = f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net"
    spark.conf.set(account_key_setting, storage_account_key)
    print("‚úÖ Configuration Spark effectu√©e")

# Database name referenced by the following cells.
DATABASE_NAME = "eau_potable"

print(f"üì¶ Base de donn√©es : {DATABASE_NAME}", "=" * 60, sep="\n")

## 2. Vérification des fichiers Parquet dans BRONZE

In [0]:
# Lists /mnt/bronze, keeps the consolidated DIS_PLV / DIS_RESULT Parquet
# entries in `plv_files` / `result_files`, and prints one line per entry.
# Raises when neither family has any consolidated file.
print("\n" + "="*60)
print("üìÇ V√âRIFICATION DES FICHIERS PARQUET")
print("="*60)

try:
    # Single listing of the bronze mount; both families are filtered from it.
    bronze_files = dbutils.fs.ls("/mnt/bronze")

    # Keep only the consolidated Parquet entries of each family, sorted.
    plv_files = sorted(
        f for f in bronze_files
        if 'DIS_PLV' in f.name and 'consolidated.parquet' in f.name
    )
    result_files = sorted(
        f for f in bronze_files
        if 'DIS_RESULT' in f.name and 'consolidated.parquet' in f.name
    )

    # Same report for both families instead of two copy-pasted loops.
    for family, family_files in (("DIS_PLV", plv_files), ("DIS_RESULT", result_files)):
        print(f"\nüì¶ Fichiers {family} trouv√©s : {len(family_files)}")
        for entry in family_files:
            size_mb = entry.size / (1024 * 1024)
            # Third "_"-separated token of the file name. Stored in
            # `file_year` so the `year` function imported from
            # pyspark.sql.functions is not rebound to a string.
            file_year = entry.name.split('_')[2]
            print(f"   ‚úÖ {file_year} : {entry.name} ({size_mb:.2f} MB)")

    # Only fatal when BOTH families are empty; a single empty family is
    # left for the table-creation cells to report.
    if not plv_files and not result_files:
        raise FileNotFoundError("‚ùå Aucun fichier Parquet consolid√© trouv√© dans /mnt/bronze")

except Exception as e:
    # Show the message in the cell output, then let the failure propagate.
    print(f"‚ùå Erreur : {str(e)}")
    raise

print("\n" + "="*60)

## 3. Création de la base de données

In [0]:
# Section banner for the database creation step.
print("\n" + "=" * 60, "üóÑÔ∏è  CR√âATION DE LA BASE DE DONN√âES", "=" * 60, sep="\n")

# Create the database when it does not exist yet, then select it with USE;
# each statement is followed by its confirmation line.
for sql_statement, outcome in (
    (f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}", "cr√©√©e/v√©rifi√©e"),
    (f"USE {DATABASE_NAME}", "activ√©e"),
):
    spark.sql(sql_statement)
    print(f"‚úÖ Base de donn√©es '{DATABASE_NAME}' {outcome}")

# Display the databases returned by SHOW DATABASES, without truncation.
print("\nüìã Bases de donn√©es disponibles :")
databases = spark.sql("SHOW DATABASES")
databases.show(truncate=False)

print("=" * 60)

## 4. Création de la table DIS_PLV (toutes années)

In [0]:
# Builds the single table <DATABASE_NAME>.dis_plv from every file in
# `plv_files`: each file is read, tagged with an "annee" column taken from
# its name, the per-file DataFrames are merged, and the result overwrites
# the table. Any failure is printed and re-raised.
print("\n" + "="*60)
print("üìä CR√âATION DE LA TABLE DIS_PLV (TOUTES ANN√âES)")
print("="*60)

from pyspark.sql.functions import lit

try:
    all_plv_dfs = []
    plv_stats = []

    # Read every DIS_PLV file and tag its rows with the year from the name.
    for parquet_file in plv_files:
        # Third "_"-separated token of the file name. Stored in `file_year`
        # so the `year` function imported from pyspark.sql.functions is not
        # rebound to a string.
        file_year = parquet_file.name.split('_')[2]

        print(f"\nüîÑ Lecture : DIS_PLV {file_year}")
        print(f"   üìÇ Fichier : {parquet_file.name}")

        # "annee" holds the year as a string literal.
        df = spark.read.parquet(parquet_file.path).withColumn("annee", lit(file_year))

        row_count = df.count()
        print(f"   üìä Lignes : {row_count:,}")

        all_plv_dfs.append(df)
        plv_stats.append({'year': file_year, 'rows': row_count})

    if all_plv_dfs:
        print(f"\nüîó Fusion de {len(all_plv_dfs)} fichiers DIS_PLV...")

        # Merge by column NAME: a positional union would silently put values
        # in the wrong columns if two yearly files order their columns
        # differently. A mismatch in column names now fails loudly instead.
        df_plv_consolidated = all_plv_dfs[0]
        for df in all_plv_dfs[1:]:
            df_plv_consolidated = df_plv_consolidated.unionByName(df)

        # The union keeps every row, so the total is the sum of the per-file
        # counts already computed; this avoids a second full scan.
        total_plv_rows = sum(stat['rows'] for stat in plv_stats)
        print(f"‚úÖ Total de lignes apr√®s fusion : {total_plv_rows:,}")

        # Write the merged data as one table, replacing any previous version.
        table_name = "dis_plv"
        print(f"\nüíæ Cr√©ation de la table '{table_name}'...")

        df_plv_consolidated.write.mode("overwrite").saveAsTable(f"{DATABASE_NAME}.{table_name}")

        print(f"‚úÖ Table '{DATABASE_NAME}.{table_name}' cr√©√©e avec succ√®s")
        print(f"   üìä Lignes totales : {total_plv_rows:,}")
        print(f"   üìä Colonnes : {len(df_plv_consolidated.columns)}")
        print(f"   üìÖ Ann√©es incluses : {', '.join([s['year'] for s in plv_stats])}")

    else:
        print("‚ö†Ô∏è  Aucun fichier DIS_PLV √† traiter")

except Exception as e:
    # Show the message in the cell output, then let the failure propagate.
    print(f"‚ùå Erreur : {str(e)}")
    raise

print("\n" + "="*60)

## 5. Création de la table DIS_RESULT (toutes années)


In [0]:
# Builds the single table <DATABASE_NAME>.dis_result from every file in
# `result_files`: each file is read, tagged with an "annee" column taken
# from its name, the per-file DataFrames are merged, and the result
# overwrites the table. Any failure is printed and re-raised.
print("\n" + "="*60)
print("üìä CR√âATION DE LA TABLE DIS_RESULT (TOUTES ANN√âES)")
print("="*60)

try:
    all_result_dfs = []
    result_stats = []

    # Read every DIS_RESULT file and tag its rows with the year from the name.
    for parquet_file in result_files:
        # Third "_"-separated token of the file name. Stored in `file_year`
        # so the `year` function imported from pyspark.sql.functions is not
        # rebound to a string.
        file_year = parquet_file.name.split('_')[2]

        print(f"\nüîÑ Lecture : DIS_RESULT {file_year}")
        print(f"   üìÇ Fichier : {parquet_file.name}")

        # "annee" holds the year as a string literal.
        df = spark.read.parquet(parquet_file.path).withColumn("annee", lit(file_year))

        row_count = df.count()
        print(f"   üìä Lignes : {row_count:,}")

        all_result_dfs.append(df)
        result_stats.append({'year': file_year, 'rows': row_count})

    if all_result_dfs:
        print(f"\nüîó Fusion de {len(all_result_dfs)} fichiers DIS_RESULT...")

        # Merge by column NAME: a positional union would silently put values
        # in the wrong columns if two yearly files order their columns
        # differently. A mismatch in column names now fails loudly instead.
        df_result_consolidated = all_result_dfs[0]
        for df in all_result_dfs[1:]:
            df_result_consolidated = df_result_consolidated.unionByName(df)

        # The union keeps every row, so the total is the sum of the per-file
        # counts already computed; this avoids a second full scan.
        total_result_rows = sum(stat['rows'] for stat in result_stats)
        print(f"‚úÖ Total de lignes apr√®s fusion : {total_result_rows:,}")

        # Write the merged data as one table, replacing any previous version.
        table_name = "dis_result"
        print(f"\nüíæ Cr√©ation de la table '{table_name}'...")

        df_result_consolidated.write.mode("overwrite").saveAsTable(f"{DATABASE_NAME}.{table_name}")

        print(f"‚úÖ Table '{DATABASE_NAME}.{table_name}' cr√©√©e avec succ√®s")
        print(f"   üìä Lignes totales : {total_result_rows:,}")
        print(f"   üìä Colonnes : {len(df_result_consolidated.columns)}")
        print(f"   üìÖ Ann√©es incluses : {', '.join([s['year'] for s in result_stats])}")

    else:
        print("‚ö†Ô∏è  Aucun fichier DIS_RESULT √† traiter")

except Exception as e:
    # Show the message in the cell output, then let the failure propagate.
    print(f"‚ùå Erreur : {str(e)}")
    raise

print("\n" + "="*60)

In [0]:
# Disabled cleanup cell, kept commented out. If re-enabled, the code below
# would drop the per-year tables (dis_plv_2021..2025 and
# dis_result_2021..2025) from eau_potable, printing a status line per table,
# and then list the tables that remain in that database.
# print("üóëÔ∏è  Suppression des tables annuelles...\n")

# tables_to_drop = [
#     'dis_plv_2021', 'dis_plv_2022', 'dis_plv_2023', 'dis_plv_2024', 'dis_plv_2025',
#     'dis_result_2021', 'dis_result_2022', 'dis_result_2023', 'dis_result_2024', 'dis_result_2025'
# ]

# for table in tables_to_drop:
#     try:
#         spark.sql(f"DROP TABLE IF EXISTS eau_potable.{table}")
#         print(f"   ‚úÖ Table '{table}' supprim√©e")
#     except Exception as e:
#         print(f"   ‚ùå Erreur: {e}")

# print("\n" + "="*60)
# print("üìã Tables restantes:")
# print("="*60)

# result = spark.sql("SHOW TABLES IN eau_potable").collect()
# print(f"\n   Total: {len(result)} tables\n")
# for row in result:
#     print(f"   ‚úÖ {row.tableName}")

📌 CELLULE 6 : Vérification des tables créées

In [0]:
# Section banner for the table verification step.
print("\n" + "=" * 60, "üîç V√âRIFICATION DES TABLES CR√â√âES", "=" * 60, sep="\n")

# Collect the SHOW TABLES result for the database as a list of rows.
tables = spark.sql(f"SHOW TABLES IN {DATABASE_NAME}").collect()

print(
    f"\nüìã Tables dans '{DATABASE_NAME}' :",
    f"   Total : {len(tables)} tables\n",
    sep="\n",
)

# One line per table name.
for table_row in tables:
    print(f"   ‚úÖ {table_row.tableName}")

print("\n" + "=" * 60)

📌 CELLULE 7 : Statistiques détaillées des tables

In [0]:
# Section banner for the table statistics step.
print("\n" + "=" * 60, "üìä STATISTIQUES DES TABLES", "=" * 60, sep="\n")

# --- dis_plv: row count, column count, rows per "annee" ---
print("\nüîµ Table DIS_PLV :", "-" * 60, sep="\n")

try:
    df_plv = spark.table(f"{DATABASE_NAME}.dis_plv")
    total_plv = df_plv.count()
    nb_colonnes_plv = len(df_plv.columns)

    print(
        f"   üìä Lignes totales : {total_plv:,}",
        f"   üìä Colonnes : {nb_colonnes_plv}",
        sep="\n",
    )

    # Rows per year, ordered by year.
    print("\n   üìÖ R√©partition par ann√©e :")
    plv_par_annee = df_plv.groupBy("annee").count()
    plv_par_annee.orderBy("annee").show()

except Exception as err:
    # Best-effort display: report the problem and move on to the next table.
    print(f"   ‚ùå Erreur : {err}")

# --- dis_result: row count, column count, rows per "annee" ---
print("\nüü¢ Table DIS_RESULT :", "-" * 60, sep="\n")

try:
    df_result = spark.table(f"{DATABASE_NAME}.dis_result")
    total_result = df_result.count()
    nb_colonnes_result = len(df_result.columns)

    print(
        f"   üìä Lignes totales : {total_result:,}",
        f"   üìä Colonnes : {nb_colonnes_result}",
        sep="\n",
    )

    # Rows per year, ordered by year.
    print("\n   üìÖ R√©partition par ann√©e :")
    result_par_annee = df_result.groupBy("annee").count()
    result_par_annee.orderBy("annee").show()

except Exception as err:
    # Best-effort display: report the problem without stopping the cell.
    print(f"   ‚ùå Erreur : {err}")

print("\n" + "=" * 60)

📌 CELLULE 8 : Schéma des tables

In [0]:
# Section banner for the schema display step.
print("\n" + "=" * 60, "üìã SCH√âMA DES TABLES", "=" * 60, sep="\n")

# --- dis_plv: schema tree followed by a numbered column list ---
print("\nüîµ Sch√©ma de 'dis_plv' :", "-" * 60, sep="\n")

try:
    df_plv = spark.table(f"{DATABASE_NAME}.dis_plv")
    df_plv.printSchema()

    plv_columns = df_plv.columns
    print(f"\nüìã Colonnes ({len(plv_columns)}) :")
    # Numbering starts at 1 and is right-aligned on two characters.
    for rank, column_label in enumerate(plv_columns, start=1):
        print(f"   {rank:2d}. {column_label}")
except Exception as err:
    # Best-effort display: report the problem and move on to the next table.
    print(f"‚ùå Erreur : {err}")

# --- dis_result: schema tree followed by a numbered column list ---
print("\nüü¢ Sch√©ma de 'dis_result' :", "-" * 60, sep="\n")

try:
    df_result = spark.table(f"{DATABASE_NAME}.dis_result")
    df_result.printSchema()

    result_columns = df_result.columns
    print(f"\nüìã Colonnes ({len(result_columns)}) :")
    # Numbering starts at 1 and is right-aligned on two characters.
    for rank, column_label in enumerate(result_columns, start=1):
        print(f"   {rank:2d}. {column_label}")
except Exception as err:
    # Best-effort display: report the problem without stopping the cell.
    print(f"‚ùå Erreur : {err}")

print("\n" + "=" * 60)

📌 CELLULE 9 : Aperçu des données


In [0]:
# Section banner for the data preview step.
print("\n" + "=" * 60, "üëÄ APER√áU DES DONN√âES", "=" * 60, sep="\n")

# Same preview for both tables: a titled section, then the first 5 rows
# with cell values truncated to 50 characters.
for previewed_table in ("dis_plv", "dis_result"):
    print(
        f"\nüìÑ Aper√ßu de '{previewed_table}' (5 premi√®res lignes) :",
        "-" * 80,
        sep="\n",
    )
    try:
        preview = spark.sql(f"SELECT * FROM {DATABASE_NAME}.{previewed_table} LIMIT 5")
        preview.show(truncate=50, vertical=False)
    except Exception as err:
        # Best-effort display: report the problem and continue with the next table.
        print(f"‚ùå Erreur : {err}")

print("=" * 60)

📌 CELLULE 10 : Exemples de requêtes SQL


In [0]:
# Runs seven example queries against dis_plv / dis_result and displays the
# results. Examples 1-4 target one sample referenceprel; examples 5-7 are
# table-wide. A failing example prints its error and the next ones still run.
print("\n" + "="*60)
print("üí° EXEMPLES DE REQU√äTES SQL")
print("="*60)

# Sample reference used by examples 1-4, in both titles and WHERE clauses.
# Defined once so the four examples cannot drift apart.
REFERENCE_PREL = "00100143925"


def run_sql_example(title, query_template, **show_options):
    """Print a titled section, run one example query and display its result.

    Args:
        title: Heading printed before the 80-dash rule.
        query_template: SQL text with optional ``{database}`` and
            ``{reference}`` placeholders, filled from ``DATABASE_NAME``
            and ``REFERENCE_PREL``.
        **show_options: Keyword arguments forwarded to ``DataFrame.show``.

    Returns:
        None. Any exception raised while building or running the query is
        printed rather than propagated, so later examples still run.
    """
    print(title)
    print("-" * 80)
    try:
        # Placeholders are resolved inside the try so that a missing
        # DATABASE_NAME is reported like any other failure.
        query = query_template.format(database=DATABASE_NAME, reference=REFERENCE_PREL)
        spark.sql(query).show(**show_options)
    except Exception as e:
        print(f"‚ùå Erreur : {str(e)}")


# Example 1: PLV row(s) for the sample reference.
run_sql_example(
    f"\nüîµ Exemple 1 : Donn√©es PLV pour referenceprel = {REFERENCE_PREL}",
    """
        SELECT 
            referenceprel,
            cddept,
            nomcommuneprinc,
            dateprel,
            heureprel,
            conclusionprel,
            plvconformitebacterio,
            plvconformitechimique,
            ugelib,
            distrlib
        FROM {database}.dis_plv
        WHERE referenceprel = '{reference}'
    """,
    truncate=False,
    vertical=True,
)

# Example 2: analysis results for the same reference.
run_sql_example(
    f"\nüü¢ Exemple 2 : R√©sultats d'analyses pour referenceprel = {REFERENCE_PREL}",
    """
        SELECT 
            referenceprel,
            cdparametre,
            libmajparametre,
            valtraduite,
            cdunitereference,
            limitequal,
            refqual
        FROM {database}.dis_result
        WHERE referenceprel = '{reference}'
        ORDER BY libmajparametre
    """,
    truncate=False,
)

# Example 3: PLV joined to RESULT on (referenceprel, cddept) for the reference.
run_sql_example(
    f"\nüîó Exemple 3 : Jointure PLV + RESULT pour referenceprel = {REFERENCE_PREL}",
    """
        SELECT 
            p.referenceprel,
            p.nomcommuneprinc,
            p.dateprel,
            p.heureprel,
            p.plvconformitebacterio,
            p.plvconformitechimique,
            r.libmajparametre,
            r.valtraduite,
            r.cdunitereference,
            r.limitequal,
            r.refqual
        FROM {database}.dis_plv p
        INNER JOIN {database}.dis_result r
            ON p.referenceprel = r.referenceprel
            AND p.cddept = r.cddept
        WHERE p.referenceprel = '{reference}'
        ORDER BY r.libmajparametre
    """,
    truncate=False,
)

# Example 4: number of RESULT rows (cdparametre) attached to the reference.
run_sql_example(
    f"\nüìä Exemple 4 : Statistiques pour referenceprel = {REFERENCE_PREL}",
    """
        SELECT 
            p.referenceprel,
            p.nomcommuneprinc,
            p.dateprel,
            COUNT(r.cdparametre) as nb_parametres_analyses,
            p.plvconformitebacterio,
            p.plvconformitechimique
        FROM {database}.dis_plv p
        LEFT JOIN {database}.dis_result r
            ON p.referenceprel = r.referenceprel
            AND p.cddept = r.cddept
        WHERE p.referenceprel = '{reference}'
        GROUP BY 
            p.referenceprel,
            p.nomcommuneprinc,
            p.dateprel,
            p.plvconformitebacterio,
            p.plvconformitechimique
    """,
    truncate=False,
    vertical=True,
)

# Example 5: the 10 cddept values with the most PLV rows.
run_sql_example(
    "\nüîµ Exemple 5 : Nombre de pr√©l√®vements par d√©partement (TOP 10)",
    """
        SELECT 
            cddept,
            COUNT(*) as nb_prelevements
        FROM {database}.dis_plv
        GROUP BY cddept
        ORDER BY nb_prelevements DESC
        LIMIT 10
    """,
)

# Example 6: the 10 libmajparametre values with the most RESULT rows.
run_sql_example(
    "\nüü¢ Exemple 6 : Top 10 param√®tres les plus analys√©s",
    """
        SELECT 
            libmajparametre,
            COUNT(*) as nb_analyses
        FROM {database}.dis_result
        GROUP BY libmajparametre
        ORDER BY nb_analyses DESC
        LIMIT 10
    """,
    truncate=50,
)

# Example 7: 10-row sample of the PLV + RESULT join.
run_sql_example(
    "\nüîó Exemple 7 : Jointure PLV + RESULT (√©chantillon de 10 lignes)",
    """
        SELECT 
            p.referenceprel,
            p.nomcommuneprinc,
            p.dateprel,
            r.libmajparametre,
            r.valtraduite,
            r.cdunitereference
        FROM {database}.dis_plv p
        INNER JOIN {database}.dis_result r
            ON p.referenceprel = r.referenceprel 
            AND p.cddept = r.cddept
        LIMIT 10
    """,
    truncate=40,
)

print("="*60)
print("‚úÖ Exemples de requ√™tes SQL termin√©s")
print("="*60)