# ## 1. Configuration et connexion au Storage Azure


In [0]:

import os
import zipfile
import io
from datetime import datetime
import re
from pyspark.sql.functions import lit, col, count, sum as spark_sum
from pyspark.sql.types import StructType, StructField, StringType

# Banner for the configuration step
print("=" * 60)
print("üîß CONFIGURATION DATABRICKS ‚Üí AZURE STORAGE")
print("=" * 60)

# Read the storage account name and key from the environment variables
# (per the original comment, these are configured on the cluster)
storage_account_name, storage_account_key = (
    os.environ.get(variable)
    for variable in ("AZURE_STORAGE_ACCOUNT_NAME", "AZURE_STORAGE_ACCOUNT_KEY")
)

# Stop immediately when either value is missing or empty
if not (storage_account_name and storage_account_key):
    raise ValueError("‚ùå Les variables d'environnement ne sont pas configur√©es dans le cluster")

print("‚úÖ Variables d'environnement r√©cup√©r√©es avec succ√®s")
print("üì¶ Compte de stockage : {}".format(storage_account_name))

# Register the account key on the Spark session for the dfs endpoint
account_key_setting = f"fs.azure.account.key.{storage_account_name}.dfs.core.windows.net"
spark.conf.set(account_key_setting, storage_account_key)

print("‚úÖ Configuration Spark effectu√©e")

# ## 2. Montage des conteneurs RAW et BRONZE


In [0]:

# %%
def mount_container(container_name, mount_point):
    """
    Mount a storage container into the Databricks file system.

    The WASBS driver (blob endpoint) is tried first. If anything in that
    attempt raises, the ABFSS driver (dfs endpoint) is tried. A mount already
    present at ``mount_point`` is unmounted first, so a call always remounts.

    Relies on the notebook globals ``dbutils``, ``storage_account_name`` and
    ``storage_account_key``.

    Args:
        container_name (str): Name of the container (raw, bronze, etc.)
        mount_point (str): Mount point in DBFS (e.g. /mnt/raw)

    Returns:
        bool: True when one of the two attempts mounted the container,
        False when both failed.
    """
    def _mount(scheme, endpoint):
        # Both drivers take the same arguments; only the URI scheme and the
        # storage endpoint ("blob" or "dfs") differ.
        host = f"{storage_account_name}.{endpoint}.core.windows.net"
        dbutils.fs.mount(
            source=f"{scheme}://{container_name}@{host}/",
            mount_point=mount_point,
            extra_configs={f"fs.azure.account.key.{host}": storage_account_key},
        )

    # Method 1: WASBS (Blob Storage without Hierarchical Namespace)
    try:
        # Unmount first when the mount point is already in use
        if any(mount.mountPoint == mount_point for mount in dbutils.fs.mounts()):
            print(f"‚ö†Ô∏è  {mount_point} est d√©j√† mont√©. D√©montage...")
            dbutils.fs.unmount(mount_point)

        print(f"üîÑ Tentative de montage avec WASBS pour '{container_name}'...")
        _mount("wasbs", "blob")
        print(f"‚úÖ Conteneur '{container_name}' mont√© sur '{mount_point}' (WASBS)")
        return True

    except Exception as e:
        print(f"‚ö†Ô∏è  √âchec WASBS : {str(e)}")

    # Method 2: ABFSS (ADLS Gen2 with Hierarchical Namespace)
    try:
        print(f"üîÑ Tentative de montage avec ABFSS pour '{container_name}'...")

        # Best-effort cleanup in case the first attempt partially succeeded.
        # Only ordinary errors are ignored, so an interrupt still stops the
        # notebook.
        try:
            dbutils.fs.unmount(mount_point)
        except Exception:
            pass

        _mount("abfss", "dfs")
        print(f"‚úÖ Conteneur '{container_name}' mont√© sur '{mount_point}' (ABFSS)")
        return True

    except Exception as e2:
        print(f"‚ùå √âchec ABFSS : {str(e2)}")
        print(f"‚ùå Impossible de monter '{container_name}'")
        return False

# Mount the two containers used by the pipeline
print("\n" + "=" * 60)
print("üîó MONTAGE DES CONTENEURS")
print("=" * 60)

for position, container in enumerate(("raw", "bronze")):
    if position:
        print()  # blank line between the two mount reports
    mount_container(container, f"/mnt/{container}")

print("=" * 60)
################################################################################################################################################
################################################################################################################################################
# %% [markdown]
# ## 3. Vérification des connexions
################################################################################################################################################

# %%
print("\n" + "=" * 60)
print("üìÇ V√âRIFICATION DES MONTAGES")
print("=" * 60)

# Show every mount point next to the source it points to
for mount_entry in dbutils.fs.mounts():
    print(f"üìç {mount_entry.mountPoint} -> {mount_entry.source}")

# %%
print("\n" + "=" * 60)
print("üìÇ CONTENU DU CONTENEUR RAW")
print("=" * 60)

try:
    files = dbutils.fs.ls("/mnt/raw")

    if not files:
        print("üì≠ Le conteneur RAW est vide")

    # One numbered entry per file; the loop does nothing for an empty listing
    for idx, file in enumerate(files, start=1):
        size_mb = file.size / (1024 * 1024)
        print(
            f"\n{idx}. {file.name}",
            f"   üìè Taille : {size_mb:.2f} MB",
            f"   üìç Chemin : {file.path}",
            sep="\n",
        )

except Exception as e:
    print(f"‚ùå Erreur : {e!s}")

# %%
print("\n" + "=" * 60)
print("üìÇ CONTENU DU CONTENEUR BRONZE")
print("=" * 60)

try:
    files = dbutils.fs.ls("/mnt/bronze")

    if not files:
        print("üì≠ Le conteneur BRONZE est vide")

    # Same listing format as for RAW
    for idx, file in enumerate(files, start=1):
        size_mb = file.size / (1024 * 1024)
        print(
            f"\n{idx}. {file.name}",
            f"   üìè Taille : {size_mb:.2f} MB",
            f"   üìç Chemin : {file.path}",
            sep="\n",
        )

except Exception as e:
    print(f"‚ùå Erreur : {e!s}")

## 4. Extraction des fichiers ZIP (DIS_PLV + DIS_RESULT)


In [0]:
# %% [markdown]
# ## 4. Extraction des fichiers ZIP (DIS_PLV + DIS_RESULT)

# %%
# Section banner for the RAW -> BRONZE extraction stage
print(
    "\n" + "=" * 60,
    "üöÄ PIPELINE D'EXTRACTION RAW ‚Üí BRONZE",
    "=" * 60,
    sep="\n",
)

# %% [markdown]
# ### 4.1. Fonction d'extraction DIS_PLV + DIS_RESULT

# %%
def extract_dis_files_from_zip(zip_path, destination_container="/mnt/bronze"):
    """
    Extract the DIS_PLV_*.txt and DIS_RESULT_*.txt members of a ZIP archive and
    write them to the destination container.

    The archive is read through its local ``/dbfs`` path and loaded fully in
    memory. A member is selected only when its whole name in the archive is
    ``DIS_PLV_<4 digits>_<3 digits>.txt`` or ``DIS_RESULT_<4 digits>_<3 digits>.txt``;
    every other member is counted as skipped. Selected members are decoded as
    UTF-8 (Latin-1 as fallback) and written with ``dbutils.fs.put``,
    overwriting any existing file of the same name.

    No exception propagates: failures are printed and recorded in
    ``stats['errors']``.

    Args:
        zip_path (str): Path of the ZIP file in RAW (a ``dbfs:`` prefix is removed)
        destination_container (str): Path of the destination container

    Returns:
        dict: Processing statistics with the keys ``zip_name``, ``total_files``,
        ``dis_plv_files``, ``dis_result_files``, ``extracted_plv``,
        ``extracted_result``, ``skipped_files`` and ``errors``.
    """
    plv_pattern = re.compile(r'DIS_PLV_\d{4}_\d{3}\.txt$')
    result_pattern = re.compile(r'DIS_RESULT_\d{4}_\d{3}\.txt$')

    stats = {
        'zip_name': zip_path.split('/')[-1],
        'total_files': 0,
        'dis_plv_files': 0,
        'dis_result_files': 0,
        'extracted_plv': [],
        'extracted_result': [],
        'skipped_files': 0,
        'errors': []
    }

    def _extract_members(zip_ref, member_names, label, extracted):
        """Write each listed member to the destination; successes are appended to ``extracted``."""
        print(f"\nüîÑ Extraction des fichiers {label}...")
        for idx, file_name in enumerate(member_names, 1):
            try:
                print(f"   [{idx}/{len(member_names)}] {file_name}...", end=" ")

                file_content = zip_ref.read(file_name)
                destination_path = f"{destination_container}/{file_name}"

                # Decode to text before writing: UTF-8 first, Latin-1 as fallback
                try:
                    text = file_content.decode('utf-8')
                except UnicodeDecodeError:
                    text = file_content.decode('latin-1')
                dbutils.fs.put(destination_path, text, overwrite=True)

                extracted.append(file_name)
                file_size_kb = len(file_content) / 1024
                print(f"‚úÖ ({file_size_kb:.1f} KB)")

            except Exception as e:
                # A failing member is recorded and the loop moves on to the next one
                stats['errors'].append(f"Erreur sur {file_name}: {str(e)}")
                print(f"‚ùå {str(e)}")

    try:
        print(f"\n{'='*60}")
        print(f"üì¶ Traitement : {stats['zip_name']}")
        print(f"{'='*60}")

        # Turn the DBFS URI into the local /dbfs path
        clean_path = zip_path.replace('dbfs:', '')
        full_path = f"/dbfs{clean_path}"

        print(f"üîç Chemin utilis√© : {full_path}")

        # Read the whole ZIP file from DBFS
        with open(full_path, "rb") as f:
            zip_data = f.read()

        file_size_mb = len(zip_data) / (1024 * 1024)
        print(f"üì¶ Taille du ZIP : {file_size_mb:.2f} MB")

        # Open the ZIP in memory
        with zipfile.ZipFile(io.BytesIO(zip_data)) as zip_ref:
            file_list = zip_ref.namelist()
            stats['total_files'] = len(file_list)

            print(f"üìÇ Nombre total de fichiers dans le ZIP : {stats['total_files']}")

            # Select the DIS_PLV and DIS_RESULT members
            dis_plv_files = [name for name in file_list if plv_pattern.match(name)]
            dis_result_files = [name for name in file_list if result_pattern.match(name)]

            stats['dis_plv_files'] = len(dis_plv_files)
            stats['dis_result_files'] = len(dis_result_files)

            # Computed before the early return below, so an archive with no
            # matching member still reports all its members as skipped.
            stats['skipped_files'] = stats['total_files'] - stats['dis_plv_files'] - stats['dis_result_files']

            print(f"‚úÖ Fichiers DIS_PLV trouv√©s : {stats['dis_plv_files']}")
            print(f"‚úÖ Fichiers DIS_RESULT trouv√©s : {stats['dis_result_files']}")

            # Report the other kinds of members (for information only)
            other_files = [name for name in file_list
                           if not plv_pattern.match(name) and not result_pattern.match(name)]
            dis_com_count = sum(1 for name in other_files if 'DIS_COM' in name)
            other_count = len(other_files) - dis_com_count

            print(f"‚è≠Ô∏è  Fichiers ignor√©s :")
            print(f"   - DIS_COM : {dis_com_count}")
            print(f"   - Autres : {other_count}")

            if not dis_plv_files and not dis_result_files:
                print("‚ö†Ô∏è  Aucun fichier DIS_PLV ou DIS_RESULT trouv√© dans ce ZIP")
                return stats

            if dis_plv_files:
                _extract_members(zip_ref, dis_plv_files, 'DIS_PLV', stats['extracted_plv'])

            if dis_result_files:
                _extract_members(zip_ref, dis_result_files, 'DIS_RESULT', stats['extracted_result'])

            print(f"\n‚úÖ Extraction termin√©e :")
            print(f"   - DIS_PLV : {len(stats['extracted_plv'])}/{stats['dis_plv_files']}")
            print(f"   - DIS_RESULT : {len(stats['extracted_result'])}/{stats['dis_result_files']}")

    except Exception as e:
        # Archive-level failure (unreadable file, corrupt ZIP, ...)
        error_msg = f"Erreur g√©n√©rale sur {stats['zip_name']}: {str(e)}"
        stats['errors'].append(error_msg)
        print(f"‚ùå {error_msg}")

    return stats

# %% [markdown]
# ### 4.2. Traitement de tous les fichiers ZIP

# %%
print("\n" + "=" * 60)
print("üîç RECHERCHE DES FICHIERS ZIP DANS RAW")
print("=" * 60)

try:
    raw_files = dbutils.fs.ls("/mnt/raw")
    # Keep only the entries whose name ends with .zip
    zip_files = [entry for entry in raw_files if entry.name.endswith('.zip')]

    print(f"üì¶ Nombre de fichiers ZIP trouv√©s : {len(zip_files)}")

    if not zip_files:
        print("‚ö†Ô∏è  Aucun fichier ZIP trouv√© dans RAW")

    # Numbered list of the archives; nothing is printed when there are none
    for idx, file in enumerate(zip_files, start=1):
        size_mb = file.size / (1024 * 1024)
        print(f"   {idx}. {file.name} ({size_mb:.2f} MB)")

except Exception as e:
    # On any failure the extraction step below receives an empty list
    print(f"‚ùå Erreur : {e!s}")
    zip_files = []

# %%
print("\n" + "=" * 60)
print("‚öôÔ∏è  D√âMARRAGE DE L'EXTRACTION")
print("=" * 60)

# One statistics dict per archive, in listing order
all_stats = []
for zip_file in zip_files:
    all_stats.append(extract_dis_files_from_zip(zip_file.path))

print("\n" + "=" * 60)
print("‚úÖ EXTRACTION TERMIN√âE")
print("=" * 60)

# %% [markdown]
# ### 4.3. Rapport détaillé d'extraction

# %%
print("\n" + "=" * 60)
print("üìä RAPPORT D√âTAILL√â D'EXTRACTION")
print("=" * 60)

# Totals over the per-archive statistics gathered in all_stats
total_zips = len(all_stats)
total_files_in_zips = sum(entry['total_files'] for entry in all_stats)
total_dis_plv = sum(entry['dis_plv_files'] for entry in all_stats)
total_dis_result = sum(entry['dis_result_files'] for entry in all_stats)
total_extracted_plv = sum(map(len, (entry['extracted_plv'] for entry in all_stats)))
total_extracted_result = sum(map(len, (entry['extracted_result'] for entry in all_stats)))
total_skipped = sum(entry['skipped_files'] for entry in all_stats)
total_errors = sum(map(len, (entry['errors'] for entry in all_stats)))

print(
    f"\nüì¶ Fichiers ZIP trait√©s : {total_zips}",
    f"üìÇ Total de fichiers dans les ZIP : {total_files_in_zips}",
    sep="\n",
)
print(
    "\n‚úÖ Fichiers identifi√©s :",
    f"   - DIS_PLV : {total_dis_plv}",
    f"   - DIS_RESULT : {total_dis_result}",
    sep="\n",
)
print(
    "\nüíæ Fichiers extraits dans BRONZE :",
    f"   - DIS_PLV : {total_extracted_plv}",
    f"   - DIS_RESULT : {total_extracted_result}",
    f"   - TOTAL : {total_extracted_plv + total_extracted_result}",
    sep="\n",
)
print(f"\n‚è≠Ô∏è  Fichiers ignor√©s (DIS_COM, etc.) : {total_skipped}")

if total_errors == 0:
    print("‚úÖ Aucune erreur")
else:
    print(f"‚ùå Erreurs rencontr√©es : {total_errors}")

# Per-archive detail
print("\n" + "=" * 60)
print("üìã D√âTAIL PAR FICHIER ZIP")
print("=" * 60)

for stats in all_stats:
    print(
        f"\nüì¶ {stats['zip_name']}",
        f"   üìÇ Total fichiers : {stats['total_files']}",
        f"   ‚úÖ DIS_PLV trouv√©s : {stats['dis_plv_files']} (extraits : {len(stats['extracted_plv'])})",
        f"   ‚úÖ DIS_RESULT trouv√©s : {stats['dis_result_files']} (extraits : {len(stats['extracted_result'])})",
        f"   ‚è≠Ô∏è  Ignor√©s : {stats['skipped_files']}",
        sep="\n",
    )

    zip_errors = stats['errors']
    if zip_errors:
        print(f"   ‚ùå Erreurs : {len(zip_errors)}")
        # Show at most the first three errors
        for error in zip_errors[:3]:
            print(f"      - {error}")

# %% [markdown]
# ### 4.4. Vérification du conteneur BRONZE

# %%
print("\n" + "=" * 60)
print("üîç √âTAT DU CONTENEUR BRONZE APR√àS EXTRACTION")
print("=" * 60)

try:
    bronze_files = dbutils.fs.ls("/mnt/bronze")

    # Split by type; entries whose name contains 'consolidated' are excluded
    # from the two DIS lists
    plv_files = [
        entry for entry in bronze_files
        if 'DIS_PLV' in entry.name and 'consolidated' not in entry.name
    ]
    result_files = [
        entry for entry in bronze_files
        if 'DIS_RESULT' in entry.name and 'consolidated' not in entry.name
    ]
    other_files = [
        entry for entry in bronze_files
        if not ('DIS_PLV' in entry.name or 'DIS_RESULT' in entry.name)
    ]

    print(
        "üìÇ Contenu de BRONZE :",
        f"   - Fichiers DIS_PLV : {len(plv_files)}",
        f"   - Fichiers DIS_RESULT : {len(result_files)}",
        f"   - Autres fichiers : {len(other_files)}",
        f"   - TOTAL : {len(bronze_files)}",
        sep="\n",
    )

    # Total size of everything listed, in MB
    total_size = sum(entry.size for entry in bronze_files) / (1024 * 1024)
    print(f"\nüíæ Taille totale : {total_size:.2f} MB")

except Exception as e:
    print(f"‚ùå Erreur : {e!s}")

# %% [markdown]
# ### 4.5. Aperçu des fichiers extraits

# %%
print("\n" + "=" * 60)
print("üëÄ APER√áU DES FICHIERS EXTRAITS")
print("=" * 60)

try:
    bronze_files = dbutils.fs.ls("/mnt/bronze")

    # DIS_PLV preview: first matching file, first five lines
    plv_files = [entry for entry in bronze_files if 'DIS_PLV' in entry.name and '.txt' in entry.name]
    if plv_files:
        sample_plv = plv_files[0]
        print(f"üìÑ Exemple DIS_PLV : {sample_plv.name}")
        print(f"üìè Taille : {sample_plv.size / 1024:.2f} KB")

        # Local /dbfs path of the sample file
        clean_path = sample_plv.path.replace('dbfs:', '')
        full_path = "/dbfs" + clean_path

        lines = []
        with open(full_path, "r", encoding="utf-8") as handle:
            for _ in range(5):
                lines.append(handle.readline())

        print("\nüìù Premi√®res lignes DIS_PLV :")
        print("-" * 80)
        for number, line in enumerate(lines, start=1):
            # Each line is trimmed and cut to 100 characters
            print(f"   {number}. {line.strip()[:100]}...")

    # DIS_RESULT preview: same treatment
    result_files = [entry for entry in bronze_files if 'DIS_RESULT' in entry.name and '.txt' in entry.name]
    if result_files:
        sample_result = result_files[0]
        print(f"\nüìÑ Exemple DIS_RESULT : {sample_result.name}")
        print(f"üìè Taille : {sample_result.size / 1024:.2f} KB")

        clean_path = sample_result.path.replace('dbfs:', '')
        full_path = "/dbfs" + clean_path

        lines = []
        with open(full_path, "r", encoding="utf-8") as handle:
            for _ in range(5):
                lines.append(handle.readline())

        print("\nüìù Premi√®res lignes DIS_RESULT :")
        print("-" * 80)
        for number, line in enumerate(lines, start=1):
            print(f"   {number}. {line.strip()[:100]}...")

except Exception as e:
    print(f"‚ùå Erreur : {e!s}")

## 5. Consolidation des fichiers par année et par type

In [0]:
# %% [markdown]
# ## 5. Consolidation en UN SEUL fichier Parquet par année et par type

# %%
# Section banner for the Parquet consolidation stage
print(
    "\n" + "=" * 60,
    "üîÑ CONSOLIDATION EN FICHIER PARQUET UNIQUE",
    "=" * 60,
    sep="\n",
)

# %% [markdown]
# ### 5.1. Analyse des fichiers dans BRONZE

# %%
print("\n" + "="*60)
print("üìÇ ANALYSE DES FICHIERS DANS BRONZE")
print("="*60)

try:
    bronze_files = dbutils.fs.ls("/mnt/bronze")

    # Group the extracted .txt files by year, separately for each file type.
    # The year is the four-digit group captured from the file name.
    plv_by_year = {}
    result_by_year = {}

    for file in bronze_files:
        # DIS_PLV
        match_plv = re.search(r'DIS_PLV_(\d{4})_\d{3}\.txt', file.name)
        if match_plv:
            plv_by_year.setdefault(match_plv.group(1), []).append(file)

        # DIS_RESULT
        match_result = re.search(r'DIS_RESULT_(\d{4})_\d{3}\.txt', file.name)
        if match_result:
            result_by_year.setdefault(match_result.group(1), []).append(file)

    # The per-year counter is named file_count, not count: this code runs at
    # module level, and assigning to `count` would overwrite the `count`
    # function imported from pyspark.sql.functions.
    print(f"üìä R√©partition DIS_PLV par ann√©e :")
    total_plv = 0
    size_plv = 0
    for year in sorted(plv_by_year.keys()):
        file_count = len(plv_by_year[year])
        year_size_mb = sum(f.size for f in plv_by_year[year]) / (1024*1024)
        total_plv += file_count
        size_plv += year_size_mb
        print(f"   {year} : {file_count:3d} fichiers ({year_size_mb:7.2f} MB)")
    print(f"   TOTAL : {total_plv:3d} fichiers ({size_plv:7.2f} MB)")

    print(f"\nüìä R√©partition DIS_RESULT par ann√©e :")
    total_result = 0
    size_result = 0
    for year in sorted(result_by_year.keys()):
        file_count = len(result_by_year[year])
        year_size_mb = sum(f.size for f in result_by_year[year]) / (1024*1024)
        total_result += file_count
        size_result += year_size_mb
        print(f"   {year} : {file_count:3d} fichiers ({year_size_mb:7.2f} MB)")
    print(f"   TOTAL : {total_result:3d} fichiers ({size_result:7.2f} MB)")

except Exception as e:
    # On any failure the consolidation step receives empty groupings
    print(f"‚ùå Erreur : {str(e)}")
    plv_by_year = {}
    result_by_year = {}

# %% [markdown]
# ### 5.2. Fonction de consolidation en UN SEUL fichier

# %%
def consolidate_to_single_parquet(year, file_list, file_type):
    """
    Merge all the files of one year and one type into a single Parquet file.

    Each input file is read with Spark as a CSV with a header row, every
    column kept as a string. A file is skipped when its header does not hold
    exactly the expected column names (in any order) or when reading it
    raises. The remaining DataFrames are combined by column name, written
    with ``coalesce(1)`` into a temporary folder, and the resulting
    ``part-*.parquet`` file is copied to
    ``/mnt/bronze/DIS_<file_type>_<year>_consolidated.parquet``.

    Relies on the notebook globals ``spark`` and ``dbutils``. No exception
    propagates: a failure is printed and stored in ``stats['error']``.

    Args:
        year (str): Year to consolidate
        file_list (list): List of FileInfo entries (``name`` and ``path`` are used)
        file_type (str): 'PLV' selects the PLV columns; any other value
            selects the RESULT columns

    Returns:
        dict: Consolidation statistics with the keys ``year``, ``type``,
        ``input_files`` (all files received, skipped ones included),
        ``total_rows``, ``output_file``, ``output_size_mb``, ``success`` and
        ``error``.
    """
    stats = {
        'year': year,
        'type': file_type,
        'input_files': len(file_list),
        'total_rows': 0,
        'output_file': '',
        'output_size_mb': 0,
        'success': False,
        'error': None
    }

    try:
        print(f"\n{'='*60}")
        print(f"üìÖ Consolidation DIS_{file_type} ann√©e {year}")
        print(f"{'='*60}")
        print(f"üìÇ Nombre de fichiers √† fusionner : {len(file_list)}")

        # Expected columns for the requested type
        if file_type == 'PLV':
            columns = [
                'cddept', 'cdreseau', 'inseecommuneprinc', 'nomcommuneprinc',
                'cdreseauamont', 'nomreseauamont', 'pourcentdebit', 'referenceprel',
                'dateprel', 'heureprel', 'conclusionprel', 'ugelib', 'distrlib',
                'moalib', 'plvconformitebacterio', 'plvconformitechimique',
                'plvconformitereferencebact', 'plvconformitereferencechim'
            ]
        else:  # RESULT
            columns = [
                'cddept', 'referenceprel', 'cdparametresiseeaux', 'cdparametre',
                'libmajparametre', 'libminparametre', 'libwebparametre', 'qualitparam',
                'insituana', 'rqana', 'cdunitereferencesiseeaux', 'cdunitereference',
                'limitequal', 'refqual', 'valtraduite', 'casparam', 'referenceanl'
            ]
        expected_columns = set(columns)

        # Read every file with Spark
        print(f"\nüîÑ Lecture et fusion des fichiers...")
        all_dataframes = []

        for idx, file in enumerate(file_list, 1):
            # Only the first three and last three files get a detailed line;
            # the fourth prints a single "in progress" message instead
            show_progress = idx <= 3 or idx > len(file_list) - 3
            try:
                if show_progress:
                    print(f"   [{idx}/{len(file_list)}] {file.name}...", end=" ")
                elif idx == 4:
                    print(f"   ... lecture des fichiers 4 √† {len(file_list)-3} en cours ...")

                df = spark.read.csv(
                    file.path,
                    header=True,
                    inferSchema=False,  # everything as string to avoid type errors
                    sep=",",
                    quote='"',
                    escape='"',
                    encoding="UTF-8"
                )

                # Keep the file only when its header matches the expected names
                if set(df.columns) == expected_columns:
                    row_count = df.count()
                    if show_progress:
                        print(f"‚úÖ ({row_count:,} lignes)")
                    all_dataframes.append(df)
                elif show_progress:
                    print(f"‚ö†Ô∏è  Colonnes diff√©rentes, ignor√©")

            except Exception as e:
                # An unreadable file is skipped; the loop goes on
                if show_progress:
                    print(f"‚ùå {str(e)}")

        if not all_dataframes:
            stats['error'] = "Aucun fichier n'a pu √™tre lu"
            print(f"\n‚ùå {stats['error']}")
            return stats

        # Merge all the DataFrames
        print(f"\nüîó Fusion de {len(all_dataframes)} DataFrames...")
        consolidated_df = all_dataframes[0]

        for idx, df in enumerate(all_dataframes[1:], 2):
            # The header check above ignores column order, so the merge must
            # match columns by name; a positional union() would put values
            # in the wrong columns for a file with reordered headers.
            consolidated_df = consolidated_df.unionByName(df)
            if idx % 20 == 0:  # progress every 20 files
                print(f"   ... {idx}/{len(all_dataframes)} DataFrames fusionn√©s")

        print(f"‚úÖ Fusion termin√©e")

        # Count the total number of rows
        print(f"\nüìä Comptage des lignes...")
        total_rows = consolidated_df.count()
        stats['total_rows'] = total_rows
        print(f"‚úÖ Total de lignes : {total_rows:,}")

        # Temporary output folder and final single-file path
        temp_output_path = f"/mnt/bronze/temp_DIS_{file_type}_{year}_consolidated"
        final_output_file = f"/mnt/bronze/DIS_{file_type}_{year}_consolidated.parquet"

        stats['output_file'] = final_output_file

        print(f"\nüíæ √âcriture en UN SEUL fichier Parquet...")
        print(f"   (Cette √©tape peut prendre du temps selon la taille des donn√©es)")

        # Best-effort removal of a leftover temporary folder. Only ordinary
        # errors are ignored, so an interrupt still stops the run.
        try:
            dbutils.fs.rm(temp_output_path, recurse=True)
        except Exception:
            pass

        # coalesce(1) makes Spark write a single part file
        consolidated_df.coalesce(1).write.mode('overwrite').parquet(temp_output_path)

        print(f"‚úÖ Fichier √©crit dans le dossier temporaire")

        # Locate the part file inside the temporary folder
        print(f"\nüì¶ Renommage en fichier unique...")
        temp_files = dbutils.fs.ls(temp_output_path)
        parquet_file = [f for f in temp_files if f.name.startswith('part-') and f.name.endswith('.parquet')]

        if parquet_file:
            # Copy the part file to the final name, then drop the temporary folder
            dbutils.fs.cp(parquet_file[0].path, final_output_file)
            dbutils.fs.rm(temp_output_path, recurse=True)

            print(f"‚úÖ Fichier unique cr√©√© : DIS_{file_type}_{year}_consolidated.parquet")

            # Size of the final file
            file_info = dbutils.fs.ls(final_output_file)
            if file_info:
                output_size = file_info[0].size / (1024*1024)
                stats['output_size_mb'] = output_size
                print(f"üìè Taille : {output_size:.2f} MB")
        else:
            # No part file found: keep the folder and report it as the output
            stats['output_file'] = temp_output_path
            output_files = dbutils.fs.ls(temp_output_path)
            output_size = sum(f.size for f in output_files) / (1024*1024)
            stats['output_size_mb'] = output_size
            print(f"‚ö†Ô∏è  Mode dossier conserv√©")
            print(f"üìè Taille : {output_size:.2f} MB")

        stats['success'] = True

    except Exception as e:
        stats['error'] = str(e)
        print(f"\n‚ùå Erreur lors de la consolidation : {str(e)}")

    return stats

# %% [markdown]
# ### 5.3. Consolidation de toutes les années

# %%
print("\n" + "=" * 60)
print("‚öôÔ∏è  D√âMARRAGE DE LA CONSOLIDATION")
print("=" * 60)

consolidation_stats = []

# (banner, file type, files grouped by year) for each family, DIS_PLV first
consolidation_plan = (
    ("\nüîµ CONSOLIDATION DES FICHIERS DIS_PLV", 'PLV', plv_by_year),
    ("\nüü¢ CONSOLIDATION DES FICHIERS DIS_RESULT", 'RESULT', result_by_year),
)

for family_banner, family_type, files_by_year in consolidation_plan:
    if not files_by_year:
        continue  # nothing to consolidate for this family, so no banner either

    print(family_banner)
    print("=" * 60)
    # Years are processed in ascending order
    for year in sorted(files_by_year):
        stats = consolidate_to_single_parquet(year, files_by_year[year], family_type)
        consolidation_stats.append(stats)

print("\n" + "=" * 60)
print("‚úÖ CONSOLIDATION TERMIN√âE")
print("=" * 60)

# %% [markdown]
# ### 5.4. Rapport de consolidation

# %%
print("\n" + "=" * 60)
print("üìä RAPPORT DE CONSOLIDATION")
print("=" * 60)

# Split the outcomes into successes and failures, keeping their order
successful, failed = [], []
for outcome in consolidation_stats:
    (successful if outcome['success'] else failed).append(outcome)

print(f"\n‚úÖ Consolidations r√©ussies : {len(successful)}/{len(consolidation_stats)}")

if successful:
    # Separate PLV and RESULT
    plv_stats = [outcome for outcome in successful if outcome['type'] == 'PLV']
    result_stats = [outcome for outcome in successful if outcome['type'] == 'RESULT']

    print("\nüìã Statistiques DIS_PLV :")
    if plv_stats:
        total_input_plv = sum(outcome['input_files'] for outcome in plv_stats)
        total_rows_plv = sum(outcome['total_rows'] for outcome in plv_stats)
        total_size_plv = sum(outcome['output_size_mb'] for outcome in plv_stats)

        print(
            f"   üìÇ Fichiers d'entr√©e trait√©s : {total_input_plv}",
            f"   üìä Lignes totales : {total_rows_plv:,}",
            f"   üíæ Taille totale : {total_size_plv:.2f} MB",
            sep="\n",
        )

        print("\n   üìÖ D√©tail par ann√©e :")
        for stats in plv_stats:
            print(f"      {stats['year']} : {stats['input_files']} fichiers ‚Üí {stats['total_rows']:,} lignes ({stats['output_size_mb']:.2f} MB)")

    print("\nüìã Statistiques DIS_RESULT :")
    if result_stats:
        total_input_result = sum(outcome['input_files'] for outcome in result_stats)
        total_rows_result = sum(outcome['total_rows'] for outcome in result_stats)
        total_size_result = sum(outcome['output_size_mb'] for outcome in result_stats)

        print(
            f"   üìÇ Fichiers d'entr√©e trait√©s : {total_input_result}",
            f"   üìä Lignes totales : {total_rows_result:,}",
            f"   üíæ Taille totale : {total_size_result:.2f} MB",
            sep="\n",
        )

        print("\n   üìÖ D√©tail par ann√©e :")
        for stats in result_stats:
            print(f"      {stats['year']} : {stats['input_files']} fichiers ‚Üí {stats['total_rows']:,} lignes ({stats['output_size_mb']:.2f} MB)")

if failed:
    # One line per failed consolidation with its recorded error
    print(f"\n‚ùå Consolidations √©chou√©es : {len(failed)}")
    for stats in failed:
        print(f"   - {stats['type']} {stats['year']} : {stats['error']}")

# %% [markdown]
# ### 5.5. Vérification des fichiers consolidés

# %%
print("\n" + "=" * 60)
print("üîç V√âRIFICATION DES FICHIERS CONSOLID√âS")
print("=" * 60)

try:
    bronze_files_after = dbutils.fs.ls("/mnt/bronze")

    # Consolidated outputs: name contains the family tag and 'consolidated',
    # but not the 'temp_' prefix used for the temporary folders
    consolidated_plv = [
        entry for entry in bronze_files_after
        if 'DIS_PLV' in entry.name and 'consolidated' in entry.name and 'temp_' not in entry.name
    ]
    consolidated_result = [
        entry for entry in bronze_files_after
        if 'DIS_RESULT' in entry.name and 'consolidated' in entry.name and 'temp_' not in entry.name
    ]

    print(
        "üìÇ Fichiers consolid√©s cr√©√©s :",
        f"   - DIS_PLV : {len(consolidated_plv)}",
        f"   - DIS_RESULT : {len(consolidated_result)}",
        f"   - TOTAL : {len(consolidated_plv) + len(consolidated_result)}",
        sep="\n",
    )

    if consolidated_plv:
        print("\nüì¶ Fichiers DIS_PLV consolid√©s :")
        # Listed in name order
        for file in sorted(consolidated_plv, key=lambda entry: entry.name):
            size = file.size / (1024 * 1024)
            print(f"   - {file.name} ({size:.2f} MB)")

    if consolidated_result:
        print("\nüì¶ Fichiers DIS_RESULT consolid√©s :")
        for file in sorted(consolidated_result, key=lambda entry: entry.name):
            size = file.size / (1024 * 1024)
            print(f"   - {file.name} ({size:.2f} MB)")

except Exception as e:
    print(f"‚ùå Erreur : {e!s}")

## 6. Suppression des fichiers individuels et v√©rification finale

In [0]:
# %% [markdown]
# ## 6. Suppression des fichiers .txt individuels

# %%
print("\n" + "="*60)
print("üóëÔ∏è  SUPPRESSION DES FICHIERS .TXT INDIVIDUELS")
print("="*60)

# WARNING: deletion is irreversible. Set to False for a dry run that only
# reports how many files would be removed.
CONFIRM_DELETE = True


def delete_files_with_progress(files, remove, preview=5):
    """
    Delete every entry of `files` and report progress on stdout.

    Args:
        files: sequence of objects exposing `.name` and `.path`
            (e.g. the FileInfo entries returned by dbutils.fs.ls).
        remove: callable taking a path and deleting it (e.g. dbutils.fs.rm).
            An explicit `False` return value is treated as a failed deletion
            (assumes dbutils.fs.rm returns a boolean - TODO confirm).
        preview: number of files logged individually at the start and at the
            end of the batch; the middle of the batch is summarised by a
            single line. `None` logs every file, without the [i/n] counter.

    Returns:
        tuple (deleted, errors): number of files actually removed and number
        of files whose removal failed.
    """
    total = len(files)
    deleted = 0
    errors = 0

    for idx, file in enumerate(files, 1):
        if preview is None:
            verbose = True
            prefix = ""
        else:
            verbose = idx <= preview or idx > total - preview
            prefix = f"[{idx}/{total}] "

        if verbose:
            print(f"   {prefix}{file.name}...", end=" ")
        elif idx == preview + 1:
            print(f"   ... suppression des fichiers {preview + 1} √† {total - preview} en cours ...")

        try:
            if remove(file.path) is False:
                raise RuntimeError(f"remove() -> False ({file.path})")
        except Exception as e:
            errors += 1
            if verbose:
                print(f"‚ùå {str(e)}")
            else:
                # Failures are always reported, even inside the silent range
                print(f"   ‚ùå {file.name} : {str(e)}")
        else:
            deleted += 1
            if verbose:
                print(f"‚úÖ")

    return deleted, errors


if CONFIRM_DELETE:
    print("‚ö†Ô∏è  La suppression est ACTIV√âE")
    print("‚è≥ Suppression en cours...\n")

    try:
        bronze_files = dbutils.fs.ls("/mnt/bronze")

        # Every .txt file at the root of the container (DIS_PLV, DIS_RESULT, others)
        txt_files = [f for f in bronze_files if f.name.endswith('.txt')]

        print(f"üìÇ Fichiers .txt trouv√©s : {len(txt_files)}")

        if len(txt_files) == 0:
            print("‚úÖ Aucun fichier .txt √† supprimer")
        else:
            # Disjoint groups: a name matching both markers is handled once
            # (as DIS_PLV) so that no file is deleted or counted twice.
            plv_txt = [f for f in txt_files if 'DIS_PLV' in f.name]
            result_txt = [f for f in txt_files if 'DIS_RESULT' in f.name and 'DIS_PLV' not in f.name]
            other_txt = [f for f in txt_files if 'DIS_PLV' not in f.name and 'DIS_RESULT' not in f.name]

            print(f"   - DIS_PLV : {len(plv_txt)} fichiers")
            print(f"   - DIS_RESULT : {len(result_txt)} fichiers")
            if other_txt:
                print(f"   - Autres : {len(other_txt)} fichiers")

            deleted_count = 0
            error_count = 0

            # (header line, dataset label or None, files, preview size)
            deletion_groups = [
                (f"\nüîµ Suppression des fichiers DIS_PLV.txt...", "DIS_PLV", plv_txt, 5),
                (f"\nüü¢ Suppression des fichiers DIS_RESULT.txt...", "DIS_RESULT", result_txt, 5),
                (f"\n‚ö™ Suppression des autres fichiers .txt...", None, other_txt, None),
            ]

            for header, label, group_files, preview in deletion_groups:
                if not group_files:
                    continue
                print(header)
                group_deleted, group_errors = delete_files_with_progress(
                    group_files, dbutils.fs.rm, preview=preview
                )
                deleted_count += group_deleted
                error_count += group_errors

                if label is not None:
                    # Report what was really deleted, not the size of the batch
                    print(f"   ‚úÖ {group_deleted} fichiers {label} supprim√©s")
                    if group_errors:
                        print(f"   ‚ùå Erreurs : {group_errors}")

            print(f"\n‚úÖ Suppression termin√©e")
            print(f"   üóëÔ∏è  Total supprim√© : {deleted_count} fichiers")

            if error_count > 0:
                print(f"   ‚ùå Erreurs : {error_count}")

    except Exception as e:
        print(f"‚ùå Erreur g√©n√©rale : {str(e)}")

else:
    print("‚ö†Ô∏è  La suppression est D√âSACTIV√âE")
    print("   Pour activer la suppression, d√©finir CONFIRM_DELETE = True")

    # Dry run: only report what would be deleted
    try:
        bronze_files = dbutils.fs.ls("/mnt/bronze")
        txt_files = [f for f in bronze_files if f.name.endswith('.txt')]

        plv_txt = [f for f in txt_files if 'DIS_PLV' in f.name]
        result_txt = [f for f in txt_files if 'DIS_RESULT' in f.name]

        print(f"\n   üìÇ Fichiers qui seraient supprim√©s :")
        print(f"      - DIS_PLV.txt : {len(plv_txt)}")
        print(f"      - DIS_RESULT.txt : {len(result_txt)}")
        print(f"      - Total : {len(txt_files)}")
    except Exception as e:
        # Surface the problem instead of silently hiding it
        print(f"‚ùå Erreur : {str(e)}")

# %% [markdown]
# ### 6.2. Vérification finale du conteneur BRONZE

# %%
print("\n" + "="*60)
print("üîç √âTAT FINAL DU CONTENEUR BRONZE")
print("="*60)

try:
    bronze_files_final = dbutils.fs.ls("/mnt/bronze")

    # Single pass over the listing: an entry can be counted both as parquet
    # and as .txt if its name satisfies both tests; anything matching
    # neither test goes to "other".
    parquet_files = []
    txt_files_remaining = []
    other_files = []
    for entry in bronze_files_final:
        is_parquet = '.parquet' in entry.name
        is_txt = entry.name.endswith('.txt')
        if is_parquet:
            parquet_files.append(entry)
        if is_txt:
            txt_files_remaining.append(entry)
        if not (is_parquet or is_txt):
            other_files.append(entry)

    print(f"üìÇ Contenu final de BRONZE :")
    print(f"   - Fichiers .parquet : {len(parquet_files)}")
    print(f"   - Fichiers .txt restants : {len(txt_files_remaining)}")
    if other_files:
        print(f"   - Autres fichiers : {len(other_files)}")
    print(f"   - TOTAL : {len(bronze_files_final)}")

    if parquet_files:
        # Consolidated outputs, split by dataset
        plv_parquet = [f for f in parquet_files if 'DIS_PLV' in f.name and 'consolidated' in f.name]
        result_parquet = [f for f in parquet_files if 'DIS_RESULT' in f.name and 'consolidated' in f.name]

        print(f"\nüì¶ Fichiers Parquet consolid√©s :")
        # The second dataset header is preceded by a blank line
        for lead, dataset, entries in (
            ("", "DIS_PLV", plv_parquet),
            ("\n", "DIS_RESULT", result_parquet),
        ):
            print(f"{lead}   {dataset} : {len(entries)} fichiers")
            for file in sorted(entries, key=lambda x: x.name):
                size = file.size / (1024*1024)
                print(f"      - {file.name} ({size:.2f} MB)")

        # Cumulative size of every parquet entry, in MB
        total_size = sum(f.size for f in parquet_files) / (1024*1024)
        print(f"\nüíæ Taille totale des Parquet : {total_size:.2f} MB")

    if txt_files_remaining:
        print(f"\n‚ö†Ô∏è  Fichiers .txt restants : {len(txt_files_remaining)}")
        # Show at most the first 10 names, then a count of the hidden ones
        for file in txt_files_remaining[:10]:
            print(f"   - {file.name}")
        hidden_count = len(txt_files_remaining) - 10
        if hidden_count > 0:
            print(f"   ... et {hidden_count} autres")

except Exception as e:
    print(f"‚ùå Erreur : {str(e)}")

# %% [markdown]
# ### 6.3. Test de lecture des fichiers Parquet

# %%
print("\n" + "="*60)
print("üß™ TEST DE LECTURE DES FICHIERS PARQUET")
print("="*60)

# Pick the consolidated files of each dataset from the listing produced by the
# previous cell (bronze_files_final). If that cell failed, the name is missing
# and the error is reported here instead of crashing the notebook.
try:
    plv_files = [f for f in bronze_files_final if 'DIS_PLV' in f.name and 'consolidated.parquet' in f.name]
    result_files = [f for f in bronze_files_final if 'DIS_RESULT' in f.name and 'consolidated.parquet' in f.name]
except Exception as e:
    plv_files = []
    result_files = []
    print(f"‚ùå Erreur : {str(e)}")

# Read the first file of each dataset (PLV first, then RESULT). Each read has
# its own error handling so that a failure on one dataset does not skip the
# other one.
for candidate_files in (plv_files, result_files):
    if not candidate_files:
        continue

    test_file = candidate_files[0]
    try:
        print(f"\nüìÑ Test de lecture : {test_file.name}")

        df_test = spark.read.parquet(test_file.path)

        print(f"‚úÖ Lecture r√©ussie")
        print(f"üìä Nombre de lignes : {df_test.count():,}")
        print(f"üìä Nombre de colonnes : {len(df_test.columns)}")

        print(f"\nüìã Colonnes :")
        # Do NOT name the loop variable `col`: at notebook level it would
        # overwrite pyspark.sql.functions.col imported in the first cell.
        for column_name in df_test.columns:
            print(f"   - {column_name}")

        print(f"\nüìù Aper√ßu (5 premi√®res lignes) :")
        df_test.show(5, truncate=50)

    except Exception as e:
        print(f"‚ùå Erreur : {str(e)}")

# %% [markdown]
# ### 6.4. Résumé final du pipeline

# %%
print("\n" + "="*60)
print("üéâ PIPELINE RAW ‚Üí BRONZE TERMIN√â")
print("="*60)

# The summary is derived from the current content of /mnt/bronze rather than
# from hard-coded figures, so it cannot drift from the real pipeline outcome.
try:
    summary_listing = dbutils.fs.ls("/mnt/bronze")
except Exception as e:
    summary_listing = []
    print(f"‚ùå Erreur : {str(e)}")

# Consolidated parquet files per dataset, sorted by name
summary_by_dataset = {}
for dataset in ("DIS_PLV", "DIS_RESULT"):
    summary_by_dataset[dataset] = sorted(
        (
            f for f in summary_listing
            if dataset in f.name
            and 'consolidated' in f.name
            and '.parquet' in f.name
            and 'temp_' not in f.name
        ),
        key=lambda f: f.name,
    )

# Individual .txt files still present in the container
summary_txt_left = [f for f in summary_listing if f.name.endswith('.txt')]

# The pipeline is considered successful only if both datasets were consolidated
pipeline_ok = all(summary_by_dataset.values())

if pipeline_ok:
    print("\n‚úÖ Consolidation r√©ussie !")
    print("\nüì¶ Fichiers Parquet uniques cr√©√©s dans /mnt/bronze/")
    print("   Format : DIS_PLV_YYYY_consolidated.parquet")
    print("   Format : DIS_RESULT_YYYY_consolidated.parquet")
else:
    print("\n‚ö†Ô∏è  Consolidation partielle ou absente")
    for dataset, files in summary_by_dataset.items():
        print(f"   - {dataset} : {len(files)}")

# Only claim the .txt files were removed when none is left
if summary_txt_left:
    print(f"\n‚ö†Ô∏è  Fichiers .txt restants : {len(summary_txt_left)}")
else:
    print("\nüóëÔ∏è  Fichiers .txt individuels supprim√©s")

print("\nüìä R√©sum√© des donn√©es :")
for position, (dataset, files) in enumerate(summary_by_dataset.items()):
    lead = "\n" if position else ""
    print(f"{lead}   {dataset} :")
    for f in files:
        # Label each line with the year taken from the file name when it
        # follows the <DATASET>_YYYY_consolidated pattern, else the full name
        year_match = re.search(r'_(\d{4})_consolidated', f.name)
        label = year_match.group(1) if year_match else f.name
        print(f"      - {label} : {f.size / (1024*1024):.2f} MB")

print("\nüí° Pour charger les donn√©es :")
print("   # PLV 2023")
print("   df_plv_2023 = spark.read.parquet('/mnt/bronze/DIS_PLV_2023_consolidated.parquet')")
print("\n   # RESULT 2023")
print("   df_result_2023 = spark.read.parquet('/mnt/bronze/DIS_RESULT_2023_consolidated.parquet')")

print("\nüîú Prochaines √©tapes :")
print("   1. Analyse exploratoire des donn√©es (EDA)")
print("   2. Nettoyage et validation")
print("   3. Transformation vers la couche SILVER")
print("   4. Cr√©ation de tableaux de bord")

print("\n" + "="*60)
if pipeline_ok:
    print("‚úÖ PIPELINE BRONZE COMPL√âT√â AVEC SUCC√àS")
else:
    print("‚ö†Ô∏è  PIPELINE BRONZE INCOMPLET")
print("="*60)