In [None]:
from azure.storage.blob import BlobServiceClient
from dlt.sources.filesystem  import readers

import dlt
import os

In [None]:
# Create the dlt pipeline object used by the later cells.
# It targets dlt's "filesystem" destination; the concrete storage location
# is not set here, so it has to come from dlt's own configuration.
pipeline = dlt.pipeline(
    destination="filesystem",
    dataset_name="taxis_parquet_data",
    pipeline_name="minio_to_azure",
)

In [None]:
# Build the source resource in a single expression: match every *.parquet
# file under the bucket path, read it as Parquet, and name the resulting
# resource "df_parquet".
parquet_reader = (
    readers(
        bucket_url="s3://my-bucket/taxis/taxis_iceberg_5128d68f-aa5c-4270-a561-d530f60229b4/data",
        file_glob="*.parquet",
    )
    .read_parquet()
    .with_name("df_parquet")
)

In [None]:
# Run the pipeline: write the resource as Parquet files and replace
# whatever an earlier run stored for it.
load_info = pipeline.run(
    parquet_reader,
    write_disposition="replace",
    loader_file_format="parquet",
)

# Show the load summary followed by the normalize-step summary, one per line.
print(load_info, pipeline.last_trace.last_normalize_info, sep="\n")

Pipeline minio_to_azure load step completed in 3 minutes and 16.32 seconds
1 load package(s) were loaded to destination filesystem and into dataset dimelo_flow
The filesystem destination used abfss://clase-4-dlt@fhbd.dfs.core.windows.net/GRUPO_6 location to store data
Load package 1757139071.3071916 is LOADED and contains no failed jobs
Normalized data for the following tables:
- df_parquet: 3475226 row(s)
- _dlt_pipeline_state: 1 row(s)

Load package 1757139071.3071916 is NORMALIZED and NOT YET LOADED to the destination and contains no failed jobs


## Confirmar la subida de los archivos al contenedor de Azure

In [None]:
# Azure Storage settings, looked up in the process environment.
# Each value is None when its variable is not set.
ACCOUNT_NAME = os.environ.get("AZURE_STORAGE_ACCOUNT_NAME")
ACCOUNT_KEY = os.environ.get("AZURE_STORAGE_ACCOUNT_KEY")
CONTAINER_NAME = os.environ.get("AZURE_CONTAINER_NAME")

def list_azure_blobs(prefix=None):
    """Print a numbered listing of the blobs in the configured container.

    Uses the module-level ACCOUNT_NAME, ACCOUNT_KEY and CONTAINER_NAME
    settings to connect. ACCOUNT_KEY is handed to the client unchecked.

    Args:
        prefix: Optional blob-name prefix. When given, only blobs whose name
            starts with it are listed; ``None`` (the default) lists the
            whole container, as before.

    Returns:
        None. The listing, or an error message, is printed; exceptions are
        caught and reported rather than propagated.
    """
    # Fail early with a readable message: an unset account name would
    # otherwise be formatted into the URL as the literal text "None".
    missing = [
        env_var
        for env_var, value in (
            ("AZURE_STORAGE_ACCOUNT_NAME", ACCOUNT_NAME),
            ("AZURE_CONTAINER_NAME", CONTAINER_NAME),
        )
        if not value
    ]
    if missing:
        print(f"Error: faltan variables de entorno: {', '.join(missing)}")
        return

    try:
        # Create the service client for the storage account
        blob_service_client = BlobServiceClient(
            account_url=f"https://{ACCOUNT_NAME}.blob.core.windows.net",
            credential=ACCOUNT_KEY
        )

        # Get a client bound to the target container
        container_client = blob_service_client.get_container_client(CONTAINER_NAME)

        # Header for the listing
        print(f"Objetos en el contenedor '{CONTAINER_NAME}':")
        print("-" * 50)

        # One line per blob: 1-based index, name, size converted from bytes to MB
        for i, blob in enumerate(container_client.list_blobs(name_starts_with=prefix), 1):
            size_mb = blob.size / (1024 * 1024)
            print(f"{i:3d}. {blob.name} ({size_mb:.2f} MB)")

    except Exception as e:
        # Best-effort helper: report the failure instead of raising
        print(f"Error: {e}")



def delete_dimelo_flow(prefix="GRUPO_6/dimelo_flow/"):
    """Delete every blob under ``prefix`` and then the directory itself.

    Uses the module-level ACCOUNT_NAME, ACCOUNT_KEY and CONTAINER_NAME
    settings to connect.

    Args:
        prefix: Folder path inside the container whose contents are removed.
            A trailing "/" is added when missing so that sibling folders
            sharing the same leading characters are not matched. An empty
            prefix is refused because it would match the whole container.

    Returns:
        None. Progress and errors are printed; exceptions are caught and
        reported rather than propagated.
    """
    # Normalise to "<directory>/" and refuse anything that would match
    # every blob in the container.
    directory_path = (prefix or "").rstrip("/")
    if not directory_path:
        print("Error al eliminar: el prefijo no puede estar vacío")
        return
    prefix = directory_path + "/"

    # An unset account name would otherwise be formatted into the URL as
    # the literal text "None".
    missing = [
        env_var
        for env_var, value in (
            ("AZURE_STORAGE_ACCOUNT_NAME", ACCOUNT_NAME),
            ("AZURE_CONTAINER_NAME", CONTAINER_NAME),
        )
        if not value
    ]
    if missing:
        print(f"Error al eliminar: faltan variables de entorno: {', '.join(missing)}")
        return

    try:
        blob_service_client = BlobServiceClient(
            account_url=f"https://{ACCOUNT_NAME}.blob.core.windows.net",
            credential=ACCOUNT_KEY
        )
        container_client = blob_service_client.get_container_client(CONTAINER_NAME)

        # Delete deepest paths first. On accounts with a hierarchical
        # namespace the listing also contains directory entries, and the
        # Blob API refuses to delete a directory that still has children.
        # In reverse lexicographic order every "dir/child" precedes "dir".
        blob_names = sorted(
            (blob.name for blob in container_client.list_blobs(name_starts_with=prefix)),
            reverse=True,
        )

        deleted = 0
        for blob_name in blob_names:
            print(f"Eliminando blob: {blob_name}")
            container_client.delete_blob(blob_name)
            deleted += 1

        print(f"✅ Eliminados {deleted} blobs bajo {prefix}")

        # Try to remove the directory entry itself (only meaningful on
        # ADLS Gen2 accounts with a hierarchical namespace).
        try:
            # Optional dependency: imported here so the blob clean-up above
            # still works when the Data Lake SDK is not installed.
            from azure.storage.filedatalake import DataLakeServiceClient

            dl_service = DataLakeServiceClient(
                account_url=f"https://{ACCOUNT_NAME}.dfs.core.windows.net",
                credential=ACCOUNT_KEY
            )
            file_system_client = dl_service.get_file_system_client(CONTAINER_NAME)

            # Remove the root folder of the prefix
            dir_client = file_system_client.get_directory_client(directory_path)
            dir_client.delete_directory()
            print(f"✅ Carpeta {prefix} eliminada")
        except ImportError:
            print("ℹ️ Solo se borraron los blobs (no el directorio). Instala azure-storage-file-datalake si quieres borrar carpetas con HNS.")

    except Exception as e:
        # Best-effort helper: report the failure instead of raising
        print(f"Error al eliminar: {e}")


In [18]:
# List the blobs in the configured container to confirm the upload
list_azure_blobs()


Objetos en el contenedor 'clase-4-dlt':
--------------------------------------------------
  1. GRUPO_1 (0.00 MB)
  2. GRUPO_1/taxis_parquet (0.00 MB)
  3. GRUPO_1/taxis_parquet/_dlt_loads (0.00 MB)
  4. GRUPO_1/taxis_parquet/_dlt_loads/s3_to_adls__1757702839.136807.jsonl (0.00 MB)
  5. GRUPO_1/taxis_parquet/_dlt_loads/s3_to_adls__1757720850.1673822.jsonl (0.00 MB)
  6. GRUPO_1/taxis_parquet/_dlt_pipeline_state (0.00 MB)
  7. GRUPO_1/taxis_parquet/_dlt_pipeline_state/s3_to_adls__1757702839.136807__b04e1cc0d4556f1be45152bbf8b5d0d5847d9aa52a60eaed4c438141a24c0c7c.jsonl (0.00 MB)
  8. GRUPO_1/taxis_parquet/_dlt_version (0.00 MB)
  9. GRUPO_1/taxis_parquet/_dlt_version/s3_to_adls__1757702843.589799__b04e1cc0d4556f1be45152bbf8b5d0d5847d9aa52a60eaed4c438141a24c0c7c.jsonl (0.00 MB)
 10. GRUPO_1/taxis_parquet/_dlt_version/s3_to_adls__1757721258.7429767__5d45807d5aae2793a12fa1caf5e477d22cd3f3e13e05f4b5f20e6bee79451e32.jsonl (0.01 MB)
 11. GRUPO_1/taxis_parquet/df_parquet (0.00 MB)
 12. GRUPO_1/