In [2]:
import pyarrow.fs as fs
import pyarrow.parquet as pq
import tomlkit
from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError

In [3]:
# Location of the dlt secrets file, relative to the notebook's working directory.
toml_path = ".dlt/secrets.toml"

# TOML documents are UTF-8 by specification, so decode explicitly instead of
# relying on the platform's default locale encoding.
with open(toml_path, "r", encoding="utf-8") as f:
    config = tomlkit.parse(f.read())

# Credentials for the MinIO (S3-compatible) destination and the Nessie catalog
# settings; both are consumed by later cells.
minio_config = config["parquet_to_minio"]["destination"]["credentials"]
nessie = config["nessie"]

In [4]:
# S3-compatible filesystem handle pointing at the endpoint configured in the
# secrets file; used below to read parquet objects and to stat paths.
minio = fs.S3FileSystem(
    access_key=minio_config["aws_access_key_id"],
    secret_key=minio_config["aws_secret_access_key"],
    endpoint_override=minio_config["endpoint_url"],
)

Here we read the file as a PyArrow table. If you need to perform other transformations first, keep in mind that you can also read it with pandas using PyArrow as the backend.

In [5]:
# Object key (bucket/prefix/file) of the parquet file to load.
path = "taxis/taxis_parquet/df_data/1757366836.193581.689b1c2701.parquet"

# Read the whole file into an in-memory Arrow table through the MinIO filesystem.
table_taxis = pq.read_table(source=path, filesystem=minio)

In [6]:
# Keep the Arrow schema of the loaded table; it is reused further down as the
# schema argument when the Iceberg table is created.
arrow_schema = table_taxis.schema
print(arrow_schema)

vendor_id: int32
tpep_pickup_datetime: timestamp[us]
tpep_dropoff_datetime: timestamp[us]
passenger_count: int64
trip_distance: double
ratecode_id: int64
store_and_fwd_flag: large_string
pu_location_id: int32
do_location_id: int32
payment_type: int64
fare_amount: double
extra: double
mta_tax: double
tip_amount: double
tolls_amount: double
improvement_surcharge: double
total_amount: double
congestion_surcharge: double
airport_fee: double
cbd_congestion_fee: double


In [7]:
# Stat both the parquet object and its parent prefix, printing one FileInfo
# per line.
print(*minio.get_file_info([path, "taxis/taxis_parquet/df_data/"]), sep="\n")

<FileInfo for 'taxis/taxis_parquet/df_data/1757366836.193581.689b1c2701.parquet': type=FileType.File, size=72092726>
<FileInfo for 'taxis/taxis_parquet/df_data/': type=FileType.Directory>


## Conversión a tabla de Iceberg

In [8]:
# Open the catalog named "nessie" using the URI and S3 credentials from the
# secrets file. The dotted property names cannot be written as Python keyword
# arguments, hence the dict unpacking.
# NOTE(review): the recorded output of this cell is a name-resolution failure
# for host 'nessie' — presumably the configured URI is only reachable from
# inside the container network; confirm before re-running.
catalog = load_catalog(
    name="nessie",
    uri=nessie["uri"],
    **{
        "s3.access-key-id": nessie["access-key-id"],
        "s3.secret-access-key": nessie["secret-access-key"],
    },
)

ConnectionError: HTTPConnectionPool(host='nessie', port=19120): Max retries exceeded with url: /iceberg/main/v1/config (Caused by NameResolutionError("<urllib3.connection.HTTPConnection object at 0x7c7539e117f0>: Failed to resolve 'nessie' ([Errno -3] Temporary failure in name resolution)"))

In [None]:
# List the namespaces currently known to the catalog (state before the
# create-namespace attempt).
namespaces = catalog.list_namespaces()
print(namespaces)

[('taxis',)]


In [None]:
# Make sure the "taxis" namespace exists before creating tables in it.
# Only the "namespace already exists" outcome is treated as benign; any other
# failure (network, authentication, server error) now propagates instead of
# being reported as "already created".
try:
    catalog.create_namespace("taxis")
except NamespaceAlreadyExistsError as e:
    print("El catálogo ya está creado →", e)

El catálogo ya está creado → AlreadyExistsException: Namespace already exists: taxis


In [None]:
# List the namespaces again to confirm that "taxis" is present after the
# create-namespace attempt.
namespaces = catalog.list_namespaces()
print(namespaces)

[('taxis',)]


In [10]:
# Register a new Iceberg table in the "taxis" namespace, reusing the Arrow
# schema captured from the parquet file. Left as the cell's last expression so
# the returned table object is displayed.
catalog.create_table(identifier="taxis.taxis_iceberg", schema=arrow_schema)

NameError: name 'catalog' is not defined

In [58]:
# Fetch the Iceberg table handle from the catalog, then append the Arrow table
# read from MinIO.
# NOTE(review): every execution of this cell issues another append of the same
# data — avoid re-running it unless duplicated rows are intended.
taxis_iceberg = catalog.load_table("taxis.taxis_iceberg")
taxis_iceberg.append(df=table_taxis)

NoSuchTableError: NoSuchTableException: Table does not exist: taxis.taxis_iceberg

## Consultar los datos

In [20]:
# Scan the Iceberg table with no filter or column selection.
result = taxis_iceberg.scan()
# Materialise the scan as an Arrow table.
result_arrow = result.to_arrow()
# Convert to a pandas DataFrame for inspection.
result_df = result_arrow.to_pandas()

In [24]:
# Preview the first rows only, rather than rendering the whole frame.
result_df.head()

Unnamed: 0,vendor_id,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,ratecode_id,store_and_fwd_flag,pu_location_id,do_location_id,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,cbd_congestion_fee
0,1,2025-01-01 00:18:38,2025-01-01 00:26:59,1.0,1.6,1.0,N,229,237,1,10.0,3.5,0.5,3.0,0.0,1.0,18.0,2.5,0.0,0.0
1,1,2025-01-01 00:32:40,2025-01-01 00:35:13,1.0,0.5,1.0,N,236,237,1,5.1,3.5,0.5,2.02,0.0,1.0,12.12,2.5,0.0,0.0
2,1,2025-01-01 00:44:04,2025-01-01 00:46:01,1.0,0.6,1.0,N,141,141,1,5.1,3.5,0.5,2.0,0.0,1.0,12.1,2.5,0.0,0.0
3,2,2025-01-01 00:14:27,2025-01-01 00:20:01,3.0,0.52,1.0,N,244,244,2,7.2,1.0,0.5,0.0,0.0,1.0,9.7,0.0,0.0,0.0
4,2,2025-01-01 00:21:34,2025-01-01 00:25:06,3.0,0.66,1.0,N,244,116,2,5.8,1.0,0.5,0.0,0.0,1.0,8.3,0.0,0.0,0.0
