# Iceberg writer against local MinIO

Write the ecommerce dataset as an Apache Iceberg table with a SQLite-backed catalog while storing data files in MinIO.

In [None]:
import datetime
# Import the submodule explicitly: a bare `import importlib` does not guarantee
# that `importlib.util` is loaded, so `importlib.util.find_spec` could raise
# AttributeError in a fresh interpreter.
import importlib.util
from pathlib import Path

from dataset_generator import (
    CatalogConfig,
    S3Config,
    WriterOptions,
    create_generator,
    create_writer,
    write_dataset,
)

# Fail fast with an actionable message when pyiceberg cannot be found,
# before any dataset generation or writing is attempted.
if importlib.util.find_spec("pyiceberg") is None:
    raise RuntimeError("Install dataset-generator[iceberg] before running this notebook.")

## Configure endpoints

Defaults assume the docker-compose MinIO service and an existing bucket named `demo`.

In [None]:
# MinIO connection settings (the docker-compose defaults).
endpoint = "http://localhost:9000"
access_key = secret_key = "minioadmin"

# Target location inside MinIO: bucket name and key prefix.
bucket, prefix = "demo", "iceberg"

# Absolute path of the SQLite file backing the catalog; its parent directory
# is created up front so the database file can be created there.
catalog_db = Path("examples/demo_output/iceberg/catalog.db").resolve()
catalog_dir = catalog_db.parent
catalog_dir.mkdir(parents=True, exist_ok=True)

## Generate dataset

Create a small ecommerce dataset for the period between `start_date` (2023-05-01) and `end_date` (2023-05-04).

In [None]:
# Settings for the "ecommerce" generator, collected in one place so they are
# easy to tweak before the generator is built.
generator_settings = {
    "seed": 33,
    "n_customers": 400,
    "n_products": 160,
    "orders_per_day": 100,
    "order_items_mean": 2.1,
    "start_date": datetime.date(2023, 5, 1),
    "end_date": datetime.date(2023, 5, 4),
    "file_rows_target": 150,
}
generator = create_generator("ecommerce", **generator_settings)

## Write Iceberg table

A SQLite-backed SQL catalog tracks table metadata while data files land in MinIO under `s3://demo/iceberg`.

In [None]:
# Connection details for the MinIO bucket that receives the data files.
s3_config = S3Config(
    uri=f"s3://{bucket}",
    key=access_key,
    secret=secret_key,
    endpoint_url=endpoint,
    region="us-east-1",
    use_ssl=False,
)

# Catalog configuration pointing at the local SQLite file.
catalog = CatalogConfig(
    kind="sqlite",
    uri=f"sqlite:///{catalog_db}",
    namespace="demo",
)

# Build the Iceberg writer targeting s3://<bucket>/<prefix>, then write the dataset.
writer_options = WriterOptions(file_rows_target=150)
writer = create_writer(
    "iceberg",
    output_uri=f"s3://{bucket}/{prefix}",
    s3=s3_config,
    catalog=catalog,
    options=writer_options,
)
write_dataset(generator, writer)

## Inspect table metadata

Use PyIceberg to load the `demo.orders` table from the SQLite catalog and preview its first five rows.

In [None]:
from pyiceberg.catalog.sql import SqlCatalog

# Re-open the SQLite catalog with PyIceberg directly.
# NOTE: this rebinds `catalog`, replacing the CatalogConfig created in the write cell.
catalog = SqlCatalog(
    "demo",
    **{
        "uri": f"sqlite:///{catalog_db}",
        "warehouse": f"s3://{bucket}/{prefix}",
        # PyIceberg's S3 FileIO must be pointed at the local MinIO endpoint and
        # given its credentials; otherwise reads of s3:// paths are not directed
        # at MinIO.
        "s3.endpoint": endpoint,
        "s3.access-key-id": access_key,
        "s3.secret-access-key": secret_key,
        "s3.region": "us-east-1",
    },
)
table = catalog.load_table("demo.orders")

# Preview the first rows. A scan exposes `to_pandas()` directly; there is no
# `to_arrow_table()` method, and a table is not an iterator for `next()`.
table.scan(limit=5).to_pandas()