### MinIO explained

In [1]:
import boto3
import os

# Credentials and region are injected by docker-compose as environment variables.
# Indexing os.environ (rather than .get) makes a missing variable fail right here
# with a KeyError instead of surfacing later as an opaque authentication error.
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]  # "admin"
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]  # "password"
aws_region = os.environ["AWS_REGION"]  # us-east-1

# A regular boto3 S3 client (nothing is mocked): endpoint_url simply redirects it
# from AWS to the local MinIO service, which speaks the S3 API.
s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:9000",  # Local MinIO service
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region,
)

# Create the demo bucket. If an earlier run stopped before the cleanup cell, the
# bucket is still there; tolerate that so the cell can simply be re-run.
try:
    s3.create_bucket(Bucket="poc")
except s3.exceptions.BucketAlreadyOwnedByYou:
    pass  # bucket left over from a previous run; reuse it

# Upload a small object ("blob"); the response is displayed as the cell output.
s3.put_object(Bucket="poc", Key="demo.txt", Body=b"Hello, Iceberg!")

{'ResponseMetadata': {'RequestId': '184A2C598E883D20',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'content-length': '0',
   'etag': '"3af42309382afb590d9143564b4bb8b8"',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-checksum-crc32': 'fW13PQ==',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '184A2C598E883D20',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '758',
   'x-ratelimit-remaining': '758',
   'x-xss-protection': '1; mode=block',
   'date': 'Wed, 18 Jun 2025 15:22:48 GMT'},
  'RetryAttempts': 0},
 'ETag': '"3af42309382afb590d9143564b4bb8b8"',
 'ChecksumCRC32': 'fW13PQ=='}

In [2]:
# Enumerate every object stored in the "poc" bucket and print its key.
# A bucket without objects has no "Contents" entry, hence the empty-list default.
response = s3.list_objects_v2(Bucket="poc")
object_keys = (entry["Key"] for entry in response.get("Contents", []))
for key in object_keys:
    print(f"Found object: {key}")

Found object: demo.txt


In [3]:
# Download the object and show its content as text.
response = s3.get_object(Bucket="poc", Key="demo.txt")
payload = response["Body"].read()  # raw bytes read from the response body
print(payload.decode())

Hello, Iceberg!


In [4]:
# Clean up the demo bucket and its single object so the MinIO walkthrough leaves
# nothing behind.

# Delete the object first (S3-compatible stores normally refuse to delete a
# bucket that still contains objects).
s3.delete_object(Bucket="poc", Key="demo.txt")

# Delete the now-empty bucket; the response is displayed as the cell output.
s3.delete_bucket(Bucket="poc")

{'ResponseMetadata': {'RequestId': '184A2C5CF7B583B5',
  'HostId': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
  'HTTPStatusCode': 204,
  'HTTPHeaders': {'accept-ranges': 'bytes',
   'server': 'MinIO',
   'strict-transport-security': 'max-age=31536000; includeSubDomains',
   'vary': 'Origin, Accept-Encoding',
   'x-amz-id-2': 'dd9025bab4ad464b049177c95eb6ebf374d3b3fd1af9251148b658df7ac2e3e8',
   'x-amz-request-id': '184A2C5CF7B583B5',
   'x-content-type-options': 'nosniff',
   'x-ratelimit-limit': '758',
   'x-ratelimit-remaining': '758',
   'x-xss-protection': '1; mode=block',
   'date': 'Wed, 18 Jun 2025 15:23:02 GMT'},
  'RetryAttempts': 0}}

### Iceberg time!

In [5]:
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError

In [6]:
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError

# Settings for the Iceberg REST catalog: where the catalog service listens, plus
# the MinIO endpoint and credentials passed along as the "s3.*" properties.
rest_catalog_conf = {
    "uri": "http://rest:8181/",
    "s3.endpoint": "http://minio:9000",
    "s3.access-key": aws_access_key_id,
    "s3.secret-key": aws_secret_access_key,
}

catalog = load_rest(name="rest", conf=rest_catalog_conf)

In [7]:
from pyiceberg.exceptions import NamespaceAlreadyExistsError

# Namespace that groups the rideshare tables created in the following cells.
namespace = "rideshare"

# Create the namespace; a NamespaceAlreadyExistsError from a previous run is
# deliberately ignored so the cell can be executed repeatedly.
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError:
    pass  # It's fine if it already exists

In [15]:
# Ask the catalog for its namespaces and print them to confirm the one created
# above is registered.
namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

Namespaces: [('rideshare',)]


In [35]:
# First, imports
from pyiceberg.catalog import load_catalog
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.table import TableIdentifier
from pyiceberg.types import UUIDType, StringType, TimestampType, DoubleType, NestedField, BooleanType, IntegerType, DecimalType

In [28]:
# Define the rides schema using NestedField.
#
# Iceberg requires all fields to have stable, explicit IDs. This is critical for
# schema evolution and tracking changes over time, which is why every column is
# declared through NestedField() with:
# - field_id: required, stable numeric ID
# - name: field name
# - field_type: Iceberg data type
# - required: whether the field is NOT NULL
rides_schema = Schema(
    NestedField(field_id=1, name="ride_id", field_type=UUIDType(), required=True),
    NestedField(field_id=2, name="driver_id", field_type=StringType(), required=False),
    NestedField(field_id=3, name="customer_id", field_type=StringType(), required=False),
    NestedField(field_id=4, name="pickup_time", field_type=TimestampType(), required=False),
    NestedField(field_id=5, name="dropoff_time", field_type=TimestampType(), required=False),
    NestedField(field_id=6, name="fare", field_type=DoubleType(), required=False),
    NestedField(field_id=7, name="pickup_location", field_type=StringType(), required=False),
    NestedField(field_id=8, name="dropoff_location", field_type=StringType(), required=False),
)

# Partition on pickup_time: source_id=4 points at the schema field above, while
# field_id=1000 is the ID of the partition field itself.
# NOTE(review): an identity transform on a timestamp presumably yields one
# partition per distinct value; a day/hour transform may be the better fit —
# confirm before loading real data.
rides_partition_spec = PartitionSpec(
    fields=[
        PartitionField(
            source_id=4,
            field_id=1000,
            transform="identity",
            name="pickup_time",
        )
    ]
)

# Create the table, or load it when a previous run already created it, so the
# cell survives Restart & Run All against a catalog that keeps its state.
rides_identifier = f"{namespace}.rides"
try:
    rides_table = catalog.create_table(
        identifier=rides_identifier,
        schema=rides_schema,
        partition_spec=rides_partition_spec,
    )
except TableAlreadyExistsError:
    rides_table = catalog.load_table(rides_identifier)

# Last expression: display the table (schema, partitioning, sort order, snapshot).
rides_table

rides(
  1: ride_id: required uuid,
  2: driver_id: optional string,
  3: customer_id: optional string,
  4: pickup_time: optional timestamp,
  5: dropoff_time: optional timestamp,
  6: fare: optional double,
  7: pickup_location: optional string,
  8: dropoff_location: optional string
),
partition by: [pickup_time],
sort order: [],
snapshot: null

In [34]:
# Schema of the drivers table. Each column carries an explicit, stable field_id;
# only driver_id is required (NOT NULL).
drivers_schema = Schema(
    NestedField(field_id=1, name="driver_id", field_type=StringType(), required=True),
    NestedField(field_id=2, name="full_name", field_type=StringType(), required=False),
    NestedField(field_id=3, name="city", field_type=StringType(), required=False),
    NestedField(field_id=4, name="active", field_type=BooleanType(), required=False),
    NestedField(field_id=5, name="rating", field_type=IntegerType(), required=False),
    NestedField(field_id=6, name="last_updated", field_type=TimestampType(), required=False),
)

# Partition by city: source_id=3 refers to the "city" schema field, field_id=1001
# is the ID of the partition field itself.
drivers_partition_spec = PartitionSpec(
    fields=[
        PartitionField(
            source_id=3,
            field_id=1001,
            transform="identity",
            name="city",
        )
    ]
)

# Create the table, or load it when a previous run already created it, so
# re-executing the cell does not fail with TableAlreadyExistsError.
drivers_identifier = f"{namespace}.drivers"
try:
    drivers_table = catalog.create_table(
        identifier=drivers_identifier,
        schema=drivers_schema,
        partition_spec=drivers_partition_spec,
    )
except TableAlreadyExistsError:
    drivers_table = catalog.load_table(drivers_identifier)

# Last expression: display the table definition as the cell output.
drivers_table

drivers(
  1: driver_id: required string,
  2: full_name: optional string,
  3: city: optional string,
  4: active: optional boolean,
  5: rating: optional int,
  6: last_updated: optional timestamp
),
partition by: [city],
sort order: [],
snapshot: null

In [36]:
# Schema of the payments table. Each column carries an explicit, stable field_id;
# only payment_id is required (NOT NULL).
# NOTE(review): ride_id is a string here while rides.ride_id is declared as a
# UUID — confirm whether the two are meant to be joinable as-is.
payments_schema = Schema(
    NestedField(field_id=1, name="payment_id", field_type=UUIDType(), required=True),
    NestedField(field_id=2, name="ride_id", field_type=StringType(), required=False),
    NestedField(field_id=3, name="customer_id", field_type=StringType(), required=False),
    NestedField(field_id=4, name="amount", field_type=DecimalType(precision=10, scale=2), required=False),
    NestedField(field_id=5, name="status", field_type=StringType(), required=False),  # e.g. "paid", "refunded"
    NestedField(field_id=6, name="timestamp", field_type=TimestampType(), required=False),
)

# Partition by "timestamp": source_id=6 refers to that schema field, field_id=1002
# is the ID of the partition field itself.
# NOTE(review): an identity transform on a timestamp presumably yields one
# partition per distinct value; a day/hour transform may be the better fit —
# confirm before loading real data.
payments_partition_spec = PartitionSpec(
    fields=[
        PartitionField(
            source_id=6,
            field_id=1002,
            transform="identity",
            name="timestamp",
        )
    ]
)

# Create the table, or load it when a previous run already created it, so
# re-executing the cell does not fail with TableAlreadyExistsError.
payments_identifier = f"{namespace}.payments"
try:
    payments_table = catalog.create_table(
        identifier=payments_identifier,
        schema=payments_schema,
        partition_spec=payments_partition_spec,
    )
except TableAlreadyExistsError:
    payments_table = catalog.load_table(payments_identifier)

# Last expression: display the table definition as the cell output.
payments_table

payments(
  1: payment_id: required uuid,
  2: ride_id: optional string,
  3: customer_id: optional string,
  4: amount: optional decimal(10, 2),
  5: status: optional string,
  6: timestamp: optional timestamp
),
partition by: [timestamp],
sort order: [],
snapshot: null

In [None]:
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError
import boto3

# Credentials come from the environment; a missing variable raises KeyError.
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]  # admin
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]  # password

# MinIO is reached through its container hostname; both the Iceberg catalog and
# the boto3 client below talk to this same endpoint.
minio_endpoint = "http://minio:9000"

# Iceberg REST catalog, handed the MinIO endpoint and credentials as "s3.*" properties.
rest_catalog_conf = {
    "uri": "http://rest:8181/",
    "s3.endpoint": minio_endpoint,
    "s3.access-key": aws_access_key_id,
    "s3.secret-key": aws_secret_access_key,
}
catalog = load_rest(name="rest", conf=rest_catalog_conf)

# boto3 S3 client for the same MinIO instance, used to inspect the stored files.
s3 = boto3.client(
    "s3",
    endpoint_url=minio_endpoint,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name="us-east-1",
)

# Create the namespace; an already existing one is accepted silently.
namespace = "poc_new"
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError:
    pass

namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

def list_blobs(bucket=None):
    """
    Print the object keys of one bucket, or of every bucket when none is given.

    For each bucket a header line is printed, followed by whatever
    `_print_bucket_objects` prints for it.

    Parameters:
        bucket (str, optional): Name of the bucket to list. When omitted (or
            falsy), the bucket names are taken from `s3.list_buckets()`.
    """
    if bucket:
        bucket_names = (bucket,)
    else:
        # Lazily pull the name out of each bucket entry returned by the client.
        bucket_names = (entry["Name"] for entry in s3.list_buckets()["Buckets"])

    for current in bucket_names:
        print(f"\nObjects in bucket: {current}")
        _print_bucket_objects(current)


def _print_bucket_objects(bucket_name):
    response = s3.list_objects_v2(Bucket=bucket_name)
    if "Contents" in response:
        for obj in response["Contents"]:
            print(f" - {obj['Key']}")
    else:
        print(" (Empty)")

# Storage contents before anything is written.
list_blobs()

# Three sample coordinates; the Arrow schema is inferred from the dictionaries.
coordinate_rows = [
    {"lat": 52.371807, "long": 4.896029},
    {"lat": 52.387386, "long": 4.646219},
    {"lat": 52.078663, "long": 4.288788},
]
df = pa.Table.from_pylist(coordinate_rows)
schema = df.schema

table_name = "coordinates"
table_identifier = f"{namespace}.{table_name}"

# Create the table from the Arrow schema; an existing table is accepted because
# it is loaded explicitly right below either way.
try:
    table = catalog.create_table(identifier=table_identifier, schema=schema)
except TableAlreadyExistsError:
    pass

table = catalog.load_table(table_identifier)
table.append(df)

# Read everything back as an Arrow table and show it.
result = table.scan().to_arrow()
print(result)

# Storage contents after the append.
list_blobs()

In [4]:
# boto3 S3 client pointed at the MinIO endpoint (a regular client, not a mock),
# authenticating with the key pair held in aws_access_key_id / aws_secret_access_key.
s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:9000",  # MinIO is addressed by its container name
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name="us-east-1"
)

In [5]:
# Namespace used for the coordinates proof of concept.
namespace = "poc_new"
# Create it; NamespaceAlreadyExistsError is deliberately ignored so the cell
# can be executed again when the namespace is already present.
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError as e:
    pass

In [6]:
# Ask the catalog for its namespaces and print them as a quick sanity check.
namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

Namespaces: [('poc_new',)]


In [7]:
def list_blobs(bucket=None):
    """
    Print the object keys of one bucket, or of every bucket when none is given.

    For each bucket a header line is printed, followed by whatever
    `_print_bucket_objects` prints for it.

    Parameters:
        bucket (str, optional): Name of the bucket to list. When omitted (or
            falsy), the bucket names are taken from `s3.list_buckets()`.
    """
    if bucket:
        bucket_names = (bucket,)
    else:
        # Lazily pull the name out of each bucket entry returned by the client.
        bucket_names = (entry["Name"] for entry in s3.list_buckets()["Buckets"])

    for current in bucket_names:
        print(f"\nObjects in bucket: {current}")
        _print_bucket_objects(current)


def _print_bucket_objects(bucket_name):
    response = s3.list_objects_v2(Bucket=bucket_name)
    if "Contents" in response:
        for obj in response["Contents"]:
            print(f" - {obj['Key']}")
    else:
        print(" (Empty)")


In [8]:
# Show what every bucket holds before any table data has been written.
list_blobs()


Objects in bucket: warehouse
 (Empty)


In [9]:
# Three sample coordinates; the Arrow schema is inferred from the dictionaries
# and reused as the schema of the Iceberg table.
coordinate_rows = [
    {"lat": 52.371807, "long": 4.896029},
    {"lat": 52.387386, "long": 4.646219},
    {"lat": 52.078663, "long": 4.288788},
]
df = pa.Table.from_pylist(coordinate_rows)
schema = df.schema

# Fully qualified identifier: "<namespace>.<table name>".
table_name = "coordinates"
table_identifier = ".".join((namespace, table_name))

In [10]:
# Register the table in the catalog using the Arrow schema. When it already
# exists nothing is assigned here; the table is loaded explicitly afterwards.
try:
    table = catalog.create_table(identifier=table_identifier, schema=schema)
except TableAlreadyExistsError:
    pass

In [11]:
# Load the table through the catalog (works whether it was just created or
# already existed) and append the Arrow rows to it.
# NOTE(review): the append runs on every execution of this cell, so re-running
# it presumably adds the same three rows again — confirm if that matters.
table = catalog.load_table(table_identifier)
table.append(df)

In [12]:
# Scan the whole table, materialise it as a pyarrow Table and print it to
# verify the appended rows can be read back.
result = table.scan().to_arrow()
print(result)

pyarrow.Table
lat: double
long: double
----
lat: [[52.371807,52.387386,52.078663]]
long: [[4.896029,4.646219,4.288788]]




In [13]:
# List the buckets again to see the files written for the table.
list_blobs()


Objects in bucket: warehouse
 - poc_new/coordinates/data/00000-0-977d2bf6-fc86-443a-bd63-5e7b06caffbd.parquet
 - poc_new/coordinates/metadata/00000-ab97f938-d7c6-4d14-8142-eb88f3da9569.metadata.json
 - poc_new/coordinates/metadata/00001-565668f6-fe20-4ee2-98f6-0fc10bba87c7.metadata.json
 - poc_new/coordinates/metadata/977d2bf6-fc86-443a-bd63-5e7b06caffbd-m0.avro
 - poc_new/coordinates/metadata/snap-1981489265837032690-0-977d2bf6-fc86-443a-bd63-5e7b06caffbd.avro
