### MinIO explained

In [None]:
import boto3
import os

# Credentials and region are read from the environment (set by docker-compose).
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]     # "admin"
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]  # "password"
aws_region = os.environ["AWS_REGION"] # us-east-1

# A regular boto3 S3 client; only the endpoint is overridden so that requests
# go to the local MinIO service.
s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:9000",  # Local MinIO service
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name=aws_region
)

# Create a bucket. The bucket may still exist from an earlier (or interrupted)
# run; treat "already owned by you" as success so this cell can be re-run.
try:
    s3.create_bucket(Bucket="poc")
except s3.exceptions.BucketAlreadyOwnedByYou:
    pass

# Create a blob (upload a file)
s3.put_object(Bucket="poc", Key="demo.txt", Body=b"Hello, Iceberg!")

In [19]:
# Print the key of every object currently in the "poc" bucket.
# The response carries no "Contents" entry when there is nothing to list,
# so fall back to an empty tuple and print nothing in that case.
response = s3.list_objects_v2(Bucket="poc")
for obj in response.get("Contents", ()):
    print(f"Found object: {obj['Key']}")

In [20]:
# Fetch the demo object and show its content as text.
response = s3.get_object(Bucket="poc", Key="demo.txt")
body_bytes = response["Body"].read()  # "Body" is a stream; read() drains it
print(body_bytes.decode())

Hello, Iceberg!


In [None]:
# Tear down what the MinIO section created:
# first the demo object, then the bucket that held it.
s3.delete_object(Key="demo.txt", Bucket="poc")
s3.delete_bucket(Bucket="poc")

### Iceberg time!

In [1]:
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError

In [3]:
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError

# Settings handed to the REST catalog: the catalog URI plus the S3 endpoint
# and keys (here the MinIO service and the credentials read earlier).
rest_catalog_conf = {
    "uri": "http://rest:8181/",
    "s3.endpoint": "http://minio:9000",
    "s3.access-key": aws_access_key_id,
    "s3.secret-key": aws_secret_access_key,
}

catalog = load_rest(name="rest", conf=rest_catalog_conf)

In [None]:
# `os` is imported here as well: this cell reads os.environ and otherwise
# brings in everything it uses, so it should not rely on an earlier cell.
import os

import boto3
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError

# Credentials come from the environment (set by docker-compose).
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"] # admin
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"] # password

# Iceberg REST catalog, configured with the S3 (MinIO) endpoint and keys.
catalog = load_rest(
    name="rest",
    conf = {
        "uri": "http://rest:8181/",
        "s3.endpoint": "http://minio:9000",
        "s3.access-key": aws_access_key_id,
        "s3.secret-key": aws_secret_access_key
    }
)

# An S3 client pointed at MinIO, using the same credentials as the catalog.
s3 = boto3.client(
    "s3",
    endpoint_url="http://minio:9000",  # the host is the container name
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
    region_name="us-east-1"
)

# Create the namespace; an already existing one is fine, which keeps the cell
# re-runnable.
namespace = "poc_new"
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError:
    pass

namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

def list_blobs(bucket=None):
    """
    Print the objects of one bucket, or of every bucket the client can list.

    Parameters:
        bucket (str, optional): Name of the bucket to show. When omitted (or
            otherwise falsy), every bucket returned by ``s3.list_buckets()``
            is shown in turn.
    """
    if bucket:
        targets = [bucket]
    else:
        targets = [entry["Name"] for entry in s3.list_buckets()["Buckets"]]

    # Same header and listing for each bucket, whichever way it was selected.
    for target in targets:
        print(f"\nObjects in bucket: {target}")
        _print_bucket_objects(target)


def _print_bucket_objects(bucket_name):
    """
    Print the key of every object in ``bucket_name``, or " (Empty)" if none.

    A single ``list_objects_v2`` call returns only one page of results, so the
    listing is driven through a paginator to cover buckets of any size.

    Parameters:
        bucket_name (str): Name of the bucket whose objects are printed.
    """
    paginator = s3.get_paginator("list_objects_v2")
    found_any = False
    for page in paginator.paginate(Bucket=bucket_name):
        # A page has no "Contents" entry when it holds no objects.
        for obj in page.get("Contents", []):
            found_any = True
            print(f" - {obj['Key']}")
    if not found_any:
        print(" (Empty)")

# Storage contents before anything is written.
list_blobs()

# Three sample coordinates as a pyarrow table; its schema doubles as the
# Iceberg table schema below.
df = pa.Table.from_pylist(
    [
        {"lat": lat, "long": long}
        for lat, long in (
            (52.371807, 4.896029),
            (52.387386, 4.646219),
            (52.078663, 4.288788),
        )
    ]
)
schema = df.schema

table_name = "coordinates"
table_identifier = f"{namespace}.{table_name}"

# Create the table; if it is already there, carry on and load it below.
try:
    table = catalog.create_table(identifier=table_identifier, schema=schema)
except TableAlreadyExistsError:
    pass

# NOTE(review): the append runs on every execution, so re-running this cell
# presumably adds the same three rows again - confirm that is intended.
table = catalog.load_table(table_identifier)
table.append(df)

# Read everything back as a pyarrow table.
result = table.scan().to_arrow()
print(result)

# Storage contents after the write.
list_blobs()

In [4]:
# An S3 client that talks to MinIO with the credentials read earlier.
# The endpoint host is the MinIO container name.
s3 = boto3.client(
    "s3",
    region_name="us-east-1",
    endpoint_url="http://minio:9000",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [5]:
namespace = "poc_new"

# Creating the namespace is best-effort: if it already exists there is
# nothing left to do, so that error is deliberately ignored.
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError:
    pass

In [6]:
# Ask the catalog for its namespaces and show them.
namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

Namespaces: [('poc_new',)]


In [7]:
def list_blobs(bucket=None):
    """
    Print the objects of one bucket, or of every bucket the client can list.

    Parameters:
        bucket (str, optional): Name of the bucket to show. When omitted (or
            otherwise falsy), every bucket returned by ``s3.list_buckets()``
            is shown in turn.
    """
    if bucket:
        targets = [bucket]
    else:
        targets = [entry["Name"] for entry in s3.list_buckets()["Buckets"]]

    # Same header and listing for each bucket, whichever way it was selected.
    for target in targets:
        print(f"\nObjects in bucket: {target}")
        _print_bucket_objects(target)


def _print_bucket_objects(bucket_name):
    """
    Print the key of every object in ``bucket_name``, or " (Empty)" if none.

    A single ``list_objects_v2`` call returns only one page of results, so the
    listing is driven through a paginator to cover buckets of any size.

    Parameters:
        bucket_name (str): Name of the bucket whose objects are printed.
    """
    paginator = s3.get_paginator("list_objects_v2")
    found_any = False
    for page in paginator.paginate(Bucket=bucket_name):
        # A page has no "Contents" entry when it holds no objects.
        for obj in page.get("Contents", []):
            found_any = True
            print(f" - {obj['Key']}")
    if not found_any:
        print(" (Empty)")


In [8]:
# No bucket argument: print the objects of every bucket.
list_blobs()


Objects in bucket: warehouse
 (Empty)


In [9]:
# Three sample coordinates as a pyarrow table; one dict per row.
df = pa.Table.from_pylist(
    [
        {"lat": lat, "long": long}
        for lat, long in (
            (52.371807, 4.896029),
            (52.387386, 4.646219),
            (52.078663, 4.288788),
        )
    ]
)
# The table's own schema is kept for creating the Iceberg table.
schema = df.schema

# Fully qualified identifier: "<namespace>.<table name>".
table_name = "coordinates"
table_identifier = f"{namespace}.{table_name}"

In [10]:
# Create the table, or pick up the existing one, so that `table` is bound
# after this cell whichever path was taken.
try:
    table = catalog.create_table(
        identifier=table_identifier,
        schema=schema,
    )
except TableAlreadyExistsError:
    # The table is left over from an earlier run: load it instead of leaving
    # `table` undefined (or pointing at a stale object).
    table = catalog.load_table(table_identifier)

In [11]:
# Load the table from the catalog and append the sample rows to it.
# NOTE(review): the append runs on every execution, so re-running this cell
# presumably adds the same rows again - confirm that is intended.
table = catalog.load_table(table_identifier)
table.append(df)

In [12]:
# Scan the table, materialise the result as a pyarrow table and print it.
result = table.scan().to_arrow()
print(result)

pyarrow.Table
lat: double
long: double
----
lat: [[52.371807,52.387386,52.078663]]
long: [[4.896029,4.646219,4.288788]]




In [13]:
# List every bucket again, now that the table has been written.
list_blobs()


Objects in bucket: warehouse
 - poc_new/coordinates/data/00000-0-977d2bf6-fc86-443a-bd63-5e7b06caffbd.parquet
 - poc_new/coordinates/metadata/00000-ab97f938-d7c6-4d14-8142-eb88f3da9569.metadata.json
 - poc_new/coordinates/metadata/00001-565668f6-fe20-4ee2-98f6-0fc10bba87c7.metadata.json
 - poc_new/coordinates/metadata/977d2bf6-fc86-443a-bd63-5e7b06caffbd-m0.avro
 - poc_new/coordinates/metadata/snap-1981489265837032690-0-977d2bf6-fc86-443a-bd63-5e7b06caffbd.avro
