### MinIO explained

In [None]:
import boto3
import os

# Credentials and region are read from environment variables
# (set through docker-compose; a missing variable raises KeyError here).
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]  # expected: "admin"
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]  # expected: "password"
aws_region = os.environ["AWS_REGION"]  # expected: us-east-1

# A regular boto3 S3 client, pointed at the local MinIO service instead of AWS.
s3 = boto3.client(
    service_name="s3",
    region_name=aws_region,
    endpoint_url="http://minio:9000",
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Create the demo bucket ...
s3.create_bucket(Bucket="poc")

# ... and upload a small object ("blob") into it.
s3.put_object(Body=b"Hello, Iceberg!", Key="demo.txt", Bucket="poc")

In [19]:
# List all the blobs in the "poc" bucket.
# The listing result must be bound to `response`, the name the loop below reads.
response = s3.list_objects_v2(Bucket="poc")

# The loop falls back to an empty list when the response carries no
# "Contents" entry, so an empty listing prints nothing instead of raising.
for obj in response.get("Contents", []):
    print(f"Found object: {obj['Key']}")

In [20]:
# Download the object and print its payload as text

response = s3.get_object(Key="demo.txt", Bucket="poc")
print(str(response["Body"].read(), "utf-8"))

Hello, Iceberg!


In [None]:
# Tear down the demo resources.
# Order matters: the object is removed first, then the (now empty) bucket.

s3.delete_object(Key="demo.txt", Bucket="poc")

s3.delete_bucket(Bucket="poc")

### Iceberg time!

In [34]:
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError

In [3]:
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError

# Connect to the Iceberg REST catalog; S3 access is directed at the local MinIO.
catalog = load_rest(
    name="rest",
    conf={
        # Endpoint of the REST catalog service. Without it the catalog has
        # nothing to connect to; this is the same address the other catalog
        # cells in this notebook use.
        "uri": "http://rest:8181/",
        "s3.endpoint": "http://minio:9000",
        "s3.access-key": aws_access_key_id,
        "s3.secret-key": aws_secret_access_key,
    },
)

In [51]:
from pyiceberg.catalog import load_catalog

# Build the catalog through the generic factory. The connection properties
# use dotted keys, so they have to be passed via ** unpacking of a dict.
catalog = load_catalog(
    "rest1sdas",
    **{
        "uri": "http://rest:8181/",
        "s3.endpoint": "http://minio:9000",
        "s3.access-key": aws_access_key_id,
        "s3.secret-key": aws_secret_access_key,
    },
)

In [49]:
from pyiceberg.catalog import load_catalog

# Connect to the Iceberg REST catalog; S3 access is directed at the local MinIO.
catalog = load_catalog(
    "rest1sdas",
    **{
        "uri": "http://rest:8181/",
        "s3.endpoint": "http://minio:9000",
        "s3.access-key": aws_access_key_id,
        "s3.secret-key": aws_secret_access_key,
    },
)

# Ensure the namespace exists. "Already exists" is ignored on purpose so the
# cell can be executed repeatedly.
namespace = "pocasdsxss_sssnew"
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError:
    pass

namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

# Three sample coordinate rows; the table schema is taken from this Arrow table.
df = pa.Table.from_pylist(
    [
        dict(lat=52.371807, long=4.896029),
        dict(lat=52.387386, long=4.646219),
        dict(lat=52.078663, long=4.288788),
    ]
)
schema = df.schema

table_name = "coordinates"
table_identifier = f"{namespace}.{table_name}"

# Create the table on the first run; on later runs it already exists and is
# simply loaded below.
try:
    table = catalog.create_table(schema=schema, identifier=table_identifier)
except TableAlreadyExistsError:
    pass

# Note: every execution appends the three sample rows again.
table = catalog.load_table(table_identifier)
table.append(df)

Namespaces: [('poc_new',), ('pocs_new',), ('pocss_new',), ('pocss_snew',), ('pocsxs_snew',), ('pocsxss_snew',), ('pocsxss_ssnew',), ('pocsxss_sssnew',), ('pocasdsxss_sssnew',)]


In [52]:
# Ensure the "pizza_shop" namespace exists, then show all namespaces.
namespace = "pizza_shop"
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError:
    # Already there from a previous run - nothing to do.
    pass

namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

Namespaces: [('poc_new',), ('pocasdsxss_sssnew',), ('pocs_new',), ('pocss_new',), ('pocss_snew',), ('pocsxs_snew',), ('pocsxss_snew',), ('pocsxss_ssnew',), ('pocsxss_sssnew',), ('pizza_shop',)]


In [None]:
import pyarrow as pa
from pyiceberg.catalog import load_rest
from pyiceberg.exceptions import NamespaceAlreadyExistsError, TableAlreadyExistsError
import boto3

# Credentials come from the environment (a missing variable raises KeyError).
aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]  # admin
aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]  # password

# Iceberg REST catalog; its S3 access is directed at the local MinIO.
catalog = load_rest(
    "rest",
    {
        "uri": "http://rest:8181/",
        "s3.endpoint": "http://minio:9000",
        "s3.access-key": aws_access_key_id,
        "s3.secret-key": aws_secret_access_key,
    },
)

# boto3 S3 client talking to MinIO with the same credentials; used by the
# blob-listing helpers defined further down in this cell.
s3 = boto3.client(
    service_name="s3",
    region_name="us-east-1",
    endpoint_url="http://minio:9000",  # container name of the MinIO service
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

# Ensure the namespace exists; "already exists" is ignored so the cell can be re-run.
namespace = "poc_new"
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError:
    pass

namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

def list_blobs(bucket=None):
    """
    Lists blobs (objects) in a specific S3 bucket or in all buckets.

    Uses the notebook-level boto3 client ``s3``. Output is printed; nothing
    is returned.

    Parameters:
        bucket (str, optional): Bucket name. If not provided, lists objects in all buckets.
    """
    if bucket:
        print(f"\nObjects in bucket: {bucket}")
        _print_bucket_objects(bucket)
    else:
        buckets = s3.list_buckets()["Buckets"]
        for b in buckets:
            bucket_name = b["Name"]
            print(f"\nObjects in bucket: {bucket_name}")
            _print_bucket_objects(bucket_name)


def _print_bucket_objects(bucket_name):
    """
    Prints the key of every object in ``bucket_name``, or " (Empty)" if there are none.

    A single ``list_objects_v2`` call returns at most one page of keys, so the
    listing is driven through a paginator to cover buckets of any size.
    """
    paginator = s3.get_paginator("list_objects_v2")
    found_any = False
    for page in paginator.paginate(Bucket=bucket_name):
        # A page without "Contents" carries no objects.
        for obj in page.get("Contents", []):
            found_any = True
            print(f" - {obj['Key']}")
    if not found_any:
        print(" (Empty)")

# Snapshot of the object store before anything is written.
list_blobs()

# Three sample coordinate rows; the table schema is taken from this Arrow table.
df = pa.Table.from_pylist(
    [
        dict(lat=52.371807, long=4.896029),
        dict(lat=52.387386, long=4.646219),
        dict(lat=52.078663, long=4.288788),
    ]
)
schema = df.schema

table_name = "coordinates"
table_identifier = f"{namespace}.{table_name}"

# Create the table on the first run; on later runs it already exists and is
# simply loaded below.
try:
    table = catalog.create_table(schema=schema, identifier=table_identifier)
except TableAlreadyExistsError:
    pass

# Note: every execution appends the three sample rows again.
table = catalog.load_table(table_identifier)
table.append(df)

# Read the table back and show its contents.
result = table.scan().to_arrow()
print(result)

# Snapshot of the object store after the write.
list_blobs()

In [4]:
# boto3 S3 client that talks to the local MinIO service using the
# credentials loaded earlier in the notebook.
s3 = boto3.client(
    service_name="s3",
    region_name="us-east-1",
    endpoint_url="http://minio:9000",  # container name of the MinIO service
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [58]:
# Ensure the "pizza_shop" namespace exists.
namespace = "pizza_shop"
try:
    catalog.create_namespace(namespace)
except NamespaceAlreadyExistsError:
    # Already there from a previous run - nothing to do.
    pass

In [59]:
# Fetch the namespaces currently known to the catalog and print them.
namespaces = catalog.list_namespaces()
print("Namespaces:", namespaces)

Namespaces: [('pizza_shop',)]


In [57]:
from pyiceberg.exceptions import NamespaceNotEmptyError, NoSuchNamespaceError

# Drop every table in every namespace, then the namespace itself.
namespaces = catalog.list_namespaces()
print("Found namespaces:", namespaces)

for ns in namespaces:
    try:
        tables = catalog.list_tables(namespace=ns)
        # The loop variable is deliberately not called `table_identifier`:
        # that name is a notebook-level variable read by the table-creation
        # cells, and reusing it here would silently overwrite it.
        for existing_table in tables:
            print(f"Dropping table: {existing_table}")
            catalog.drop_table(existing_table)

        # Now try dropping the namespace
        catalog.drop_namespace(ns)
        print(f"Dropped namespace: {ns}")
    except NoSuchNamespaceError:
        # The namespace disappeared between listing and dropping.
        print(f"Namespace {ns} no longer exists.")
    except NamespaceNotEmptyError:
        # Something is still registered under the namespace; leave it alone.
        print(f"Namespace {ns} is still not empty. Skipping.")

Found namespaces: [('poc_new',), ('pocasdsxss_sssnew',), ('pocs_new',), ('pocss_new',), ('pocss_snew',), ('pocsxs_snew',), ('pocsxss_snew',), ('pocsxss_ssnew',), ('pocsxss_sssnew',), ('pizza_shop',)]
Dropping table: ('poc_new', 'coordinates')
Dropped namespace: ('poc_new',)
Dropping table: ('pocasdsxss_sssnew', 'coordinates')
Dropped namespace: ('pocasdsxss_sssnew',)
Dropping table: ('pocs_new', 'coordinates')
Dropped namespace: ('pocs_new',)
Dropping table: ('pocss_new', 'coordinates')
Dropped namespace: ('pocss_new',)
Dropping table: ('pocss_snew', 'coordinates')
Dropped namespace: ('pocss_snew',)
Dropping table: ('pocsxs_snew', 'coordinates')
Dropped namespace: ('pocsxs_snew',)
Dropping table: ('pocsxss_snew', 'coordinates')
Dropped namespace: ('pocsxss_snew',)
Dropping table: ('pocsxss_ssnew', 'coordinates')
Dropped namespace: ('pocsxss_ssnew',)
Dropping table: ('pocsxss_sssnew', 'coordinates')
Dropped namespace: ('pocsxss_sssnew',)
Dropped namespace: ('pizza_shop',)


In [53]:
def delete_all_blobs():
    """
    Deletes all objects from all buckets in the current S3 (MinIO) instance.

    Uses the notebook-level boto3 client ``s3``. The buckets themselves are
    kept. Progress is printed; nothing is returned.
    """
    paginator = s3.get_paginator("list_objects_v2")
    for b in s3.list_buckets()["Buckets"]:
        bucket_name = b["Name"]
        print(f"\nDeleting objects in bucket: {bucket_name}")

        # A single list_objects_v2 call returns at most one page of keys, so
        # walk every page. Keys are collected before deleting so the listing
        # is not modified while it is being iterated.
        keys = [
            obj["Key"]
            for page in paginator.paginate(Bucket=bucket_name)
            for obj in page.get("Contents", [])
        ]

        if not keys:
            print(" (Bucket is already empty)")
            continue

        for key in keys:
            print(f" - Deleting: {key}")
            s3.delete_object(Bucket=bucket_name, Key=key)

In [54]:
# Remove every object from every bucket (the buckets themselves are kept).
delete_all_blobs()


Deleting objects in bucket: warehouse
 - Deleting: pocasdsxss_sssnew/coordinates/data/00000-0-32016842-8827-4804-9f12-ee7653d89d72.parquet
 - Deleting: pocasdsxss_sssnew/coordinates/metadata/00000-7de2ee01-77d3-45c7-98d4-0a730ccbc922.metadata.json
 - Deleting: pocasdsxss_sssnew/coordinates/metadata/00001-7b8f7ebb-598e-4b54-8c39-709a3da062d5.metadata.json
 - Deleting: pocasdsxss_sssnew/coordinates/metadata/32016842-8827-4804-9f12-ee7653d89d72-m0.avro
 - Deleting: pocasdsxss_sssnew/coordinates/metadata/snap-6110351927004336182-0-32016842-8827-4804-9f12-ee7653d89d72.avro
 - Deleting: pocs_new/coordinates/metadata/00000-64865882-0b68-4ea0-ba3c-4fd5b59cbbf1.metadata.json
 - Deleting: pocss_new/coordinates/metadata/00000-82739395-2e23-4341-a86c-49867c4500e9.metadata.json
 - Deleting: pocss_snew/coordinates/metadata/00000-bdce2a96-3147-4bf0-b45f-f55cda2d2778.metadata.json
 - Deleting: pocsxs_snew/coordinates/metadata/00000-d6dc602f-deab-473d-9d3c-c43ec13961f0.metadata.json
 - Deleting: pocsx

In [7]:
def list_blobs(bucket=None):
    """
    Lists blobs (objects) in a specific S3 bucket or in all buckets.

    Uses the notebook-level boto3 client ``s3``. Output is printed; nothing
    is returned.

    Parameters:
        bucket (str, optional): Bucket name. If not provided, lists objects in all buckets.
    """
    if bucket:
        print(f"\nObjects in bucket: {bucket}")
        _print_bucket_objects(bucket)
    else:
        buckets = s3.list_buckets()["Buckets"]
        for b in buckets:
            bucket_name = b["Name"]
            print(f"\nObjects in bucket: {bucket_name}")
            _print_bucket_objects(bucket_name)


def _print_bucket_objects(bucket_name):
    """
    Prints the key of every object in ``bucket_name``, or " (Empty)" if there are none.

    A single ``list_objects_v2`` call returns at most one page of keys, so the
    listing is driven through a paginator to cover buckets of any size.
    """
    paginator = s3.get_paginator("list_objects_v2")
    found_any = False
    for page in paginator.paginate(Bucket=bucket_name):
        # A page without "Contents" carries no objects.
        for obj in page.get("Contents", []):
            found_any = True
            print(f" - {obj['Key']}")
    if not found_any:
        print(" (Empty)")


In [8]:
# Show the contents of every bucket before any table data is written.
list_blobs()


Objects in bucket: warehouse
 (Empty)


In [9]:
# Three sample coordinate rows; the table schema is taken from this Arrow table.
df = pa.Table.from_pylist(
    [
        dict(lat=52.371807, long=4.896029),
        dict(lat=52.387386, long=4.646219),
        dict(lat=52.078663, long=4.288788),
    ]
)
schema = df.schema

# Fully qualified identifier: "<namespace>.<table name>".
table_name = "coordinates"
table_identifier = f"{namespace}.{table_name}"

In [10]:
# Create the table on the first run; if it already exists the error is
# ignored so the cell can be re-run.
try:
    table = catalog.create_table(schema=schema, identifier=table_identifier)
except TableAlreadyExistsError:
    pass

In [11]:
# Load the table from the catalog by its identifier, then append the sample rows.
# Note: every execution of this cell appends the rows again.
table = catalog.load_table(table_identifier)
table.append(df)

In [12]:
# Scan the whole table, materialise it as a PyArrow table and print it.
result = table.scan().to_arrow()
print(result)

pyarrow.Table
lat: double
long: double
----
lat: [[52.371807,52.387386,52.078663]]
long: [[4.896029,4.646219,4.288788]]




In [13]:
# Show the contents of every bucket again, now that the table has been written.
list_blobs()


Objects in bucket: warehouse
 - poc_new/coordinates/data/00000-0-977d2bf6-fc86-443a-bd63-5e7b06caffbd.parquet
 - poc_new/coordinates/metadata/00000-ab97f938-d7c6-4d14-8142-eb88f3da9569.metadata.json
 - poc_new/coordinates/metadata/00001-565668f6-fe20-4ee2-98f6-0fc10bba87c7.metadata.json
 - poc_new/coordinates/metadata/977d2bf6-fc86-443a-bd63-5e7b06caffbd-m0.avro
 - poc_new/coordinates/metadata/snap-1981489265837032690-0-977d2bf6-fc86-443a-bd63-5e7b06caffbd.avro
