# Apache Iceberg with Nessie Catalog

This notebook demonstrates how to work with Apache Iceberg tables using Nessie as the catalog.

## 1. Install Required Libraries

In [None]:
!pip install -q 'pyiceberg[s3fs,pyarrow]' pynessie pandas

## 2. Configure Nessie Catalog

In [None]:
import os

from pyiceberg.catalog import load_catalog

# Connection settings for the Nessie catalog and its S3-compatible object store.
# Every endpoint and credential can be overridden with an environment variable,
# so real secrets never have to be committed with the notebook; the fallback
# values are the ones this notebook previously hard-coded.
# NOTE(review): the default URI is Nessie's native /api/v2 path — confirm this
# deployment serves the catalog API that pyiceberg expects at that address.
catalog = load_catalog(
    "nessie",
    **{
        "uri": os.environ.get("NESSIE_URI", "http://nessie:19120/api/v2"),
        "warehouse": os.environ.get("ICEBERG_WAREHOUSE", "s3://warehouse/"),
        "s3.endpoint": os.environ.get("S3_ENDPOINT", "http://minio:9000"),
        "s3.access-key-id": os.environ.get("S3_ACCESS_KEY_ID", "admin"),
        "s3.secret-access-key": os.environ.get("S3_SECRET_ACCESS_KEY", "password123"),
        "s3.path-style-access": "true",
    }
)

# The check mark was previously stored as mis-decoded UTF-8 bytes.
print("✓ Connected to Nessie catalog")

## 3. List Namespaces and Tables

In [None]:
# List namespaces
namespaces = catalog.list_namespaces()
print("Namespaces:")
for ns in namespaces:
    print(f"  - {ns}")

# List tables in demo namespace
# (the membership test compares against a one-element tuple, so the namespaces
# returned by the catalog are expected to be tuples)
if ('demo',) in namespaces:
    tables = catalog.list_tables('demo')
    print("\nTables in demo namespace:")
    # Deliberately not named `table`: that name holds the loaded Iceberg table
    # in later cells, and re-running this cell must not overwrite it.
    for table_identifier in tables:
        print(f"  - {table_identifier}")

## 4. Query Iceberg Table

In [None]:
import pandas as pd

# Resolve the orders table through the catalog
table = catalog.load_table("demo.orders")

# Run an unfiltered scan and materialise the result as a pandas DataFrame
orders_scan = table.scan()
df = orders_scan.to_pandas()

order_count = len(df)
print(f"Total orders: {order_count}")
print("\nSample data:")

# Last expression of the cell: shows the first ten rows
df.head(10)

## 5. View Table Schema

In [None]:
# Fetch the schema object of the loaded table
schema = table.schema()

print("Table Schema:")
# One output line per field: its name followed by its type
for column in schema.fields:
    print(f"  - {column.name}: {column.field_type}")

## 6. Table Statistics

In [None]:
# Report the table's location and its current snapshot
table_location = table.location()
print(f"Table location: {table_location}")
current_snapshot = table.current_snapshot()
print(f"Current snapshot: {current_snapshot}")

# Print each table property as an indented "key: value" line
print("\nTable properties:")
for prop_name, prop_value in table.properties.items():
    print(f"  {prop_name}: {prop_value}")

## 7. Data Analysis with Iceberg

In [None]:
# Headline figures computed from the orders DataFrame
print("Order Statistics:")
print(f"Total Orders: {len(df)}")
order_amounts = df['total_amount']
print(f"Total Revenue: ${order_amounts.sum():.2f}")
print(f"Average Order Value: ${order_amounts.mean():.2f}")

# Sum total_amount per product, sort descending, and keep the first five
print("\nTop 5 Products by Revenue:")
revenue_by_product = df.groupby('product_name')['total_amount'].sum()
top_products = revenue_by_product.sort_values(ascending=False).head(5)
print(top_products)

## 8. Filter Data Using Iceberg Predicates

In [None]:
# Scan the table with a row filter on status, then convert the result to pandas
delivered_scan = table.scan(row_filter="status = 'delivered'")
delivered_orders = delivered_scan.to_pandas()

# Row count and summed total_amount of the filtered result
print(f"Delivered orders: {len(delivered_orders)}")
delivered_amounts = delivered_orders['total_amount']
print(f"Delivered revenue: ${delivered_amounts.sum():.2f}")

## 9. Working with Nessie Branches (Git-like versioning)

In [None]:
from pynessie import init

# Connect to Nessie.
# The first positional parameter of pynessie's init() is a configuration
# directory, not a server URL, so the address must be passed as the "endpoint"
# setting via config_dict; otherwise the client keeps its default endpoint.
# NOTE(review): confirm the installed pynessie release works against /api/v2.
nessie = init(config_dict={"endpoint": "http://nessie:19120/api/v2"})

# List branches
# NOTE(review): list_references() presumably returns every reference (tags as
# well as branches) — verify if only branches should be shown.
branches = nessie.list_references()
print("Nessie branches:")
for reference in branches.references:
    print(f"  - {reference.name}")