# Apache Iceberg Internals with PyIceberg (No Spark)
This notebook walks through the layers of an Iceberg table — Catalog, Metadata (JSON), Manifest List (Avro), Manifest (Avro), and Data (Parquet) — using **PyIceberg + Nessie + MinIO**.

## Requirements
```text
pyiceberg>=0.7.0
pyarrow>=14.0.0
fastavro>=1.9.0
fsspec>=2024.3.0
s3fs>=2024.3.0
jupyterlab>=4.0.0
```

In [None]:
import json
import os

import fsspec
import pyarrow as pa
import pyarrow.parquet as pq
from fastavro import reader
from pyiceberg.catalog import load_catalog
from pyiceberg.expressions import AlwaysTrue
from pyiceberg.partitioning import PartitionField, PartitionSpec
from pyiceberg.schema import Schema
from pyiceberg.transforms import IdentityTransform
from pyiceberg.types import LongType, NestedField, StringType

# NOTE(review): `write_table` / `read_table` do not appear to be exported by
# `pyiceberg.io.pyarrow` in released PyIceberg versions, so an unguarded import
# aborts the notebook on its very first cell. The import is kept (guarded) only
# for environments that might provide these names; writes should go through
# `Table.append` and reads through `Table.scan(...).to_arrow()`.
try:
    from pyiceberg.io.pyarrow import write_table, read_table
except ImportError:
    pass

## Connect to Nessie Catalog

In [None]:
# PyIceberg has no native Nessie client: it reaches Nessie through the Iceberg
# REST catalog protocol. Nessie serves that protocol under `/iceberg`; the
# `/api/v1` path is Nessie's own API and cannot be used by a REST catalog client.
# NOTE(review): with Nessie's REST endpoint, `warehouse` is usually the warehouse
# *name* configured on the Nessie server — confirm against your deployment.
catalog = load_catalog(
    "nessie",
    **{
        "type": "rest",
        "uri": "http://nessie:19120/iceberg",
        "warehouse": "s3://warehouse",
        "s3.endpoint": "http://minio:9000",
        # Credentials come from the environment when set; the defaults are the
        # stock MinIO development credentials so the demo still runs as-is.
        "s3.access-key-id": os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
        "s3.secret-access-key": os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
        "s3.path-style-access": "true",
    }
)

catalog.list_namespaces()

## Create Namespace and Table

In [None]:
catalog.create_namespace_if_not_exists("demo")

# Schema fields must be NestedField instances (plain tuples are not accepted).
# All three columns are optional (required=False), which matches the nullable
# columns PyArrow infers from Python dicts in the write cell.
schema = Schema(
    NestedField(field_id=1, name="trade_id", field_type=LongType(), required=False),
    NestedField(field_id=2, name="symbol", field_type=StringType(), required=False),
    NestedField(field_id=3, name="business_date", field_type=StringType(), required=False),
)

# Identity partition on `business_date`. PyIceberg has no Java-style
# `PartitionSpec.builder_for(...)`; a spec is built from PartitionField objects.
# `source_id` refers to the schema field id (3); partition field ids
# conventionally start at 1000.
spec = PartitionSpec(
    PartitionField(
        source_id=3,
        field_id=1000,
        transform=IdentityTransform(),
        name="business_date",
    )
)

# `*_if_not_exists` keeps this cell idempotent, so Restart & Run All does not
# fail with "table already exists" on the second run.
catalog.create_table_if_not_exists(
    identifier="demo.trades",
    schema=schema,
    partition_spec=spec,
)

## Write Data (PyArrow, No Spark)

In [None]:
# Three sample trades spanning two partition values of `business_date`.
arrow_table = pa.Table.from_pylist([
    {"trade_id": 1, "symbol": "AAPL", "business_date": "2025-12-01"},
    {"trade_id": 2, "symbol": "GOOG", "business_date": "2025-12-01"},
    {"trade_id": 3, "symbol": "MSFT", "business_date": "2025-12-02"}
])

table = catalog.load_table("demo.trades")

# `Table.append` is PyIceberg's write API: it writes the Parquet data files and
# commits a new snapshot. Note that re-running this cell appends the rows again.
table.append(arrow_table)

## Inspect Metadata JSON

In [None]:
# Raw S3 (MinIO) filesystem used to read Iceberg's metadata files directly,
# bypassing the Iceberg API, so the on-disk layout can be inspected.
fs = fsspec.filesystem(
    "s3",
    client_kwargs={"endpoint_url": "http://minio:9000"},
    # Credentials come from the environment when set; the defaults are the
    # stock MinIO development credentials so the demo still runs as-is.
    key=os.environ.get("MINIO_ACCESS_KEY", "minioadmin"),
    secret=os.environ.get("MINIO_SECRET_KEY", "minioadmin"),
)

# Re-sync with the catalog so `metadata_location` points at the metadata file
# of the latest commit rather than a possibly stale one.
table.refresh()

with fs.open(table.metadata_location, "rb") as f:
    metadata = json.load(f)

metadata.keys()

## Read Manifest List (Avro)

In [None]:
# Resolve the *current* snapshot through `current-snapshot-id`. `snapshots[0]`
# is merely the oldest snapshot and is wrong once the table has more than one
# commit (for example after re-running the write cell).
current_snapshot_id = metadata.get("current-snapshot-id")
snapshot = next(
    (
        candidate
        for candidate in metadata.get("snapshots", [])
        if candidate["snapshot-id"] == current_snapshot_id
    ),
    None,
)
if snapshot is None:
    raise ValueError(
        "Table has no current snapshot; write data before inspecting the manifest list."
    )

# The manifest list is an Avro file with one record per manifest file.
with fs.open(snapshot["manifest-list"], "rb") as f:
    manifest_list = list(reader(f))

manifest_list

## Inspect Manifest File (Avro – metadata only)

In [None]:
# Iceberg manifest files are Avro (only the *data* files are Parquet), so they
# are read with fastavro; a Parquet reader fails on them.
manifest_path = manifest_list[0]["manifest_path"]
with fs.open(manifest_path, "rb") as f:
    avro_reader = reader(f)
    # Schema embedded in the Avro file header.
    manifest_schema = avro_reader.writer_schema
    # One record per data file tracked by this manifest.
    manifest_entries = list(avro_reader)

manifest_schema

## Iceberg Scan Planning API

In [None]:
# Plan a full-table scan and list the data files it would read, with each
# file's path, partition value and row count.
scan = table.scan()
for file_task in scan.plan_files():
    data_file = file_task.file
    print(
        data_file.file_path,
        data_file.partition,
        data_file.record_count,
    )

## Read Data via Iceberg API (Correct Way)

In [None]:
# Reads go through the table scan API: `scan(...)` plans the files honouring
# snapshot and filter, and `to_arrow()` materialises them as a PyArrow table.
result = table.scan(row_filter=AlwaysTrue()).to_arrow()
result.to_pandas()