In [1]:
from pyiceberg.catalog.rest import RestCatalog
import polars as pl
import s3fs
import trino
import json
import gzip
from fsspec import AbstractFileSystem
from pyiceberg.table import Table
from typing import Any

In [2]:
# Names applied, in order, to the columns of the headerless CSV loaded below.
columns = [
    "transaction_id",
    "price",
    "date_of_transfer",
    "postcode",
    "property_type",
    "new_property",
    "duration",
    "paon",
    "saon",
    "street",
    "locality",
    "town",
    "district",
    "county",
    "ppd_category_type",
    "record_status",
]

In [3]:
# The file has no header row, so the names in `columns` are applied positionally.
house_prices = (
    pl.scan_csv(
        "data/pp-2024.csv",
        has_header=False,
        new_columns=columns,
        try_parse_dates=True,
    )
    .collect()  # materialise the lazy scan into an in-memory DataFrame
)

In [10]:
# Preview five randomly chosen rows.
house_prices.sample(n=5)

transaction_id,price,date_of_transfer,postcode,property_type,new_property,duration,paon,saon,street,locality,town,district,county,ppd_category_type,record_status
str,i64,datetime[μs],str,str,str,str,str,str,str,str,str,str,str,str,str
"""{2D4D7609-30B7-BDF9-E063-4804A…",335000,2024-07-10 00:00:00,"""B15 3BA""","""F""","""N""","""L""","""CLAREMONT VIEW, 11""","""FLAT 17""","""CLAREMONT GARDENS""","""EDGBASTON""","""BIRMINGHAM""","""BIRMINGHAM""","""WEST MIDLANDS""","""B""","""A"""
"""{2ACACE8C-A6DA-295E-E063-4804A…",300000,2024-11-18 00:00:00,"""PL26 8TF""","""S""","""N""","""F""","""6""","""""","""TELEPHONE LANE""","""STENALEES""","""ST AUSTELL""","""CORNWALL""","""CORNWALL""","""A""","""A"""
"""{1EAE3DF6-CA1C-9EB1-E063-4704A…",525000,2024-03-28 00:00:00,"""EN4 8UU""","""S""","""N""","""F""","""53""","""""","""JACKSON ROAD""","""""","""BARNET""","""BARNET""","""GREATER LONDON""","""A""","""A"""
"""{2F7F2B43-4CB4-E08F-E063-4804A…",165000,2024-04-26 00:00:00,"""NN4 5DX""","""F""","""N""","""L""","""19""","""""","""DAINTY GROVE""","""GRANGE PARK""","""NORTHAMPTON""","""WEST NORTHAMPTONSHIRE""","""WEST NORTHAMPTONSHIRE""","""A""","""A"""
"""{2859C1AC-819B-52B4-E063-4804A…",540000,2024-06-27 00:00:00,"""KT17 2EW""","""S""","""N""","""F""","""28""","""""","""SHORTCROFT ROAD""","""""","""EPSOM""","""EPSOM AND EWELL""","""SURREY""","""A""","""A"""


In [21]:
# Client for the Iceberg REST catalog served at the Lakekeeper endpoint,
# bound to the "store" warehouse.
catalog_properties = {
    "warehouse": "store",
    "uri": "http://lakekeeper:8181/catalog",
}
catalog = RestCatalog("lakekeeper", **catalog_properties)

In [22]:
# Make sure both namespaces exist; safe to re-run because existing ones are left alone.
for namespace in ("house_prices", "stocks"):
    catalog.create_namespace_if_not_exists(namespace)

In [23]:
# Import each name from the module that defines it: Schema lives in
# pyiceberg.schema, the field and type classes live in pyiceberg.types.
from pyiceberg.schema import Schema
from pyiceberg.types import DateType, IntegerType, NestedField, StringType

# Iceberg schema for the raw house price table.
# Fields 1-7 must always be present; fields 8-15 are optional, and say so
# explicitly rather than relying on the library's default for `required`.
# `transaction_id` (field 1) is declared as the row identifier.
# NOTE(review): the CSV column list has a 16th name, `record_status`, that has
# no field here — confirm it is dropped before rows are written to this table.
house_prices_schema = Schema(
    NestedField(1, "transaction_id", StringType(), required=True),
    NestedField(2, "price", IntegerType(), required=True),
    NestedField(3, "date_of_transfer", DateType(), required=True),
    NestedField(4, "postcode", StringType(), required=True),
    NestedField(5, "property_type", StringType(), required=True),
    NestedField(6, "new_property", StringType(), required=True),
    NestedField(7, "duration", StringType(), required=True),
    NestedField(8, "paon", StringType(), required=False),
    NestedField(9, "saon", StringType(), required=False),
    NestedField(10, "street", StringType(), required=False),
    NestedField(11, "locality", StringType(), required=False),
    NestedField(12, "town", StringType(), required=False),
    NestedField(13, "district", StringType(), required=False),
    NestedField(14, "county", StringType(), required=False),
    NestedField(15, "ppd_category_type", StringType(), required=False),
    identifier_field_ids=[1],
)

In [24]:
# Create the table only if it is not already in the catalog, so re-running this
# cell (for example Restart & Run All against a catalog that already holds the
# table) hands back the existing table instead of failing.
raw_house_prices = catalog.create_table_if_not_exists(
    "house_prices.raw",
    schema=house_prices_schema,
    location="s3://warehouse/house_prices/raw",
)

# Metadata is king
Now that we've created a table for our house prices, let's take a look at the metadata that was created along with it. In Iceberg, all the metadata is stored as a combination of JSON and Avro files, and those files live directly in the S3 bucket, which is what makes them accessible from the various query engines.

Let's have a look at the different files that were created out of the box. First, we need something that can talk to S3 (in this case our MinIO instance): enter fsspec and s3fs.

In [25]:
# fsspec-compatible filesystem pointed at the MinIO endpoint.
# NOTE(review): credentials are inline — presumably throwaway values for the
# local demo stack; confirm before reusing this cell anywhere else.
fs = s3fs.S3FileSystem(
    endpoint_url="http://minio:9000",
    key="minio",
    secret="minio1234",
)

Now that we have something that can read our S3 bucket in MinIO, we need to know where our Iceberg catalog put the most recent table metadata. PyIceberg exposes that as the table's `metadata_location`:

In [27]:
# Display the path of the table's current metadata file.
raw_house_prices.metadata_location

's3://warehouse/house_prices/raw/metadata/00000-019630c4-9971-75f1-a653-900d831e740f.gz.metadata.json'

That's a gzipped JSON file — a choice our Iceberg REST catalog has made for us — so we need to do a little extra work to read the metadata:

In [32]:
def get_iceberg_metadata(fs: AbstractFileSystem, table: Table) -> dict[str, Any]:
    """Read and parse the current metadata file of an Iceberg table.

    The file at ``table.metadata_location`` is read through ``fs``. Gzip
    compression is detected from the file's leading magic bytes, so both
    gzipped and uncompressed metadata files are handled.

    Args:
        fs: Filesystem used to open the metadata file.
        table: Table whose ``metadata_location`` points at the file to read.

    Returns:
        The metadata document parsed from JSON.

    Raises:
        json.JSONDecodeError: If the (decompressed) content is not valid JSON.
    """
    with fs.open(table.metadata_location, "rb") as metadata_file:
        payload = metadata_file.read()

    # Every gzip stream starts with the two bytes 0x1f 0x8b; anything else is
    # treated as plain JSON.
    if payload[:2] == b"\x1f\x8b":
        payload = gzip.decompress(payload)

    return json.loads(payload)

In [33]:
# Show the parsed metadata document of the raw house price table.
get_iceberg_metadata(fs=fs, table=raw_house_prices)

{'format-version': 2,
 'table-uuid': '019630c4-9970-7c32-b5e5-cc4f7398de20',
 'location': 's3://warehouse/house_prices/raw',
 'last-sequence-number': 0,
 'last-updated-ms': 1744574912881,
 'last-column-id': 15,
 'schemas': [{'schema-id': 0,
   'identifier-field-ids': [1],
   'type': 'struct',
   'fields': [{'id': 1,
     'name': 'transaction_id',
     'required': True,
     'type': 'string'},
    {'id': 2, 'name': 'price', 'required': True, 'type': 'int'},
    {'id': 3, 'name': 'date_of_transfer', 'required': True, 'type': 'date'},
    {'id': 4, 'name': 'postcode', 'required': True, 'type': 'string'},
    {'id': 5, 'name': 'property_type', 'required': True, 'type': 'string'},
    {'id': 6, 'name': 'new_property', 'required': True, 'type': 'string'},
    {'id': 7, 'name': 'duration', 'required': True, 'type': 'string'},
    {'id': 8, 'name': 'paon', 'required': False, 'type': 'string'},
    {'id': 9, 'name': 'saon', 'required': False, 'type': 'string'},
    {'id': 10, 'name': 'street', 

In [None]:
import avro

In [53]:
# NOTE(review): `c` is not assigned in any visible cell — presumably a table
# metadata dict like the one get_iceberg_metadata returns; confirm and define it.
# Read the manifest list referenced by the first snapshot.
manifest_list_path = c["snapshots"][0]["manifest-list"]
with fs.open(manifest_list_path) as manifest_list_file:
    av = pl.read_avro(manifest_list_file)

In [61]:
# Follow the manifest list to the manifest file itself. `.item()` is used to
# pull out a single path, so this presumably expects exactly one manifest entry.
manifest_path = av["manifest_path"].item()
with fs.open(manifest_path) as manifest_file:
    manifest = pl.read_avro(manifest_file)

In [62]:
# Display the manifest entries read from the Avro file.
manifest

status,snapshot_id,sequence_number,file_sequence_number,data_file
i32,i64,i64,i64,struct[16]
1,3473961880491101975,,,"{0,""s3://warehouse/test/data/00000-0-ba541152-c3fd-4c2f-8bcd-2c658fc02b4a.parquet"",""PARQUET"",{},3,975,[{1,123}, {2,123}],[{1,3}, {2,3}],[{1,0}, {2,0}],[],[{1,b""\x01\x00\x00\x00\x00\x00\x00\x00""}, {2,b""\x03\x00\x00\x00\x00\x00\x00\x00""}],[{1,b""\x03\x00\x00\x00\x00\x00\x00\x00""}, {2,b""\x05\x00\x00\x00\x00\x00\x00\x00""}],null,[4],null,null}"


In [63]:
# Look at the nested `data_file` struct column on its own.
manifest.get_column("data_file")

data_file
struct[16]
"{0,""s3://warehouse/test/data/00000-0-ba541152-c3fd-4c2f-8bcd-2c658fc02b4a.parquet"",""PARQUET"",{},3,975,[{1,123}, {2,123}],[{1,3}, {2,3}],[{1,0}, {2,0}],[],[{1,b""\x01\x00\x00\x00\x00\x00\x00\x00""}, {2,b""\x03\x00\x00\x00\x00\x00\x00\x00""}],[{1,b""\x03\x00\x00\x00\x00\x00\x00\x00""}, {2,b""\x05\x00\x00\x00\x00\x00\x00\x00""}],null,[4],null,null}"


In [74]:
# Flatten the `data_file` struct, then expand its `column_sizes` list of
# key/value structs into one row per entry with separate columns.
(
    manifest
    .unnest("data_file")
    .select(pl.col("column_sizes").list.explode())
    .unnest("column_sizes")
)

key,value
i32,i64
1,123
2,123


In [7]:
# Drop the `store.test` table from the catalog, requesting a purge as well.
catalog.drop_table("store.test", purge_requested=True)

In [1]:
from trino.dbapi import connect

# Open a DB-API connection to Trino with the `lakekeeper` catalog as default.
conn = connect(
    host="http://trino:8080",
    user="trino",
    catalog="lakekeeper",
)
cur = conn.cursor()

# Ask Trino to describe the columns of the test table.
cur.execute("DESCRIBE lakekeeper.store.test").fetchall()

[['a', 'bigint', '', ''], ['b', 'bigint', '', '']]

In [2]:
# Fetch every row of the test table through Trino.
cur.execute('select * from lakekeeper.store.test').fetchall()

[[1, 3], [2, 4], [3, 5]]