In [1]:
import datetime as dt

import polars as pl
import sqlalchemy as sa
from IPython.display import JSON
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.types import TimestamptzType
from s3fs import S3FileSystem

# The functions we defined in the previous notebook are defined in utils.py
from utils import get_iceberg_manifest, get_iceberg_manifest_list, get_iceberg_metadata, read_house_prices

# Updating Metadata

We've added data to our tables and inspected how Iceberg keeps track of the data in the metadata files.

Usually when working with data in real life, we make decisions that we regret in six months' time.

Now that we've added some data, we've found out that we've made a mistake - we should have added a `_loaded_at` column to our data, so that we can differentiate downstream between the source timestamp and our loaded time

In [2]:
# Re-create the handles used in this notebook: the REST catalog, the raw
# house-prices table loaded from it, and an S3 filesystem for the object store
catalog = RestCatalog(
    "lakekeeper",
    uri="http://lakekeeper:8181/catalog",
    warehouse="lakehouse",
)
house_prices = catalog.load_table("house_prices.raw")
fs = S3FileSystem(
    endpoint_url="http://minio:9000",
    key="minio",
    secret="minio1234",
)

In [3]:
# A single UTC load time, stamped onto every row of the 2022 file
timestamp = dt.datetime.now(dt.UTC)
loaded_at_column = pl.lit(timestamp).alias("_loaded_at")
house_prices_2022 = read_house_prices("data/pp-2022.csv").with_columns(loaded_at_column)
house_prices_2022

transaction_id,price,date_of_transfer,postcode,property_type,new_property,duration,paon,saon,street,locality,town,district,county,ppd_category_type,record_status,_loaded_at
str,i64,date,str,str,str,str,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]"
"""{045A1898-4ABF-9A24-E063-4804A…",407400,2022-04-28,"""LU7 3FZ""","""S""","""Y""","""F""","""68""","""""","""RAMSAY DRIVE""","""""","""LEIGHTON BUZZARD""","""CENTRAL BEDFORDSHIRE""","""CENTRAL BEDFORDSHIRE""","""A""","""A""",2025-05-05 20:37:18.144898 UTC
"""{045A1898-4AC1-9A24-E063-4804A…",357000,2022-05-27,"""LU7 3QS""","""S""","""Y""","""F""","""44""","""""","""CHADWICK CRESCENT""","""""","""LEIGHTON BUZZARD""","""CENTRAL BEDFORDSHIRE""","""CENTRAL BEDFORDSHIRE""","""A""","""A""",2025-05-05 20:37:18.144898 UTC
"""{045A1898-4AC2-9A24-E063-4804A…",372950,2022-04-28,"""LU5 6TD""","""S""","""Y""","""F""","""11""","""""","""SKYE GARDENS""","""HOUGHTON REGIS""","""DUNSTABLE""","""CENTRAL BEDFORDSHIRE""","""CENTRAL BEDFORDSHIRE""","""A""","""A""",2025-05-05 20:37:18.144898 UTC
"""{045A1898-4AC4-9A24-E063-4804A…",570000,2022-05-31,"""SG18 9RF""","""D""","""Y""","""F""","""8""","""""","""HARVEST MOUSE PLACE""","""LANGFORD""","""BIGGLESWADE""","""CENTRAL BEDFORDSHIRE""","""CENTRAL BEDFORDSHIRE""","""A""","""A""",2025-05-05 20:37:18.144898 UTC
"""{045A1898-4AC5-9A24-E063-4804A…",570000,2022-05-27,"""LU7 3QS""","""D""","""Y""","""F""","""42""","""""","""CHADWICK CRESCENT""","""""","""LEIGHTON BUZZARD""","""CENTRAL BEDFORDSHIRE""","""CENTRAL BEDFORDSHIRE""","""A""","""A""",2025-05-05 20:37:18.144898 UTC
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""{E2D14905-55E3-4C2D-E053-6B04A…",432000,2022-05-20,"""RM9 5UL""","""T""","""N""","""F""","""59""","""""","""CONNOR ROAD""","""""","""DAGENHAM""","""BARKING AND DAGENHAM""","""GREATER LONDON""","""A""","""A""",2025-05-05 20:37:18.144898 UTC
"""{E2D14905-55E4-4C2D-E053-6B04A…",510000,2022-04-29,"""E4 8PH""","""T""","""N""","""F""","""27""","""""","""SINCLAIR ROAD""","""""","""LONDON""","""WALTHAM FOREST""","""GREATER LONDON""","""A""","""A""",2025-05-05 20:37:18.144898 UTC
"""{E2D14905-55E5-4C2D-E053-6B04A…",635000,2022-05-23,"""E17 4PN""","""T""","""N""","""F""","""133""","""""","""CHINGFORD ROAD""","""""","""LONDON""","""WALTHAM FOREST""","""GREATER LONDON""","""A""","""A""",2025-05-05 20:37:18.144898 UTC
"""{E2D14905-55E6-4C2D-E053-6B04A…",895000,2022-04-01,"""IG8 9EH""","""T""","""N""","""F""","""35""","""""","""ST ALBANS CRESCENT""","""""","""WOODFORD GREEN""","""REDBRIDGE""","""GREATER LONDON""","""A""","""A""",2025-05-05 20:37:18.144898 UTC


In [None]:
try:
    rows_with_loaded_at = house_prices_2022.to_arrow()
    house_prices.upsert(rows_with_loaded_at)
except ValueError as schema_error:
    # Show just the message so the notebook keeps running past the failure
    message = schema_error.args[0]
    print(message)

Pyiceberg is preventing us from doing something we shouldn't - Iceberg tables have an explicit schema, so we can't just add arbitrary columns to them. We need to update the schema to accommodate our new column.

```{note}
Pyiceberg gives us the ability to do this within a transaction to live up to Iceberg's ACID guarantees.
```
The new schema is added to the Iceberg metadata in the `schemas` array. Note that each of our snapshots references the schema that was current when the data was written. That way Iceberg can keep track of the schema evolution.

In [5]:
# Only open a schema update when the column is actually missing, so that
# re-running this cell does not commit a no-op schema change to the catalog
if "_loaded_at" not in house_prices.schema().column_names:
    # Leaving the `with` block commits the schema change
    with house_prices.update_schema() as schema:
        schema.add_column("_loaded_at", TimestamptzType(), doc="The date this row was loaded")
else:
    print("_loaded_at already in schema")

JSON(get_iceberg_metadata(fs, house_prices))

_loaded_at already in schema


<IPython.core.display.JSON object>

Now we have our `_loaded_at` column as part of the table schema, Iceberg is happy for us to add our data with the new column

In [None]:
# Cast the Arrow data to the table's current schema, then append it
table_arrow_schema = house_prices.schema().as_arrow()
rows_2022 = house_prices_2022.to_arrow().cast(table_arrow_schema)
house_prices.append(rows_2022)

What about the data we already added? How would we modify that data? Here we start running into some limitations of a foundational library like `pyiceberg` - we can do it, but wouldn't it be much easier to write an `UPDATE` in SQL and not have to worry about the details?

This is the power of Iceberg - we have the ability to switch query engines to suit our use case - in this case, I want to use Trino to update the data back in time.

Let's verify how many nulls we have - the data we just added should have `_loaded_at` filled in, but the rest should be null

Trino has a SQLAlchemy dialect built into the `trino` Python package, so it's straightforward to run some SQL like we're used to

In [3]:
# SQLAlchemy URL for Trino: user "trino" with an empty password
# NOTE(review): the trailing path segment is presumably the Trino catalog name - confirm
trino_url = "trino://trino:@trino:8080/lakekeeper"
engine = sa.create_engine(trino_url)

In [4]:
# Total number of rows alongside the number of rows whose `_loaded_at` is NULL
null_count_sql = """
SELECT
    COUNT(*) as total_rows,
    COUNT_IF(_loaded_at IS NULL) as null_count
FROM house_prices.raw
    """
with engine.connect() as connection:
    df = pl.read_database(query=null_count_sql, connection=connection)
df

DBAPIError: (trino.exceptions.TrinoQueryError) TrinoQueryError(type=INTERNAL_ERROR, name=GENERIC_INTERNAL_ERROR, message="io.trino.spi.TrinoException: Error processing metadata for table house_prices.raw", query_id=20250508_205051_00006_cpaje)
[SQL: 
SELECT
    COUNT(*) as total_rows,
    COUNT_IF(_loaded_at IS NULL) as null_count
FROM house_prices.raw
    ]
(Background on this error at: https://sqlalche.me/e/20/dbapi)

A simple UPDATE in SQL saves us many lines of Python code

In [None]:
# Backfill `_loaded_at` on the rows written before the column existed.
# The timestamp is passed as a bound parameter instead of being formatted into
# the SQL text: Trino documents `from_iso8601_timestamp` as returning
# timestamp(3) with time zone, which would drop the microseconds that the
# freshly appended rows carry. Binding the tz-aware datetime keeps the value intact.
with engine.connect() as conn:
    sql = "UPDATE house_prices.raw SET _loaded_at = :loaded_at WHERE _loaded_at IS NULL"
    result = conn.execute(sa.text(sql), {"loaded_at": timestamp})
    # Print the single row the UPDATE statement returns
    print(result.fetchone())

In [13]:
# Re-run the null check now that the backfill has been applied
with engine.connect() as connection:
    df = pl.read_database(query=null_count_sql, connection=connection)
df

total_rows,null_count
i64,i64
1546116,0


We should now have a new snapshot - let's have a peek

In [15]:
# The UPDATE was committed through Trino, not through this pyiceberg handle,
# so reload the table's metadata before inspecting it.
# NOTE(review): get_iceberg_metadata lives in utils.py; if it already refreshes
# the table, this call is redundant but harmless.
house_prices.refresh()
JSON(get_iceberg_metadata(fs, house_prices))

<IPython.core.display.JSON object>

## Deletes

We have a new operation, `overwrite`. Parquet files are immutable, so changing rows means physically writing out a new file and deleting the old one. That is expensive, so Iceberg uses delete files to avoid doing the work of actually deleting data up front.

In Iceberg V2, there are positional deletes and equality deletes. Both are represented by a new delete file, which is just a Parquet file that specifies the rows to mark as deleted - either by a filter like `transaction_id = '{045A1898-4ABF-9A24-E063-4804A8C048EA}'` (an equality delete) or by file path and row position (a positional delete), like in this example

```{code} parquet
:filename: some_random_id.parquet
file_path,pos
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,0
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,1
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,2
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,3
```

```{warning} Deprecation Warning
Positional deletes will be replaced by deletion vectors in Iceberg V3
```

In [None]:
# Show the manifest list for the house-prices table
manifest_list = get_iceberg_manifest_list(fs, house_prices)
manifest_list