In [1]:
from pyiceberg.catalog.rest import RestCatalog
import polars as pl
import datetime as dt
import sqlalchemy as sa

# The functions we defined in the previous notebook are defined in utils.py
from utils import get_iceberg_manifest, get_iceberg_manifest_list, get_iceberg_metadata, read_house_prices, get_iceberg_data_file
from s3fs import S3FileSystem
from IPython.display import JSON

# Updating Metadata

We've added data to our tables and inspected how Iceberg keeps track of the data in the metadata files

Usually when working with data in real life, we make decisions that we regret in six months' time.

Now that we've added some data, we've found out that we've made a mistake - we should have added a `_loaded_at` column to our data, so that we can differentiate downstream between the source timestamp and our loaded time

In [2]:
# Reconnect to the REST catalog and reload the table handle used throughout this notebook
catalog = RestCatalog(
    "lakekeeper",
    uri="http://lakekeeper:8181/catalog",
    warehouse="lakehouse",
)
house_prices_t = catalog.load_table("house_prices.raw")

# Filesystem client passed to the metadata helpers imported from utils.py.
# NOTE(review): key/secret are hardcoded here; presumably sandbox-only defaults - confirm before reuse.
fs = S3FileSystem(
    endpoint_url="http://minio:9000",
    key="minio",
    secret="minio1234",
)

In [3]:
# Capture a single timezone-aware load time so every row in this batch gets the same value
timestamp = dt.datetime.now(tz=dt.UTC)

# Read the CSV, then attach the load time as a constant `_loaded_at` column
raw_2022 = read_house_prices("data/pp-2022.csv")
loaded_at_column = pl.lit(timestamp).alias("_loaded_at")
house_prices_2022 = raw_2022.with_columns(loaded_at_column)
house_prices_2022

transaction_id,price,date_of_transfer,postcode,property_type,new_property,duration,paon,saon,street,locality,town,district,county,ppd_category_type,record_status,_loaded_at
str,i64,date,str,str,str,str,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]"
"""{06C9F487-D94B-9388-E063-4804A…",330000,2023-03-20,"""CF14 7BX""","""T""","""N""","""F""","""32""","""""","""HEOL PANT Y CELYN""","""""","""CARDIFF""","""CARDIFF""","""CARDIFF""","""A""","""A""",2025-05-10 20:50:32.402229 UTC
"""{06C9F487-D94C-9388-E063-4804A…",269950,2023-07-25,"""LL28 4SH""","""D""","""N""","""F""","""7""","""""","""MARSTON DRIVE""","""RHOS ON SEA""","""COLWYN BAY""","""CONWY""","""CONWY""","""A""","""A""",2025-05-10 20:50:32.402229 UTC
"""{06C9F487-D94D-9388-E063-4804A…",280000,2023-08-10,"""LL31 9BN""","""D""","""N""","""F""","""PLAS COLWYN""","""""","""LLYS HELYG""","""DEGANWY""","""CONWY""","""CONWY""","""CONWY""","""A""","""A""",2025-05-10 20:50:32.402229 UTC
"""{06C9F487-D94E-9388-E063-4804A…",699999,2023-08-24,"""SA62 6BA""","""D""","""N""","""F""","""MIDDLE LOCHVANE""","""""","""""","""PEN Y CWM""","""HAVERFORDWEST""","""PEMBROKESHIRE""","""PEMBROKESHIRE""","""A""","""A""",2025-05-10 20:50:32.402229 UTC
"""{06C9F487-D94F-9388-E063-4804A…",160000,2023-08-21,"""SY16 1QY""","""T""","""N""","""F""","""167""","""""","""LON DOLAFON""","""""","""NEWTOWN""","""POWYS""","""POWYS""","""A""","""A""",2025-05-10 20:50:32.402229 UTC
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""{01EB45F0-A054-40F3-E063-4704A…",325000,2023-07-10,"""TW15 2QU""","""F""","""N""","""L""","""11A""","""""","""CHAUCER ROAD""","""""","""ASHFORD""","""SPELTHORNE""","""SURREY""","""B""","""A""",2025-05-10 20:50:32.402229 UTC
"""{01EB45F0-A055-40F3-E063-4704A…",275000,2023-06-29,"""GU1 1EB""","""F""","""N""","""L""","""59""","""""","""FARADAY ROAD""","""""","""GUILDFORD""","""GUILDFORD""","""SURREY""","""B""","""A""",2025-05-10 20:50:32.402229 UTC
"""{01EB45F0-A05B-40F3-E063-4704A…",150000,2023-07-06,"""GU7 1FP""","""F""","""N""","""L""","""THORNBROOK HOUSE""","""FLAT 20""","""WEYSIDE PARK""","""""","""GODALMING""","""WAVERLEY""","""SURREY""","""B""","""A""",2025-05-10 20:50:32.402229 UTC
"""{01EB45F0-A05D-40F3-E063-4704A…",347632,2023-01-23,"""CR6 9HF""","""F""","""N""","""L""","""LANGTON HOUSE, 126""","""FLAT 27""","""WESTHALL ROAD""","""""","""WARLINGHAM""","""TANDRIDGE""","""SURREY""","""B""","""A""",2025-05-10 20:50:32.402229 UTC


In [4]:
# The table schema has no `_loaded_at` column yet, so this write is expected to be rejected.
try:
    arrow_2022 = house_prices_2022.to_arrow()
    house_prices_t.upsert(arrow_2022)
except ValueError as schema_error:
    # Show only the error message rather than letting the cell fail with a traceback
    print(schema_error.args[0])

PyArrow table contains more columns: _loaded_at. Update the schema first (hint, use union_by_name).


Pyiceberg is preventing us from doing something we shouldn't - Iceberg has a fixed schema, so we can't just add arbitrary columns to it. We need to update the schema to accommodate our new column.

```{note}
Pyiceberg gives us the ability to do this within a transaction to live up to Iceberg's ACID guarantees.
```
The new schema is added to the Iceberg metadata in the `schemas` array. Note that each of our snapshots references the schema that was current at the time the data was written. That way Iceberg can keep track of the schema evolution.

In [5]:
from pyiceberg.types import TimestamptzType

# Stage the schema change inside the `update_schema()` context manager.
with house_prices_t.update_schema() as schema:
    # Guard on the table's *current* schema so the cell can be re-run without
    # failing once the column exists. The check must use `house_prices_t`
    # (the loaded table handle); no `house_prices` name is defined in this notebook.
    if "_loaded_at" not in house_prices_t.schema().column_names:
        schema.add_column("_loaded_at", TimestamptzType(), doc="The date this row was loaded")
    else:
        print("_loaded_at already in schema")

# Display the table metadata so the new entry in the `schemas` array can be inspected
JSON(get_iceberg_metadata(fs, house_prices_t))

<IPython.core.display.JSON object>

Now we have our `_loaded_at` column as part of the table schema, Iceberg is happy for us to add our data with the new column

In [6]:
# Cast the Arrow data to the table's current Iceberg schema before appending.
# The schema must come from `house_prices_t` (the loaded table handle); no
# `house_prices` name is defined in this notebook.
arrow_2022 = house_prices_2022.to_arrow().cast(house_prices_t.schema().as_arrow())
house_prices_t.append(arrow_2022)



What about the data we already added? How would we modify that data? Here we start running into some limitations of a foundational library like `pyiceberg` - we can do it, but wouldn't it be much easier to write an `UPDATE` in SQL and not have to worry about the details?

This is the power of Iceberg - we have the ability to switch query engines to suit our use case - in this case, I want to use Trino to update the data back in time.

Let's verify how many nulls we have - the data we just added should have `_loaded_at` filled in, but the rest should be null

Trino has a SQLAlchemy dialect built into the `trino` Python package, so it's straightforward to run some SQL like we're used to

In [4]:
# Connection URL for the Trino SQLAlchemy dialect; the engine is reused by the SQL cells below
trino_url = "trino://trino:@trino:8080/lakekeeper"
engine = sa.create_engine(trino_url)

In [5]:
# Query reused later in the notebook: total row count and how many rows have no `_loaded_at`
null_count_sql = """
SELECT
    COUNT(*) as total_rows,
    COUNT_IF(_loaded_at IS NULL) as null_count
FROM house_prices.raw
    """

# Run the query through Trino and load the single-row result into a polars DataFrame
with engine.connect() as trino_conn:
    df = pl.read_database(query=null_count_sql, connection=trino_conn)
df

total_rows,null_count
i64,i64
2387888,0


A simple UPDATE in SQL saves us many lines of Python code

In [6]:
# Fill `_loaded_at` for rows where it is still null, using the `timestamp` captured earlier
backfill_sql = f"UPDATE house_prices.raw SET _loaded_at = from_iso8601_timestamp('{timestamp.isoformat()}') WHERE _loaded_at is null"

with engine.connect() as trino_conn:
    update_result = trino_conn.execute(sa.text(backfill_sql))
    # Print the first row of the statement's result
    print(update_result.fetchone())

(0,)


In [7]:
# Re-run the null-count query to see the state of `_loaded_at` after the UPDATE
with engine.connect() as trino_conn:
    df = pl.read_database(query=null_count_sql, connection=trino_conn)
df

total_rows,null_count
i64,i64
2387888,0


We should now have a new snapshot - let's have a peek

In [8]:
# The UPDATE was committed by Trino, not through this pyiceberg handle, so reload
# the table metadata first. NOTE(review): harmless if get_iceberg_metadata already
# resolves the latest metadata itself - confirm against utils.py.
house_prices_t.refresh()
JSON(get_iceberg_metadata(fs, house_prices_t))

<IPython.core.display.JSON object>

## Deletes

We have a new operation `overwrite` - Parquet is immutable, so to change a file we have to physically write out a new file and delete the old one. That is expensive, so Iceberg uses delete files to avoid doing the work of physically deleting data up front.

In Iceberg V2, there are positional-deletes and equality-deletes. These are both represented by a new delete file, which is just a parquet file which specifies rows to mark as deleted, either by a filter like `transaction_id = '{045A1898-4ABF-9A24-E063-4804A8C048EA}'` or by position, like in this example

```{code} parquet
:filename: some_random_id.parquet
file_path,pos
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,0
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,1
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,2
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,3
```

```{warning} Deprecation Warning
Positional deletes will be replaced by deletion vectors in Iceberg V3
```

In [13]:
# The delete files were written by Trino's UPDATE, outside this pyiceberg handle.
# Refresh so `inspect` sees the latest snapshot; otherwise, on a fresh top-to-bottom
# run, the handle still points at the pre-UPDATE metadata and no delete files are listed.
house_prices_t.refresh()

# Take the path of the first delete file and read it straight from object storage
delete_files = pl.from_arrow(house_prices_t.inspect.delete_files())
delete_file = delete_files.select(pl.col('file_path')).item(0, 0)
pl.read_parquet(fs.read_bytes(delete_file))



file_path,pos
str,i64
"""s3://warehouse/house_prices/ra…",0
"""s3://warehouse/house_prices/ra…",1
"""s3://warehouse/house_prices/ra…",2
"""s3://warehouse/house_prices/ra…",3
"""s3://warehouse/house_prices/ra…",4
…,…
"""s3://warehouse/house_prices/ra…",704339
"""s3://warehouse/house_prices/ra…",704340
"""s3://warehouse/house_prices/ra…",704341
"""s3://warehouse/house_prices/ra…",704342
