In [1]:
from pyiceberg.catalog.rest import RestCatalog
import polars as pl
import datetime as dt
import sqlalchemy as sa

# The functions we defined in the previous notebook are defined in utils.py
from utils import get_iceberg_metadata, read_house_prices
from s3fs import S3FileSystem
from IPython.display import JSON

# Updating Metadata

We've added data to our tables and inspected how Iceberg keeps track of the data in the metadata files

Usually when working with data in real life, we make decisions that we regret in six months time. 

Now that we've added some data, we've found out that we've made a mistake - we should have added a `_loaded_at` column to our data, so that we can differentiate downstream between the source timestamp and our loaded time

In [2]:
# Get a reference to our catalog and table again
catalog = RestCatalog(
    "lakekeeper", uri="http://lakekeeper:8181/catalog", warehouse="lakehouse"
)
house_prices_t = catalog.load_table("housing.staging_prices")
fs = S3FileSystem(endpoint_url="http://minio:9000", key="minio", secret="minio1234")

In [3]:
timestamp = dt.datetime.now(tz=dt.UTC)
house_prices_2022 = read_house_prices("data/house_prices/pp-2022.csv").with_columns(
    pl.lit(timestamp).alias("_loaded_at")
)
house_prices_2022

transaction_id,price,date_of_transfer,postcode,property_type,new_property,duration,paon,saon,street,locality,town,district,county,ppd_category_type,record_status,_loaded_at
str,i64,date,str,str,str,str,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]"
"""{DBA933F9-D5BC-669D-E053-6B04A…",205000,2022-02-18,"""DL9 4RS""","""D""","""N""","""F""","""26""","""""","""MAPLE AVENUE""","""COLBURN""","""CATTERICK GARRISON""","""RICHMONDSHIRE""","""NORTH YORKSHIRE""","""A""","""A""",2025-06-06 08:35:08.585203 UTC
"""{DBA933F9-D5BE-669D-E053-6B04A…",220000,2022-02-14,"""YO12 7ND""","""S""","""N""","""F""","""3""","""""","""PEASHOLM GARDENS""","""""","""SCARBOROUGH""","""SCARBOROUGH""","""NORTH YORKSHIRE""","""A""","""A""",2025-06-06 08:35:08.585203 UTC
"""{DBA933F9-D5C0-669D-E053-6B04A…",775000,2022-02-22,"""HG5 0TT""","""D""","""N""","""F""","""WATERS NOOK""","""""","""YORK ROAD""","""""","""KNARESBOROUGH""","""HARROGATE""","""NORTH YORKSHIRE""","""A""","""A""",2025-06-06 08:35:08.585203 UTC
"""{DBA933F9-D5C6-669D-E053-6B04A…",450000,2022-03-04,"""YO31 1BU""","""D""","""N""","""F""","""116""","""""","""STOCKTON LANE""","""""","""YORK""","""YORK""","""YORK""","""A""","""A""",2025-06-06 08:35:08.585203 UTC
"""{DBA933F9-D5CC-669D-E053-6B04A…",175000,2022-02-25,"""LA2 7EB""","""T""","""N""","""F""","""BORRANS COTTAGES""","""3""","""BURTON ROAD""","""LOWER BENTHAM""","""LANCASTER""","""CRAVEN""","""NORTH YORKSHIRE""","""A""","""A""",2025-06-06 08:35:08.585203 UTC
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""{EA3278A9-E061-2676-E053-6B04A…",205000,2022-06-27,"""SK14 4AF""","""S""","""N""","""L""","""71""","""""","""DANBY CLOSE""","""""","""HYDE""","""TAMESIDE""","""GREATER MANCHESTER""","""A""","""A""",2025-06-06 08:35:08.585203 UTC
"""{EA3278A9-E062-2676-E053-6B04A…",195000,2022-09-02,"""BL2 5QZ""","""T""","""N""","""F""","""14""","""""","""BANKFIELD CLOSE""","""AINSWORTH""","""BOLTON""","""BURY""","""GREATER MANCHESTER""","""A""","""A""",2025-06-06 08:35:08.585203 UTC
"""{EA3278A9-E064-2676-E053-6B04A…",186000,2022-07-22,"""WN6 0SX""","""S""","""N""","""L""","""10""","""""","""BARBROOK CLOSE""","""STANDISH""","""WIGAN""","""WIGAN""","""GREATER MANCHESTER""","""A""","""A""",2025-06-06 08:35:08.585203 UTC
"""{EA3278A9-E065-2676-E053-6B04A…",225000,2022-08-19,"""M45 7FL""","""S""","""N""","""F""","""29""","""""","""BURY NEW ROAD""","""WHITEFIELD""","""MANCHESTER""","""BURY""","""GREATER MANCHESTER""","""A""","""A""",2025-06-06 08:35:08.585203 UTC


In [4]:
try:
    house_prices_t.upsert(house_prices_2022.to_arrow())
except ValueError as e:
    # Print out the error message instead of crashing
    print(e.args[0])

PyArrow table contains more columns: _loaded_at. Update the schema first (hint, use union_by_name).


Pyiceberg is preventing us from doing something we shouldn't - Iceberg has a fixed schema, so we can't just add arbitrary columns to it. We need to update the schema to accomodate our new column.

```{note}
Pyiceberg gives us the ability to do this within a transaction to live up to Iceberg's ACID guarantees.
```
The new schema is added to the Iceberg metadata in the `schemas` array. Note that each of our snapshots reference the schema at the time the data was written. That way Iceberg can keep track of the schema evolution.

In [5]:
from pyiceberg.types import TimestamptzType

with house_prices_t.update_schema() as schema:
    schema.add_column(
        "_loaded_at", TimestamptzType(), doc="The date this row was loaded"
    )

Looking at our metadata again - what's changed?

In [6]:
JSON(get_iceberg_metadata(fs, house_prices_t))

<IPython.core.display.JSON object>

Now we have our `_loaded_at` column as part of the table schema, Iceberg is happy for us to add our data with the new column

In [7]:
house_prices_t.append(
    house_prices_2022.to_arrow().cast(house_prices_t.schema().as_arrow())
)

JSON(get_iceberg_metadata(fs, house_prices_t))

<IPython.core.display.JSON object>

What about the data we already added? How would we modify that data? Here we start running into some limitations of a foundational library like `pyiceberg` - we can do it (bonus homework - how would you do it in pyiceberg natively?), but wouldn't it be much easier to write an `UPDATE` in SQL and not have to worry about the details?

This is the power of Iceberg - we have the ability to switch query engines to suit our usecase - in this case, I want to use Trino to update the data back in time.

Let's verify how many nulls we have - the data we just added should have `_loaded_at` filled in, but the rest should be null.

In [8]:
engine = sa.create_engine("trino://trino:@trino:8080/lakekeeper")

In [9]:
null_count_sql = """
SELECT
    COUNT(*) as total_rows,
    COUNT_IF(_loaded_at IS NULL) as null_count
FROM housing.staging_prices
    """
pl.read_database(null_count_sql, engine)

total_rows,null_count
i64,i64
2684736,1615076


A simple UPDATE in SQL saves us many lines of Python code

In [10]:
with engine.connect() as conn:
    sql = f"UPDATE housing.staging_prices SET _loaded_at = from_iso8601_timestamp('{timestamp.isoformat()}') WHERE _loaded_at is null"
    result = conn.execute(sa.text(sql))
    print(f"Updated {result.scalar_one():,} rows")

Updated 1,615,076 rows


In [11]:
pl.read_database(null_count_sql, engine)

total_rows,null_count
i64,i64
2684736,0


We should now have a new snapshot - let's have a peek.

```{warning} Keep metadata in sync
Pyiceberg doesn't yet know about our Trino update - we need to refresh the metadata to get the latest metadata from the catalog
```

In [12]:
house_prices_t.refresh();

In [13]:
JSON(get_iceberg_metadata(fs, house_prices_t))

<IPython.core.display.JSON object>

## Deletes

We have a new operation `overwrite` - Parquet is immutable, so we have to physically write out a new file and delete the old one. That is expensive, so Iceberg uses delete files to avoid having to up-front do the work of actually deleting data.

```{note} Aside
Technically, Parquet **row groups** are immutable, but it's much faster to treat the Parquet file as immutable, rather than rewriting row groups
```

The Iceberg V2 spec defines positional-deletes and equality-deletes. These are both represented by a `delete` file, which is just a parquet file which specifies rows to mark as deleted, either by a filter like `transaction_id = '{045A1898-4ABF-9A24-E063-4804A8C048EA}'` or by position, like this:
```{code} parquet
:filename: some_random_id.parquet
file_path,pos
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,0
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,1
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,2
s3://warehouse/house_prices/raw/data/00000-0-0ab09c23-d71c-4686-968a-f5ebd7b2e32a.parquet,3
```

```{warning} Deprecation Warning
Positional deletes will be replaced by deletion vectors in Iceberg V3
```

Let's use pyiceberg to find a delete file and open it up

In [14]:
delete_file = (
    pl.from_arrow(house_prices_t.inspect.delete_files())
    .select(pl.col("file_path"))
    .item(0, 0)
)
with pl.Config(fmt_str_lengths=100):
    display(pl.read_parquet(fs.read_bytes(delete_file)))

file_path,pos
str,i64
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",0
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",1
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",2
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",3
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",4
…,…
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",848430
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",848431
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",848432
"""s3://warehouse/housing/staging/data/00000-0-7f71c78a-7820-42b0-89f7-595698c57d10.parquet""",848433


# Renaming and moving columns

Iceberg implements all column references use the `field_id`. This makes it trivial to rename a column, since we just have to update the metadata of the schema. Imagine our style guide is updated and now all metadata fields such as our `_loaded_at` should now be prefixed with `dwh` to make it clear who did the load. Now that we have some hands-on user feedback, we also want to move `transfer_date` to be the first column since we're often visually exploring date ranges.

We can also show off transactions - everything we've done until now has actually been done inside a transaction. We can explicitly open a transaction to perform multiple operations inside a single transaction. This includes deleting and adding files, but for now we'll just make our changes



In [15]:
with house_prices_t.transaction() as transaction:
    with transaction.update_schema() as update:
        update.rename_column("_loaded_at", "_dwh_loaded_at")
        update.move_first("date_of_transfer")

In [16]:
pl.scan_iceberg(house_prices_t).head().collect()

date_of_transfer,transaction_id,price,postcode,property_type,new_property,duration,paon,saon,street,locality,town,district,county,ppd_category_type,record_status,_dwh_loaded_at
date,str,i32,str,str,str,str,str,str,str,str,str,str,str,str,str,"datetime[μs, UTC]"
2023-09-22,"""{0E082196-CE18-5C09-E063-4704A…",221000,"""PL6 6JX""","""T""","""N""","""F""","""3""","""""","""PILLAR WALK""","""""","""PLYMOUTH""","""CITY OF PLYMOUTH""","""CITY OF PLYMOUTH""","""A""","""A""",2025-06-06 08:35:08.585 UTC
2023-08-25,"""{0E082196-CE19-5C09-E063-4704A…",228000,"""PL7 1SJ""","""S""","""N""","""F""","""102""","""""","""MERAFIELD ROAD""","""""","""PLYMOUTH""","""CITY OF PLYMOUTH""","""CITY OF PLYMOUTH""","""A""","""A""",2025-06-06 08:35:08.585 UTC
2023-10-26,"""{0E082196-CE1A-5C09-E063-4704A…",480000,"""TQ6 0AS""","""F""","""N""","""L""","""1A""","""""","""RIVER VIEW""","""KINGSWEAR""","""DARTMOUTH""","""SOUTH HAMS""","""DEVON""","""A""","""A""",2025-06-06 08:35:08.585 UTC
2023-07-14,"""{0E082196-CE1B-5C09-E063-4704A…",625000,"""TQ1 2HB""","""D""","""N""","""F""","""14""","""""","""OXLEA CLOSE""","""""","""TORQUAY""","""TORBAY""","""TORBAY""","""A""","""A""",2025-06-06 08:35:08.585 UTC
2023-08-04,"""{0E082196-CE1C-5C09-E063-4704A…",174000,"""PL2 1LL""","""T""","""N""","""F""","""58""","""""","""ST AUBYN AVENUE""","""""","""PLYMOUTH""","""CITY OF PLYMOUTH""","""CITY OF PLYMOUTH""","""A""","""A""",2025-06-06 08:35:08.585 UTC


In [17]:
JSON(get_iceberg_metadata(fs, house_prices_t))

<IPython.core.display.JSON object>