# Time Travelling

Another advantage of Iceberg's metadata structure is that it gives us Time Travel for free. Since all we're doing is storing snapshots and moving pointers, time travelling is essentially just asking to see the data at a previous pointer. 

In [18]:
import sqlalchemy as sa
import polars as pl
from pyiceberg.catalog.rest import RestCatalog

In [19]:
# SQL cells go through Trino; the URL path selects the "lakekeeper" catalog.
engine = sa.create_engine(
    "trino://trino:@trino:8080/lakekeeper",
)

# PyIceberg talks to the Lakekeeper REST catalog directly, without Trino in between.
catalog = RestCatalog(
    "lakekeeper",
    uri="http://lakekeeper:8181/catalog",
    warehouse="lakehouse",
)

# Table handle shared by the PyIceberg and Polars cells further down.
house_prices_t = catalog.load_table("house_prices.raw")

def query(sql) -> pl.DataFrame:
    """Run ``sql`` on the module-level ``engine`` and return the rows as a Polars DataFrame.

    A fresh connection is opened for each call and closed again when the
    ``with`` block exits, whether or not the read succeeds.
    """
    with engine.connect() as connection:
        frame = pl.read_database(sql, connection)
    return frame

In [14]:
# The "$history" metadata table lists every snapshot that became current, oldest first.
query('SELECT * FROM house_prices."raw$history" order by made_current_at')

made_current_at,snapshot_id,parent_id,is_current_ancestor
"datetime[μs, UTC]",i64,i64,bool
2025-04-24 20:46:05.919 UTC,1084111561377484783,,True
2025-04-24 20:47:19.741 UTC,1499055377590105214,1.0841115613774848e+18,True
2025-04-29 19:20:48.168 UTC,7565802258999413595,1.499055377590105e+18,True
2025-04-29 19:20:49.461 UTC,7113522245516818984,7.565802258999413e+18,True
2025-04-29 20:07:07.276 UTC,8628247025389525262,7.113522245516818e+18,True
2025-04-29 20:23:31.434 UTC,7551778031070669536,8.628247025389525e+18,True


In [15]:
# Row count at the table's current snapshot (no version clause), for comparison below.
query('SELECT count(transaction_id) as num_rows FROM house_prices.raw')

num_rows
i64
4638348


In [17]:
# FOR VERSION AS OF <snapshot_id> reads the table as it was at that snapshot;
# the id used here is the first entry in the history listing.
query('SELECT count(transaction_id) as num_rows from house_prices.raw for version as of 1084111561377484783')

num_rows
i64
704344


In [24]:
# Same time-travel read through PyIceberg: scan only transaction_id at the pinned snapshot.
(
    house_prices_t
    .scan(
        snapshot_id=1084111561377484783,
        selected_fields=['transaction_id'],
    )
    .to_arrow()
    .num_rows
)

704344

In [38]:
# And once more via Polars' lazy Iceberg scan, pinned to the same snapshot id.
(
    pl.scan_iceberg(house_prices_t, snapshot_id=1084111561377484783)
    .select(pl.count("transaction_id"))
    .collect()
)

transaction_id
u32
704344


In [27]:
# FOR TIMESTAMP AS OF reads whichever snapshot was current at the given point in time;
# a DATE literal is used here instead of a full timestamp.
query("SELECT count(transaction_id) as num_rows from house_prices.raw for timestamp as of date '2025-04-25'")

num_rows
i64
1546116


Remembering snapshot IDs, or pinpointing the exact time we're interested in, is tricky for our human brains. Iceberg therefore supports tagging, which lets us attach a human-readable reference to a given snapshot.

In [28]:
# Attach the tag "initial commit" to the first snapshot and commit the change to the catalog.
(
    house_prices_t.manage_snapshots()
    .create_tag(1084111561377484783, "initial commit")
    .commit()
)

In [32]:
# List the table's named references (branches and tags) and the snapshot each one points to.
house_prices_t.inspect.refs()

pyarrow.Table
name: string not null
type: dictionary<values=string, indices=int32, ordered=0> not null
snapshot_id: int64 not null
max_reference_age_in_ms: int64
min_snapshots_to_keep: int32
max_snapshot_age_in_ms: int64
----
name: [["initial commit","main"]]
type: [  -- dictionary:
["TAG","BRANCH"]  -- indices:
[0,1]]
snapshot_id: [[1084111561377484783,7551778031070669536]]
max_reference_age_in_ms: [[null,null]]
min_snapshots_to_keep: [[null,null]]
max_snapshot_age_in_ms: [[null,null]]

In [34]:
# The tag name can be used as the version in Trino, in place of the numeric snapshot id.
query("SELECT count(transaction_id) as num_rows from house_prices.raw for version as of 'initial commit'")

num_rows
i64
704344


In [39]:
# scan_iceberg is given a snapshot id, so the tag is first resolved to its snapshot via PyIceberg.
(
    pl.scan_iceberg(
        house_prices_t,
        snapshot_id=house_prices_t.snapshot_by_name('initial commit').snapshot_id,
    )
    .select(pl.count('transaction_id'))
    .collect()
)

transaction_id
u32
704344


In [46]:
# Make the first snapshot the table's current state again using Trino's rollback_to_snapshot.
# NOTE: this mutates the table, so every cell after this one sees the rolled-back data.
with engine.connect() as conn:
    # fetchone() printed None in the recorded run, i.e. the statement returned no row.
    print(conn.execute(sa.text("ALTER TABLE house_prices.raw EXECUTE rollback_to_snapshot(1084111561377484783)")).fetchone())

None


In [47]:
# Re-check the history after the rollback: later snapshots are still listed,
# but is_current_ancestor shows they are no longer part of the current lineage.
query('SELECT * FROM house_prices."raw$history" order by made_current_at')

made_current_at,snapshot_id,parent_id,is_current_ancestor
"datetime[μs, UTC]",i64,i64,bool
2025-04-24 20:46:05.919 UTC,1084111561377484783,,True
2025-04-24 20:47:19.741 UTC,1499055377590105214,1.0841115613774848e+18,False
2025-04-29 19:20:48.168 UTC,7565802258999413595,1.499055377590105e+18,False
2025-04-29 19:20:49.461 UTC,7113522245516818984,7.565802258999413e+18,False
2025-04-29 20:07:07.276 UTC,8628247025389525262,7.113522245516818e+18,False
2025-04-29 20:23:31.434 UTC,7551778031070669536,8.628247025389525e+18,False


In [52]:
# The rollback was issued through Trino, so reload the table metadata before asking
# this PyIceberg handle which snapshot is current.
house_prices_t.refresh().current_snapshot().snapshot_id

1084111561377484783

In [65]:
# PyIceberg's Table had no delete_orphan_files method in the recorded run.
# Catch the AttributeError so the failure is still shown, but the notebook
# keeps going under "Restart & Run All" instead of stopping at this cell.
try:
    house_prices_t.delete_orphan_files()
except AttributeError as err:
    print(f"{type(err).__name__}: {err}")

AttributeError: 'Table' object has no attribute 'delete_orphan_files'

In [66]:
# Run Trino's optimize table procedure on house_prices.raw.
with engine.connect() as conn:
    # As with the rollback cell, fetchone() printed None in the recorded run.
    print(conn.execute(sa.text("ALTER table house_prices.raw execute optimize")).fetchone())

None


In [67]:
# NOTE(review): in the recorded run only the first snapshot is listed at this point, although
# no cell shown in this notebook expires snapshots — presumably an expire_snapshots (or similar)
# ran in a cell that was not kept. Confirm and add that step so a fresh run reproduces this.
query('SELECT * FROM house_prices."raw$history" order by made_current_at')

made_current_at,snapshot_id,parent_id,is_current_ancestor
"datetime[μs, UTC]",i64,i64,bool
2025-04-24 20:46:05.919 UTC,1084111561377484783,,True


In [73]:
# Ask Trino to remove files that are no longer referenced by the table and are older
# than the retention threshold. In the recorded run this failed with
# ICEBERG_FILESYSTEM_ERROR, so the error is caught and printed rather than left as an
# unhandled exception that aborts the notebook.
with engine.connect() as conn:
    try:
        print(conn.execute(sa.text("ALTER table house_prices.raw execute remove_orphan_files(retention_threshold => '7d')")).fetchone())
    except sa.exc.OperationalError as err:
        # Same text the raw traceback ended with: "<ExceptionType>: <message>".
        print(f"{type(err).__name__}: {err}")

OperationalError: (trino.exceptions.TrinoExternalError) TrinoExternalError(type=EXTERNAL, name=ICEBERG_FILESYSTEM_ERROR, message="Failed accessing data for table: house_prices.raw", query_id=20250510_093938_00054_kmmf4)
[SQL: ALTER table house_prices.raw execute remove_orphan_files(retention_threshold => '7d')]
(Background on this error at: https://sqlalche.me/e/20/e3q8)