In [23]:
import sqlalchemy as sa
from utils import query, engine, catalog

In [19]:
import sqlalchemy as sa

dim_address_sql = """
CREATE OR REPLACE TABLE house_prices.dim_address 
    WITH ( partitioning = ARRAY['bucket(address_id, 10)'] )
    AS (
    SELECT DISTINCT to_hex(md5(cast(
        coalesce(paon, '') ||
        coalesce(saon, '') ||
        coalesce(street, '') ||
        coalesce(locality, '') ||
        coalesce(town, '') ||
        coalesce(district, '') ||
        coalesce(county, '') ||
        coalesce(postcode, '')
    as varbinary))) AS address_id,
      paon,
      saon,
      street,
      locality,
      town,
      district,
      county,
      postcode
FROM house_prices.raw)
"""

In [20]:
fct_prices = """
CREATE OR REPLACE TABLE house_prices.fct_house_prices
    WITH ( partitioning = ARRAY['month(date_of_transfer)'] ) AS (
        WITH ranked_records AS (
            SELECT *,
            ROW_NUMBER () OVER (PARTITION BY transaction_id ORDER BY month(date_of_transfer) DESC) AS rn
            FROM house_prices.raw
    ),
    latest_records AS (
        SELECT *
        FROM ranked_records
        WHERE rn = 1
    ),
    with_address_id AS (
        SELECT to_hex(md5(cast (
                coalesce(paon, '') ||
                coalesce(saon, '') ||
                coalesce(street, '') ||
                coalesce(locality, '') ||
                coalesce(town, '') ||
                coalesce(district, '') ||
                coalesce(county, '') ||
                coalesce(postcode, '')
            as varbinary))) AS address_id,
                transaction_id,
                price,
                date_of_transfer,
                property_type,
                new_property,
                duration,
                ppd_category_type,
                record_status
        FROM latest_records
        WHERE record_status != 'D'
    )
    SELECT *
    FROM with_address_id
    )
"""

In [21]:
with engine.begin() as conn:
    num_rows_dim_address = conn.execute(sa.text(dim_address_sql)).fetchone()[0]
    num_rows_fct_prices = conn.execute(sa.text(fct_prices)).fetchone()[0]

print(f"Created dim_address with {num_rows_dim_address:,} rows")
print(f"Created fct_prices with {num_rows_fct_prices:,} rows")

Created dim_address with 7,498,409 rows
Created fct_prices with 8,890,034 rows


In [26]:
fct_house_prices_t = catalog.load_table("house_prices.fct_house_prices")

In [22]:
import polars as pl

In [39]:
sold_address_ids = pl.scan_iceberg(fct_house_prices_t).group_by("address_id").len().filter(pl.col("len") > 1)
sold_address_ids.collect()



address_id,len
str,u32
"""3326ABE173322B3C8C14D6C9574230…",2
"""F8A4B2DE7DA9459FD2CF9D11C9D136…",3
"""BBF360ABEE6F8DD74FD208E2968E20…",2
"""947EC8BE53918D02F7BF9DEDE2FCD1…",2
"""A3D68E0390F24C698DB178451B629E…",2
…,…
"""6F8BF2E3D9662B2EF6519536A5A20D…",2
"""260FA6B447CC8DCCA516513B812348…",2
"""9E0FAE2D7D138EEE20E1EC5886FA67…",2
"""98B9FD1939C797ED9BD1BDBDA5FDD0…",2


In [43]:
(
    pl.scan_iceberg(fct_house_prices_t).join(sold_address_ids, on="address_id", how="inner")
    .group_by(pl.col("address_id"), pl.col("date_of_transfer"))
    .select(
        pl.col("address_id").first(),
        pl.col("date_of_transfer").first(),
        pl.col("price").first().alias("original_price"),
        pl.col("price").last().alias("sold_price"))
        .collect()
)

address_id,date_of_transfer,original_price,sold_price
str,date,i32,i32
"""2F32FF4436392AE53A9F51CEFBA9DA…",2015-01-01,149000,605000
