In [1]:
import sqlalchemy as sa
from utils import query, engine, catalog

In [2]:
import sqlalchemy as sa

# CTAS statement that (re)builds house_prices.dim_address from house_prices.raw.
# It selects the distinct combinations of the eight address columns and keys each
# row with address_id = hex-encoded MD5 of the columns concatenated in a fixed
# order, with NULLs replaced by ''. The table is partitioned with
# bucket(address_id, 10).
# NOTE(review): the columns are concatenated without a separator, so different
# addresses can produce the same input string (e.g. paon='1', saon='23' versus
# paon='12', saon='3') and therefore the same address_id. NULL and '' also hash
# identically while DISTINCT still keeps them as separate rows, so address_id is
# not guaranteed to be unique in this table.
# NOTE(review): fct_house_prices recomputes this exact expression; any change to
# the hash has to be made in both statements at the same time.
# NOTE(review): the dialect looks like Trino with an Iceberg catalog; confirm.
dim_address_sql = """
CREATE OR REPLACE TABLE house_prices.dim_address 
    WITH ( partitioning = ARRAY['bucket(address_id, 10)'] )
    AS (
    SELECT DISTINCT to_hex(md5(cast(
        coalesce(paon, '') ||
        coalesce(saon, '') ||
        coalesce(street, '') ||
        coalesce(locality, '') ||
        coalesce(town, '') ||
        coalesce(district, '') ||
        coalesce(county, '') ||
        coalesce(postcode, '')
    as varbinary))) AS address_id,
      paon,
      saon,
      street,
      locality,
      town,
      district,
      county,
      postcode
FROM house_prices.raw)
"""

In [3]:
# CTAS statement that (re)builds house_prices.fct_house_prices from
# house_prices.raw, partitioned by month(date_of_transfer).
#
# Steps:
#   1. ranked_records  - number the rows of each transaction_id, newest
#                        date_of_transfer first.
#   2. latest_records  - keep only the top-ranked row per transaction_id.
#   3. with_address_id - drop rows whose record_status is 'D' and attach
#                        address_id, computed with the same hash expression
#                        as dim_address so the two tables join on it.
#
# The ranking orders by the full date_of_transfer. It previously ordered by
# month(date_of_transfer), which as a scalar function is only the month number,
# so a December row from an earlier year outranked a January row from a later
# year.
# NOTE(review): rows of one transaction_id that share the same date_of_transfer
# are still ranked arbitrarily; add a tiebreaker if the raw data has a suitable
# column.
# NOTE(review): the address hash concatenates columns without a separator (see
# dim_address); it is left unchanged here so both tables keep matching keys.
fct_prices = """
CREATE OR REPLACE TABLE house_prices.fct_house_prices
    WITH ( partitioning = ARRAY['month(date_of_transfer)'] ) AS (
        WITH ranked_records AS (
            SELECT *,
            ROW_NUMBER () OVER (PARTITION BY transaction_id ORDER BY date_of_transfer DESC) AS rn
            FROM house_prices.raw
    ),
    latest_records AS (
        SELECT *
        FROM ranked_records
        WHERE rn = 1
    ),
    with_address_id AS (
        SELECT to_hex(md5(cast (
                coalesce(paon, '') ||
                coalesce(saon, '') ||
                coalesce(street, '') ||
                coalesce(locality, '') ||
                coalesce(town, '') ||
                coalesce(district, '') ||
                coalesce(county, '') ||
                coalesce(postcode, '')
            as varbinary))) AS address_id,
                transaction_id,
                price,
                date_of_transfer,
                property_type,
                new_property,
                duration,
                ppd_category_type,
                record_status
        FROM latest_records
        WHERE record_status != 'D'
    )
    SELECT *
    FROM with_address_id
    )
"""

In [4]:
# Run both CTAS statements on one connection obtained from engine.begin().
# The first column of the first result row of each statement is treated as
# the number of rows written to the new table.
with engine.begin() as conn:
    dim_address_result = conn.execute(sa.text(dim_address_sql))
    num_rows_dim_address = dim_address_result.fetchone()[0]

    fct_prices_result = conn.execute(sa.text(fct_prices))
    num_rows_fct_prices = fct_prices_result.fetchone()[0]

print(f"Created dim_address with {num_rows_dim_address:,} rows")
print(f"Created fct_prices with {num_rows_fct_prices:,} rows")

Created dim_address with 1,510,419 rows
Created fct_prices with 1,546,116 rows


In [55]:
# Load a handle to the fact table through the `catalog` object imported from
# utils; the handle is what pl.scan_iceberg() receives in the cells below.
# NOTE(review): presumably a pyiceberg catalog/table; confirm in utils.
fct_house_prices_t = catalog.load_table("house_prices.fct_house_prices")

In [56]:
import polars as pl

# The unfinished `pl.config.` statement that used to follow was removed: a
# trailing dot is a SyntaxError, so the whole cell, including the import above,
# failed to run. Display options, if wanted, are set through the `pl.Config`
# class (for example `pl.Config.set_tbl_rows(...)`).

In [57]:
# Lazy query: count the fact-table rows per address_id and keep only the
# addresses that appear more than once. Nothing is executed until collected.
sold_address_ids = (
    pl.scan_iceberg(fct_house_prices_t)
    .group_by("address_id")
    .len()
    .filter(pl.col("len") > 1)
)

In [60]:
# Show the address_ids that occur more than once in the fact table.
# This used to read `_`, IPython's "last output" variable, which only held a
# frame because another cell had been run just before; on a fresh
# Restart & Run All `_` is not a frame and the cell fails. Referencing
# sold_address_ids makes the dependency explicit.
# Row order of the collected frame is not guaranteed.
sold_address_ids.select("address_id").collect()

address_id
str
"""D11C83301256CA709BCFBC7FDF5EB9…"
"""F736B69B44415DD5C0126D56E55A7F…"
"""770AAB0E234EE249362FBACD5AF247…"
"""F7782781F9B93E7C460DC743F1B69A…"
"""6718E17E744B01FB3B4F1B6F1517E2…"
…
"""310488721DDF74933F97EDD2323E6D…"
"""1351937E47CACB1ECEF622C6B0C587…"
"""34D34FE3EADFACBD0604AB58E8ABF5…"
"""0AD6188262C701C703A8CE7A0B660B…"


In [58]:
# For every address_id that occurs more than once, compare the price of its
# first row with the price of its last row after sorting by date_of_transfer.
# The aggregations rely on the frame being sorted before it is grouped.
first_sale_price = pl.col("price").first()
last_sale_price = pl.col("price").last()

repeat_sales = pl.scan_iceberg(fct_house_prices_t).join(
    sold_address_ids, on="address_id", how="inner"
)

(
    repeat_sales
    .sort("date_of_transfer")
    .group_by("address_id")
    .agg(
        first_sale_price.alias("first_price"),
        last_sale_price.alias("last_price"),
        (last_sale_price - first_sale_price).alias("profit"),
    )
    .collect()
)

address_id,first_price,last_price,profit
str,i32,i32,i32
"""D11C83301256CA709BCFBC7FDF5EB9…",280000,295000,15000
"""F736B69B44415DD5C0126D56E55A7F…",70000,70000,0
"""770AAB0E234EE249362FBACD5AF247…",145000,195000,50000
"""F7782781F9B93E7C460DC743F1B69A…",190000,205000,15000
"""6718E17E744B01FB3B4F1B6F1517E2…",175000,175000,0
…,…,…,…
"""310488721DDF74933F97EDD2323E6D…",175000,165000,-10000
"""1351937E47CACB1ECEF622C6B0C587…",85000,135000,50000
"""34D34FE3EADFACBD0604AB58E8ABF5…",80000,167500,87500
"""0AD6188262C701C703A8CE7A0B660B…",150000,315000,165000


In [53]:
# SQL variant of the first/last price comparison, run through the `query`
# helper imported from utils.
# For each address_id it takes the price of the row ranked first by ascending
# date_of_transfer and the price of the row ranked first by descending
# date_of_transfer, then keeps only addresses where the two prices differ.
# NOTE(review): rows of one address that share a date_of_transfer are ranked
# arbitrarily by ROW_NUMBER, so the chosen price is not deterministic then.
# NOTE(review): the address_ids CTE looks redundant, since each rn = 1 side
# already has one row per address_id; confirm before simplifying.
query("""
WITH first_prices AS (
    SELECT 
        address_id, 
        price AS first_price,
        ROW_NUMBER() OVER (PARTITION BY address_id ORDER BY date_of_transfer) AS rn
    FROM house_prices.fct_house_prices
),
last_prices AS (
    SELECT 
        address_id, 
        price AS last_price,
        ROW_NUMBER() OVER (PARTITION BY address_id ORDER BY date_of_transfer DESC) AS rn
    FROM house_prices.fct_house_prices
),
address_ids AS (
    SELECT DISTINCT address_id 
    FROM house_prices.fct_house_prices
)
SELECT 
    a.address_id,
    f.first_price,
    l.last_price
FROM address_ids a
JOIN first_prices f ON f.address_id = a.address_id AND f.rn = 1
JOIN last_prices l ON l.address_id = a.address_id AND l.rn = 1
WHERE first_price != last_price
""")

address_id,first_price,last_price
str,i64,i64
"""8D948EBAD8344773E4C475307B9FE0…",600000,680000
"""6D627C9D6FCC2B13365706E16D59C1…",168000,180000
"""B46167FB6E8E1A6660FF5BC762A4BF…",71000,110000
"""44E7076FF472DE56E3D8C89746D052…",135000,142500
"""54F8E080E07DC6CDA2A667018084A3…",52000,80000
…,…,…
"""B3CC87F08337616EE4B2D4E2254C42…",270000,1500
"""BE16269AA83B04BC5E9E4A47474850…",187000,175000
"""61238F4BC763C097DA0216AE447F32…",425000,405000
"""4CECE9E2719C6D06FE57FED519DB86…",85000,105000


In [39]:
# Alternative SQL variant, also run through the `query` helper from utils.
# The inner query tags every row with the earliest and latest date_of_transfer
# of its address_id; the outer query then aggregates per address_id.
# If several rows fall on the earliest date the lowest price is reported as
# first_price, and if several fall on the latest date the highest price is
# reported as last_price.
# Unlike the ROW_NUMBER variant, no filter is applied here, so addresses with
# a single row or with equal first and last prices are included.
query("""
SELECT 
    address_id,
    MIN(price) FILTER (WHERE date_of_transfer = min_date) AS first_price,
    MAX(price) FILTER (WHERE date_of_transfer = max_date) AS last_price
FROM (
    SELECT 
        address_id, 
        price,
        date_of_transfer,
        MIN(date_of_transfer) OVER (PARTITION BY address_id) AS min_date,
        MAX(date_of_transfer) OVER (PARTITION BY address_id) AS max_date
    FROM house_prices.fct_house_prices
) t
GROUP BY address_id
""")

address_id,first_price,last_price
str,i64,i64
"""00008F6396C347426C4AC1CBABDD9F…",175000,175000
"""0000E9EC41DCE8A0ED4BB994432332…",450000,450000
"""00013FE44A6098DA94C5FBFBD51276…",180000,180000
"""000251C7DF7D243CAD7D1AB33A8508…",148000,148000
"""000274A8C9271FB09966E43CD71AB1…",567000,567000
…,…,…
"""FFFDC0D093DD71150783AD7B6AF8B7…",345000,345000
"""FFFE4030D86A19201FD3FED9084BDF…",491000,491000
"""FFFE7004514D99FC9EFACEECDCBC8D…",425000,425000
"""FFFECADB0FAD2FC4C7F54A2BB5E809…",217000,217000
