In [73]:
import sqlalchemy as sa
from utils import query, engine, catalog
import polars as pl
pl.Config.set_fmt_str_lengths(50)

polars.config.Config

In [30]:
import sqlalchemy as sa

dim_address_sql = """
CREATE OR REPLACE TABLE house_prices.dim_address 
    WITH ( partitioning = ARRAY['bucket(address_id, 10)'] )
    AS (
    SELECT DISTINCT to_hex(md5(cast(
        coalesce(paon, '') ||
        coalesce(saon, '') ||
        coalesce(street, '') ||
        coalesce(locality, '') ||
        coalesce(town, '') ||
        coalesce(district, '') ||
        coalesce(county, '') ||
        coalesce(postcode, '')
    as varbinary))) AS address_id,
      paon,
      saon,
      street,
      locality,
      town,
      district,
      county,
      postcode
FROM house_prices.raw)
"""

In [31]:
fct_prices_sql = """
CREATE OR REPLACE TABLE house_prices.fct_house_prices
    WITH ( partitioning = ARRAY['month(date_of_transfer)'] ) AS (
        WITH ranked_records AS (
            SELECT *,
            ROW_NUMBER () OVER (PARTITION BY transaction_id ORDER BY month(date_of_transfer) DESC) AS rn
            FROM house_prices.raw
    ),
    latest_records AS (
        SELECT *
        FROM ranked_records
        WHERE rn = 1
    ),
    with_address_id AS (
        SELECT to_hex(md5(cast (
                coalesce(paon, '') ||
                coalesce(saon, '') ||
                coalesce(street, '') ||
                coalesce(locality, '') ||
                coalesce(town, '') ||
                coalesce(district, '') ||
                coalesce(county, '') ||
                coalesce(postcode, '')
            as varbinary))) AS address_id,
                transaction_id,
                price,
                date_of_transfer,
                property_type,
                new_property,
                duration,
                ppd_category_type,
                record_status
        FROM latest_records
        WHERE record_status != 'D' and ppd_category_type = 'A'
    )
    SELECT *
    FROM with_address_id
    )
"""

In [32]:
with engine.begin() as conn:
    num_rows_dim_address = conn.execute(sa.text(dim_address_sql)).fetchone()[0]
    num_rows_fct_prices = conn.execute(sa.text(fct_prices_sql)).fetchone()[0]

print(f"Created dim_address with {num_rows_dim_address:,} rows")
print(f"Created fct_prices with {num_rows_fct_prices:,} rows")

Created dim_address with 7,498,409 rows
Created fct_prices with 7,592,564 rows


In [47]:
fct_house_prices_t = catalog.load_table("house_prices.fct_house_prices")

In [61]:
polars_result = (
    pl.scan_iceberg(fct_house_prices_t)
    #.join(sold_address_ids, on="address_id", how="inner")
    .sort("date_of_transfer").group_by("address_id", maintain_order=True).agg(
        pl.col("date_of_transfer").first().alias("first_day"),
        pl.col("date_of_transfer").last().alias("last_day"),
        pl.col("price").first().alias('first_price'),
        pl.col("price").last().alias("last_price"),
        (pl.col("price").last() - pl.col("price").first()).alias("profit")   
    ).filter(pl.col("first_day") != pl.col("last_day"))
    .collect()
)
polars_result

address_id,first_day,last_day,first_price,last_price,profit
str,date,date,i32,i32,i32
"""523E32EA157C4EC2AD34B9F4E5AA59…",2015-01-01,2024-09-20,83000,242500,159500
"""D1E83E822AB222D7C60CA95A822A98…",2015-01-01,2023-01-06,80000,90000,10000
"""CE5328CF989858FC5D7ED07716F6F8…",2015-01-01,2021-12-17,296250,358000,61750
"""7BC097DC2832C45EBE135BCF913A17…",2015-01-01,2015-08-20,110000,146000,36000
"""03FC21590BEBD40929851C44DC9366…",2015-01-01,2024-02-19,80000,194000,114000
…,…,…,…,…,…
"""AB3539A7D8A553CD88D296C421D1A3…",2024-10-30,2024-11-29,435000,800000,365000
"""A6650BAD4B0F5041B7FBC254F780D1…",2024-10-31,2024-11-22,110000,119950,9950
"""809A689F9994FE55DEBB2E751DCD49…",2024-11-01,2024-12-20,147500,172500,25000
"""CBCB718361AC3FCE9C880CB9FAAFDE…",2024-11-28,2024-11-29,800000,800000,0


In [53]:
trino_df = query("""
WITH first_prices AS (
    SELECT 
        address_id, 
        price AS first_price,
        ROW_NUMBER() OVER (PARTITION BY address_id ORDER BY date_of_transfer) AS rn
    FROM house_prices.fct_house_prices
),
last_prices AS (
    SELECT 
        address_id, 
        price AS last_price,
        ROW_NUMBER() OVER (PARTITION BY address_id ORDER BY date_of_transfer DESC) AS rn
    FROM house_prices.fct_house_prices
),
address_ids AS (
    SELECT DISTINCT address_id 
    FROM house_prices.fct_house_prices
)
SELECT 
    a.address_id,
    f.first_price,
    l.last_price,
    l.last_price - f.first_price as profit
FROM address_ids a
JOIN first_prices f ON f.address_id = a.address_id AND f.rn = 1
JOIN last_prices l ON l.address_id = a.address_id AND l.rn = 1
WHERE first_price != last_price
""")

In [55]:
(
    polars_result.select(
        pl.col("address_id"), 
        pl.col("first_price"),
        pl.col("last_price"),
        pl.col("profit").alias("pl_profit")
    )
        .join(
            trino_df.select(
                pl.col("address_id"),
                pl.col("first_price"),
                pl.col("last_price"),
                pl.col("profit").alias("trino_profit")
            ), on="address_id", how="full")
        .filter(pl.col("pl_profit") != pl.col("trino_profit"))
        .select("address_id").to_series(0).to_list()
)

['79BB49ACA3FE0C605577A2074CD56809',
 '9772E37986862F846B870387916A2737',
 '31AF428411C23EA038B69B8DA892BF2D',
 '5761FBB2C75E40B81E7807B08A9C660B',
 '713F4B7BCD38B6A4353BEF0E14E1EDD6',
 '90E3266AEAE18E87EEFC4113E1A6EF36',
 '03FC97AF7DEC446F33F4E396710774BF',
 'EE7FD15C29D3E13585166D956F84EE4D',
 '66CBF8557B4297420D6A350DB7D891E0',
 'AE1F750B14E3E60B53EADD970A72E0D6',
 'B96B1DB1D20B28371F1F085F083F982D',
 '53EF34E033E27A65D002DA6E0AFF02F4',
 '56DA64BA4B25645D1867D7113FE3298E',
 '8EA1FD31C16ED5F3297801BFC51BD888',
 '0DA0EA865DAE8D7678D055EAF0988D65',
 'DFF079741B290D0FA041FCF040B88449',
 '4DDD2D3E8AF1B04874F3747C5F5191D9',
 '0AE4ED8E66DDC9BB674ABA9695F828FF',
 '07A5AAEDA2EA4BCC15D0787B9744A4CF',
 '37BEA26FB598442AB4ACCB2E355F2992',
 'C22320ACE9007E5B7A6C2F9F6FF23900',
 '2542908E8092D2DD1D3D2C1EB6E9A10E',
 'A20439D54C70D4D5E9F2A75D2548B94B',
 'A316214BB9E8635EDF6F6914B561C700',
 '0C45D3E18EC20C34D35A3836957EBC76',
 '0B479F68D2A3E6708F092E9220F84458',
 'D53D612F35ACBAA557E784C676B87B29',
 

In [47]:
query("""
with partitioned_prices as (
    SELECT 
        address_id, 
        price,
        date_of_transfer,
        MIN(date_of_transfer) OVER (PARTITION BY address_id) AS min_date,
        MAX(date_of_transfer) OVER (PARTITION BY address_id) AS max_date
    FROM house_prices.fct_house_prices),
first_and_last as (
SELECT 
    address_id,
    MIN(price) FILTER (WHERE date_of_transfer = min_date) AS first_price,
    MAX(price) FILTER (WHERE date_of_transfer = max_date) AS last_price
FROM partitioned_prices
GROUP BY address_id
)
select * from first_and_last where first_price != last_price
""")

address_id,first_price,last_price
str,i64,i64
"""00006B5897C5BB86759B556B52157C…",108000,124000
"""0000745F4634817B95572CB1756C94…",29000,40000
"""000091C535F9756E2D6582B40FAFEB…",230000,277000
"""000093AFECB41A4DBFBE332F1EC2FB…",361750,435000
"""0000A07625D3F4D73A89F33BA99239…",100000,105000
…,…,…
"""FFFD3478554CD9C5DA63BAFB3B42E8…",186915,301450
"""FFFD45E32BD27F0642CAB14CF7F218…",475000,451000
"""FFFDED4D01596AFB92B956E66DD5B7…",87500,85000
"""FFFF70E31671842771963D9E5B3849…",61000,100000
