In [180]:
import sqlalchemy as sa
from utils import query, engine, catalog
import polars as pl
# Notebook-wide polars display options.
# Show up to 50 characters of each string value when frames are rendered.
pl.Config.set_fmt_str_lengths(50)
# Turn on the thousands separator for numeric values in rendered frames.
pl.Config.set_thousands_separator(True)

polars.config.Config

In [30]:
import sqlalchemy as sa

# CTAS statement that (re)builds house_prices.dim_address from house_prices.raw.
# The table is partitioned by bucket(address_id, 10) and holds one row per
# distinct combination of the eight address columns.
#
# address_id is the hex-encoded MD5 of the eight address columns concatenated
# in a fixed order, with NULL treated as ''. fct_prices_sql derives address_id
# with the same expression, so the two must be kept in sync.
#
# NOTE(review): the fields are concatenated without a delimiter, so different
# field splits (e.g. paon='1', saon='23' vs paon='12', saon='3') and NULL vs ''
# produce the same id. Adding a separator would avoid this, but it has to be
# changed in both statements at once and would change every existing id.
dim_address_sql = """
CREATE OR REPLACE TABLE house_prices.dim_address 
    WITH ( partitioning = ARRAY['bucket(address_id, 10)'] )
    AS (
    SELECT DISTINCT to_hex(md5(cast(
        coalesce(paon, '') ||
        coalesce(saon, '') ||
        coalesce(street, '') ||
        coalesce(locality, '') ||
        coalesce(town, '') ||
        coalesce(district, '') ||
        coalesce(county, '') ||
        coalesce(postcode, '')
    as varbinary))) AS address_id,
      paon,
      saon,
      street,
      locality,
      town,
      district,
      county,
      postcode
FROM house_prices.raw)
"""

In [31]:
# CTAS statement that (re)builds house_prices.fct_house_prices from
# house_prices.raw, partitioned by month(date_of_transfer).
#
# Steps:
#   1. ranked_records  - number the rows of each transaction_id, newest
#                        date_of_transfer first.
#   2. latest_records  - keep only the top-ranked row per transaction_id.
#   3. with_address_id - attach address_id (same hex(md5(...)) expression as
#                        dim_address_sql, so the two tables join on it) and
#                        drop rows with record_status 'D' or a
#                        ppd_category_type other than 'A'.
#
# The window orders by the full date_of_transfer. Ordering by
# month(date_of_transfer) would compare only the calendar month number (1-12),
# so a December record from an earlier year would outrank a January record
# from a later year.
# NOTE(review): rows of one transaction_id sharing the same date_of_transfer
# are still ranked arbitrarily; add a tie-breaker if the raw table has a
# load/ingestion column.
fct_prices_sql = """
CREATE OR REPLACE TABLE house_prices.fct_house_prices
    WITH ( partitioning = ARRAY['month(date_of_transfer)'] ) AS (
        WITH ranked_records AS (
            SELECT *,
            ROW_NUMBER () OVER (PARTITION BY transaction_id ORDER BY date_of_transfer DESC) AS rn
            FROM house_prices.raw
    ),
    latest_records AS (
        SELECT *
        FROM ranked_records
        WHERE rn = 1
    ),
    with_address_id AS (
        SELECT to_hex(md5(cast (
                coalesce(paon, '') ||
                coalesce(saon, '') ||
                coalesce(street, '') ||
                coalesce(locality, '') ||
                coalesce(town, '') ||
                coalesce(district, '') ||
                coalesce(county, '') ||
                coalesce(postcode, '')
            as varbinary))) AS address_id,
                transaction_id,
                price,
                date_of_transfer,
                property_type,
                new_property,
                duration,
                ppd_category_type,
                record_status
        FROM latest_records
        WHERE record_status != 'D' and ppd_category_type = 'A'
    )
    SELECT *
    FROM with_address_id
    )
"""

In [32]:
# Run both CTAS statements on one connection, dim_address first, and keep the
# first column of each statement's first result row (reported below as the
# number of rows written).
# NOTE(review): `engine` comes from utils; presumably a SQLAlchemy engine, so
# the `with` block commits on success and rolls back on error - confirm.
with engine.begin() as conn:
    # The generator is consumed in order inside the `with` block, so each
    # statement is executed and read before the next one starts.
    num_rows_dim_address, num_rows_fct_prices = (
        conn.execute(sa.text(statement)).fetchone()[0]
        for statement in (dim_address_sql, fct_prices_sql)
    )

print(f"Created dim_address with {num_rows_dim_address:,} rows")
print(f"Created fct_prices with {num_rows_fct_prices:,} rows")

Created dim_address with 7,498,409 rows
Created fct_prices with 7,592,564 rows


In [181]:
# Load a handle to the fact table from the catalog; it is scanned lazily with
# polars in the next cell.
fct_house_prices_t = catalog.load_table("house_prices.fct_house_prices")

In [182]:
# Per-address resale summary: first and last sale date and price, the number
# of days between them, and the price difference ("profit").
polars_result = (
    pl.scan_iceberg(fct_house_prices_t)
    # Sort chronologically so first()/last() within each address group pick
    # the earliest and latest sale.
    .sort("date_of_transfer")
    .group_by("address_id", maintain_order=True)
    .agg(
        pl.col("date_of_transfer").first().alias("first_day"),
        pl.col("date_of_transfer").last().alias("last_day"),
        pl.col("price").first().alias("first_price"),
        pl.col("price").last().alias("last_price"),
        # Computed once here; the result is not recomputed later in the chain.
        (pl.col("date_of_transfer").last() - pl.col("date_of_transfer").first())
        .dt.total_days()
        .alias("days_held"),
    )
    # Keep only addresses whose first and last sale fall on different days.
    .filter(pl.col("first_day") != pl.col("last_day"))
    .with_columns(
        (pl.col("last_price") - pl.col("first_price")).alias("profit"),
    )
    .collect()
)
polars_result



address_id,first_day,last_day,first_price,last_price,days_held,profit
str,date,date,i32,i32,i64,i32
"""523E32EA157C4EC2AD34B9F4E5AA59F5""",2015-01-01,2024-09-20,83000,242500,3550,159500
"""D1E83E822AB222D7C60CA95A822A98EB""",2015-01-01,2023-01-06,80000,90000,2927,10000
"""CE5328CF989858FC5D7ED07716F6F86E""",2015-01-01,2021-12-17,296250,358000,2542,61750
"""7BC097DC2832C45EBE135BCF913A17CE""",2015-01-01,2015-08-20,110000,146000,231,36000
"""03FC21590BEBD40929851C44DC9366CB""",2015-01-01,2024-02-19,80000,194000,3336,114000
…,…,…,…,…,…,…
"""AB3539A7D8A553CD88D296C421D1A3F6""",2024-10-30,2024-11-29,435000,800000,30,365000
"""A6650BAD4B0F5041B7FBC254F780D151""",2024-10-31,2024-11-22,110000,119950,22,9950
"""809A689F9994FE55DEBB2E751DCD49C1""",2024-11-01,2024-12-20,147500,172500,49,25000
"""CBCB718361AC3FCE9C880CB9FAAFDE22""",2024-11-28,2024-11-29,800000,800000,1,0


In [154]:
# Create house_prices.profits if it does not exist yet, using the Arrow schema
# of polars_result as the table schema.
profits_t = catalog.create_table_if_not_exists("house_prices.profits", schema=polars_result.to_arrow().schema)

In [155]:
# polars_result is a full recomputation of the profits data, so replace the
# table contents instead of appending. With append, every re-run of the
# notebook against an existing table would add a second copy of every row.
profits_t.overwrite(polars_result.to_arrow())



In [172]:
def query_profits(year: int) -> pl.DataFrame:
    """Return the mean profit of addresses whose ``first_day`` falls in ``year``.

    The ``house_prices.profits`` table is loaded from the catalog on every
    call and scanned lazily; only the filtered aggregate is collected.

    Args:
        year: Calendar year matched against the year of ``first_day``.

    Returns:
        A frame with a single column, ``mean_house_profits``, holding the mean
        of ``profit`` over the matching rows.
    """
    profits_table = catalog.load_table("house_prices.profits")

    first_sale_in_year = pl.col("first_day").dt.year() == year
    mean_profit = pl.col("profit").mean().alias("mean_house_profits")

    return (
        pl.scan_iceberg(profits_table)
        .filter(first_sale_in_year)
        .select(mean_profit)
        .collect()
    )

In [173]:
# Mean profit for addresses whose first recorded sale was in 2016.
query_profits(2016)

mean_house_profits
f64
54172.897164
