In [12]:
import sqlalchemy as sa
from utils import query, engine, catalog
import polars as pl
# Notebook-wide polars display options: render up to 50 characters of each
# string value and format numbers with a thousands separator.
pl.Config.set_fmt_str_lengths(50)
# Last expression of the cell, so its return value is what the cell displays.
pl.Config.set_thousands_separator(True)

polars.config.Config

In [13]:
import sqlalchemy as sa

# SQL that (re)creates house_prices.dim_address from house_prices.raw, with the
# table partitioned by bucket(address_id, 10).
#
# address_id is the hex-encoded MD5 of the eight address columns concatenated
# in a fixed order, with NULLs replaced by ''. The same expression is repeated
# in fct_prices_sql; the two must stay identical or the ids will not join.
#
# NOTE(review): the columns are concatenated without a separator, so different
# splits of the same characters (e.g. paon='1', saon='23' vs paon='12',
# saon='3') produce the same address_id.
# NOTE(review): DISTINCT applies to the whole row, while the hash treats NULL
# and '' alike, so rows differing only in NULL vs '' share one address_id.
# NOTE(review): the WITH (partitioning = ARRAY[...]) syntax looks like the
# Trino Iceberg connector - confirm against the engine configured in utils.
dim_address_sql = """
CREATE OR REPLACE TABLE house_prices.dim_address 
    WITH ( partitioning = ARRAY['bucket(address_id, 10)'] )
    AS (
    SELECT DISTINCT to_hex(md5(cast(
        coalesce(paon, '') ||
        coalesce(saon, '') ||
        coalesce(street, '') ||
        coalesce(locality, '') ||
        coalesce(town, '') ||
        coalesce(district, '') ||
        coalesce(county, '') ||
        coalesce(postcode, '')
    as varbinary))) AS address_id,
      paon,
      saon,
      street,
      locality,
      town,
      district,
      county,
      postcode
FROM house_prices.raw)
"""

In [14]:
# SQL that (re)creates house_prices.fct_house_prices from house_prices.raw,
# with the table partitioned by month(date_of_transfer).
#
# Steps inside the statement:
#   1. ranked_records  - number the rows of each transaction_id, newest
#                        date_of_transfer first.
#   2. latest_records  - keep only the newest row per transaction_id (rn = 1).
#   3. with_address_id - drop rows with record_status 'D', keep only
#                        ppd_category_type 'A', and derive address_id with the
#                        same MD5 expression used in dim_address_sql.
#
# The window orders by the full date_of_transfer. Ordering by
# month(date_of_transfer) would compare only the month-of-year (1-12), so a
# December row from an earlier year would outrank a January row from a later
# year. The month(...) in the partitioning clause is a partition transform and
# is unaffected.
fct_prices_sql = """
CREATE OR REPLACE TABLE house_prices.fct_house_prices
    WITH ( partitioning = ARRAY['month(date_of_transfer)'] ) AS (
        WITH ranked_records AS (
            SELECT *,
            ROW_NUMBER () OVER (PARTITION BY transaction_id ORDER BY date_of_transfer DESC) AS rn
            FROM house_prices.raw
    ),
    latest_records AS (
        SELECT *
        FROM ranked_records
        WHERE rn = 1
    ),
    with_address_id AS (
        SELECT to_hex(md5(cast (
                coalesce(paon, '') ||
                coalesce(saon, '') ||
                coalesce(street, '') ||
                coalesce(locality, '') ||
                coalesce(town, '') ||
                coalesce(district, '') ||
                coalesce(county, '') ||
                coalesce(postcode, '')
            as varbinary))) AS address_id,
                transaction_id,
                price,
                date_of_transfer,
                property_type,
                new_property,
                duration,
                ppd_category_type,
                record_status
        FROM latest_records
        WHERE record_status != 'D' and ppd_category_type = 'A'
    )
    SELECT *
    FROM with_address_id
    )
"""

In [15]:
# Execute both CREATE OR REPLACE TABLE statements on the connection yielded by
# engine.begin(), dimension table first. The first column of each statement's
# first result row is read as the number of rows written.
with engine.begin() as conn:
    dim_address_result = conn.execute(sa.text(dim_address_sql))
    num_rows_dim_address = dim_address_result.fetchone()[0]

    fct_prices_result = conn.execute(sa.text(fct_prices_sql))
    num_rows_fct_prices = fct_prices_result.fetchone()[0]

print(f"Created dim_address with {num_rows_dim_address:,} rows")
print(f"Created fct_prices with {num_rows_fct_prices:,} rows")

Created dim_address with 19,686,507 rows
Created fct_prices with 28,430,311 rows


In [16]:
# Handle to the fact table, loaded through the catalog for the polars scan.
fct_house_prices_name = "house_prices.fct_house_prices"
fct_house_prices_t = catalog.load_table(fct_house_prices_name)

In [17]:
# For every address_id: the date and price of its first and last recorded sale,
# the number of days between them, and the price difference. Addresses whose
# first and last sale fall on the same day are dropped.
polars_result = (
    pl.scan_iceberg(fct_house_prices_t)
    # Sort by date so first()/last() inside each group pick the earliest and
    # latest sale (relies on group_by keeping row order within a group).
    .sort("date_of_transfer")
    .group_by("address_id", maintain_order=True)
    .agg(
        pl.col("date_of_transfer").first().alias("first_day"),
        pl.col("date_of_transfer").last().alias("last_day"),
        pl.col("price").first().alias("first_price"),
        pl.col("price").last().alias("last_price"),
    )
    .filter(pl.col("first_day") != pl.col("last_day"))
    .with_columns(
        # Computed once, from the aggregated columns. days_held is listed
        # before profit so the resulting column order stays
        # (..., last_price, days_held, profit).
        (pl.col("last_day") - pl.col("first_day")).dt.total_days().alias("days_held"),
        (pl.col("last_price") - pl.col("first_price")).alias("profit"),
    )
    .collect()
)
polars_result

address_id,first_day,last_day,first_price,last_price,days_held,profit
str,date,date,i32,i32,i64,i32
"""3F3E05E14170C1790BB29FF6080F2F9E""",1995-01-01,2001-03-15,43000,68000,2265,25000
"""B34A17BFF459876877C5AB9355720669""",1995-01-01,2007-10-11,35000,302000,4666,267000
"""31BA6C4E2154FC4D7D68779F377BB833""",1995-01-01,2004-06-11,23000,110000,3449,87000
"""4ACFE8A0A9EDA3FB34633273FE500AC3""",1995-01-01,1998-02-27,50000,59995,1153,9995
"""D56AD400DEACA9B1EA7478D009D6A91D""",1995-01-01,2002-08-19,37000,115000,2787,78000
…,…,…,…,…,…,…
"""E70BFE2CF06C7D468FEB5F293FACCFB1""",2024-10-23,2024-11-19,177500,150000,27,-27500
"""CAE827901134DAD269FE040AA40E879C""",2024-10-28,2024-11-25,210000,210000,28,0
"""AB3539A7D8A553CD88D296C421D1A3F6""",2024-10-30,2024-11-29,435000,800000,30,365000
"""A6650BAD4B0F5041B7FBC254F780D151""",2024-10-31,2024-11-22,110000,119950,22,9950


In [18]:
# Create house_prices.profits if it does not exist yet, using the Arrow schema
# of the collected polars result.
profits_schema = polars_result.to_arrow().schema
profits_t = catalog.create_table_if_not_exists(
    "house_prices.profits",
    schema=profits_schema,
)

In [19]:
# Overwrite the table with the current result, converted to Arrow.
profits_arrow = polars_result.to_arrow()
profits_t.overwrite(profits_arrow)

In [20]:
def query_profits(year: int) -> pl.DataFrame:
    """Return the mean profit of rows whose first sale happened in `year`.

    The `house_prices.profits` table is loaded from `catalog` on every call,
    filtered to rows where the year of `first_day` equals `year`, and the
    `profit` column is averaged.

    Args:
        year: Calendar year compared against the year of `first_day`.

    Returns:
        A collected DataFrame with the single column `mean_house_profits`.
    """
    profits_table = catalog.load_table("house_prices.profits")

    first_sale_in_year = pl.col("first_day").dt.year() == year
    mean_profit = pl.col("profit").mean().alias("mean_house_profits")

    lazy_result = (
        pl.scan_iceberg(profits_table)
        .filter(first_sale_in_year)
        .select(mean_profit)
    )
    return lazy_result.collect()

In [21]:
# Mean profit for addresses whose first recorded sale was in 2016.
query_profits(year=2016)

mean_house_profits
f64
64690.471543
