# Enriched Properties DataFrame

Reads the merged parquet from `sales_data/all_properties.parquet` and parses the `extra_features` JSON column into structured columns.

In [1]:
import sys
from pathlib import Path

# Locate the project root: when the kernel's working directory is the
# `notebooks` folder the root is its parent, otherwise the working directory
# itself is used. The root goes first on sys.path so `app` can be imported.
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
sys.path.insert(0, str(PROJECT_ROOT))

import pandas as pd
from app.feature_parser import parse_all_features

# Load the merged sales parquet and report its dimensions.
PARQUET_PATH = PROJECT_ROOT / "sales_data" / "all_properties.parquet"
df = pd.read_parquet(PARQUET_PATH)
print(f"Loaded {df.shape[0]} rows, {df.shape[1]} columns")
df.head(3)

Loaded 1076 rows, 14 columns


Unnamed: 0,address,postcode,property_type,bedrooms,bathrooms,extra_features,floorplan_urls,url,date_sold,date_sold_iso,price,price_numeric,price_change_pct,tenure
0,"100, Beverley Way, West Wimbledon, London SW20...",SW20 0AQ,"Semi-detached,Freehold",4,2,"[""CHAIN FREE"", ""Semi-Detached Mock Tudor Style...",,https://www.rightmove.co.uk/house-prices/detai...,10 Aug 2012,2012-08-10,"£380,000",380000,,FREEHOLD
1,"100, Beverley Way, West Wimbledon, London SW20...",SW20 0AQ,"Semi-detached,Freehold",4,2,"[""CHAIN FREE"", ""Semi-Detached Mock Tudor Style...",,https://www.rightmove.co.uk/house-prices/detai...,9 Mar 2021,2021-03-09,"£650,000",650000,+71%,FREEHOLD
2,"101, Coombe Lane, London SW20 0BD",SW20 0BD,"Semi-detached,Freehold",0,0,,,https://www.rightmove.co.uk/house-prices/detai...,18 Aug 1995,1995-08-18,"£111,500",111500,,FREEHOLD


## Parse extra_features into structured columns

In [None]:
# Parse extra_features for each row.
# Build the parsed frame in a single DataFrame construction instead of
# `.apply(pd.Series)`, which creates one Series object per row and is slow.
# NOTE(review): assumes parse_all_features returns one mapping (or Series) of
# feature name -> value per row — confirm against app.feature_parser.
feature_records = df["extra_features"].apply(parse_all_features)
parsed = pd.DataFrame(
    # A missing record becomes an empty one, so the row is kept with all-NaN features.
    [record if record is not None else {} for record in feature_records],
    index=df.index,  # keep row alignment with the source frame for the concat below
)

# pd.concat(axis=1) silently keeps duplicate column names; fail early instead
# of producing an ambiguous frame.
overlapping_cols = df.columns.intersection(parsed.columns)
assert overlapping_cols.empty, f"Parsed columns clash with existing columns: {list(overlapping_cols)}"

# Join parsed columns to original dataframe
enriched = pd.concat([df, parsed], axis=1)

# Names of the columns produced by the parser; later cells use this list.
NEW_COLS = list(parsed.columns)

print(f"Enriched: {len(enriched)} rows, {len(enriched.columns)} columns ({len(NEW_COLS)} new)")
enriched[["address", "postcode"] + NEW_COLS].head(10)

## Fill rates per column

In [3]:
# Per parsed column: number of non-null rows, total row count, and the
# non-null share as a percentage rounded to one decimal place.
fill_rates = (
    enriched[NEW_COLS]
    .notna()
    .sum()
    .to_frame("filled")
    .assign(total=len(enriched))
    .assign(pct=lambda rates: (rates["filled"] / rates["total"] * 100).round(1))
)
# Show the best-populated columns first.
fill_rates.sort_values("pct", ascending=False)

Unnamed: 0,filled,total,pct
garden,624,1076,58.0
parking,372,1076,34.6
epc_rating,210,1076,19.5
lease_type,158,1076,14.7
chain_free,151,1076,14.0
double_glazed,105,1076,9.8
heating,99,1076,9.2
council_tax_band,96,1076,8.9
lease_years,72,1076,6.7
receptions,62,1076,5.8


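## Value distributions

> **Note (to verify):** in the saved output below, `epc_rating` is `G` for 144 of the 210 filled rows. That share looks implausibly high for a single band and may indicate false-positive matches in the parser rather than real ratings; check `parse_all_features` before relying on this column.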

In [4]:
# For every parsed column that has at least one value, print a banner with the
# filled/total counts followed by its ten most frequent values.
rule = "=" * 50
total_rows = len(enriched)
for feature in NEW_COLS:
    observed = enriched[feature].dropna()
    if not len(observed):
        # Nothing was parsed for this column; skip the empty section.
        continue
    heading = f"{feature} ({len(observed)} filled / {total_rows} total)"
    print(f"\n{rule}", heading, rule, sep="\n")
    print(observed.value_counts().head(10).to_string())



epc_rating (210 filled / 1076 total)
epc_rating
G    144
C     29
D     20
E     10
F      3
B      3
A      1

council_tax_band (96 filled / 1076 total)
council_tax_band
C    53
D    22
E    12
F     5
B     4

chain_free (151 filled / 1076 total)
chain_free
True     148
False      3

parking (372 filled / 1076 total)
parking
Off-street    130
Parking       109
Garage         89
Driveway       44

garden (624 filled / 1076 total)
garden
Garden             326
Private Garden     152
Balcony             68
Communal Garden     31
Patio               24
Terrace             23

heating (99 filled / 1076 total)
heating
Gas Central        58
Central Heating    24
Underfloor          9
Electric            4
Other               4

double_glazed (105 filled / 1076 total)
double_glazed
True    105

lease_type (158 filled / 1076 total)
lease_type
Leasehold            116
Share of Freehold     34
Freehold               8

lease_years (72 filled / 1076 total)
lease_years
114.0    9
985.0    9
88.0

## Save enriched dataframe

In [5]:
# Write the enriched frame next to the source parquet (snappy-compressed,
# pandas index omitted from the file).
output_path = PROJECT_ROOT / "sales_data" / "enriched_properties.parquet"
enriched.to_parquet(output_path, engine="pyarrow", compression="snappy", index=False)
# Report the location relative to the project root so the saved cell output
# does not embed a machine-specific absolute path (user name, drive letter).
print(f"Saved to {output_path.relative_to(PROJECT_ROOT)}")
print(f"  {len(enriched)} rows x {len(enriched.columns)} columns")
print(f"  Columns: {list(enriched.columns)}")

Saved to c:\Users\jacob\dev\rightmove-api\sales_data\enriched_properties.parquet
  1076 rows x 24 columns
  Columns: ['address', 'postcode', 'property_type', 'bedrooms', 'bathrooms', 'extra_features', 'floorplan_urls', 'url', 'date_sold', 'date_sold_iso', 'price', 'price_numeric', 'price_change_pct', 'tenure', 'epc_rating', 'council_tax_band', 'chain_free', 'parking', 'garden', 'heating', 'double_glazed', 'lease_type', 'lease_years', 'receptions']
