# 03_prepare_final (housing / CSV)
**Authors**: <fill names> | **Owner**: <name> | **Reviewer**: <name> | **Date**: 2025-11-07

**Purpose:** Deterministic pipeline CSV -> CSV

> These notebooks assume the first ingest saved **CSV** outputs into `intermediate_data/` (not Parquet).
> Paths used:
> - Raw CSV: `price_paid_records/price_paid_records.csv`
> - Combined CSV: `intermediate_data/housing_all.csv`
> - Optional partitioned by year: `intermediate_data/partitioned_csv/year=YYYY.csv`


In [1]:
!pip install pandas --quiet


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:

from pathlib import Path
import pandas as pd

INT_DIR = Path("intermediate_data")
PROC_DIR = Path("processed_data")
PROC_DIR.mkdir(parents=True, exist_ok=True)

csv_path = INT_DIR / "housing_all.csv"
assert csv_path.exists(), f"Expected combined CSV at: {csv_path}"

df = pd.read_csv(csv_path, low_memory=False)

def normalize_columns(df):
    df = df.copy()
    df.columns = (
        df.columns.str.strip()
                  .str.lower()
                  .str.replace(r"\s+", "_", regex=True)
                  .str.replace(r"[^0-9a-zA-Z_]+", "", regex=True)
    )
    return df

def pipe_clean(df):
    df = normalize_columns(df).copy()

    if "price" in df.columns:
        df["price"] = pd.to_numeric(df["price"], errors="coerce")
    for c in ["region","property_type","tenure"]:
        if c in df.columns:
            df[c] = df[c].astype("category")

    for c in ["year","month"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce").astype("Int64")

    if set(["price","region","month"]).issubset(df.columns):
        df["price"] = df.groupby(["region","month"])["price"].transform(lambda s: s.fillna(s.median()))
        df["price"] = df["price"].fillna(df["price"].median())

    if "price" in df.columns:
        q1, q3 = df["price"].quantile([0.25, 0.75])
        iqr = q3 - q1
        lo, hi = q1 - 1.5*iqr, q3 + 1.5*iqr
        df["price"] = df["price"].clip(lo, hi)

    df["is_new_build"] = False
    for c in df.columns:
        if "new" in c and df[c].dtype == "object":
            df["is_new_build"] = df[c].str.contains("new", case=False, na=False)
            break

    for c in ["unnamed_0","index"]:
        if c in df.columns:
            df = df.drop(columns=c)

    return df

df = pipe_clean(df)

target = "price"
base_cols = [c for c in df.columns if c != target]
keep = []
for c in base_cols:
    if df[c].dtype.kind in "biufc":
        keep.append(c)
    elif str(df[c].dtype) == "category":
        keep.append(c)

model_df = df[keep + [target]].dropna()
out_path = PROC_DIR / "housing_model_ready.csv"
model_df.to_csv(out_path, index=False)
print("Wrote:", out_path, "rows:", len(model_df), "cols:", len(model_df.columns))


  df["price"] = df.groupby(["region","month"])["price"].transform(lambda s: s.fillna(s.median()))


Wrote: processed_data\housing_model_ready.csv rows: 21550209 cols: 6


In [3]:
from pathlib import Path
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Folders (match what you used earlier)
INT_DIR = Path("intermediate_data")
PROC_DIR = Path("processed_data")
INT_DIR.mkdir(parents=True, exist_ok=True)
PROC_DIR.mkdir(parents=True, exist_ok=True)

# ---- 2A) Save the fully cleaned dataset (before column filtering) ----
# Assumes your pipeline variable is named `df` after pipe_clean(...)
clean_csv = INT_DIR / "housing_all_cleaned.csv"
clean_parquet = INT_DIR / "housing_all_cleaned.parquet"

df.to_csv(clean_csv, index=False)
pq.write_table(pa.Table.from_pandas(df, preserve_index=False), clean_parquet)

print("Saved cleaned CSV  ->", clean_csv)
print("Saved cleaned PARQ ->", clean_parquet)

# ---- 2B) Save the model-ready dataset (after selecting/encoding columns) ----
# Assumes your final modeling frame is `model_df`
model_csv = PROC_DIR / "housing_model_ready.csv"        # you probably already create this
model_parquet = PROC_DIR / "housing_model_ready.parquet"

# (Re-save CSV to be sure it's fresh; or comment this out if you already wrote it)
model_df.to_csv(model_csv, index=False)

pq.write_table(pa.Table.from_pandas(model_df, preserve_index=False), model_parquet)

print("Saved model-ready CSV  ->", model_csv)
print("Saved model-ready PARQ ->", model_parquet)


Saved cleaned CSV  -> intermediate_data\housing_all_cleaned.csv
Saved cleaned PARQ -> intermediate_data\housing_all_cleaned.parquet
Saved model-ready CSV  -> processed_data\housing_model_ready.csv
Saved model-ready PARQ -> processed_data\housing_model_ready.parquet
