In [31]:
import pathlib, re, pandas as pd
from tqdm.auto import tqdm
from datetime import datetime

# ── PATHS ────────────────────────────────────────────────────────────────
# All locations are derived from the current working directory, so the
# notebook must be launched from the folder it was written for.
# ROOT is the parent of the working directory; assumed to be the repo root,
# i.e. the notebook runs from a first-level subfolder — TODO confirm.
ROOT      = pathlib.Path().resolve().parents[0]          # repo root
# Folder that main() globs for the raw monthly "parcel_*.csv" snapshots.
RAW_DIR   = ROOT / "data" / "raw" / "parcel-data"
# Output folder for cleaned data.
CLEAN_DIR = ROOT / "data" / "clean"
# Side effect at cell execution time: creates the output folder (and any
# missing parents); a no-op when it already exists.
CLEAN_DIR.mkdir(parents=True, exist_ok=True)
# Single combined CSV that main() writes (first chunk) and appends to.
OUT_CSV   = CLEAN_DIR / "parcels_jefferson_monthly.csv"

In [32]:
# ── JEFFERSON TAX-DIST CODES (your expanded list) ───────────────────────
# Three-character, zero-padded strings; load_and_clean() keeps only rows
# whose tax_dist_code is a member of this set.
JT_CODES = {"027","062","067","068","170","171","172","173","175","440"}

# ── CANONICAL COLUMN ORDER ──────────────────────────────────────────────
# Exact column list (and order) of every frame returned by load_and_clean()
# and therefore of the combined output CSV.
CANON = [
    "snapshot_month",                      # added by load_and_clean() from the file name
    "parcel_id", "tax_dist_code",
    "landuse", "proptyp", "pclass",       # new categorical exposures
    "dweltyp",                             # dwelling type
    "yearblt", "nostory", "grade",         # building age / height / quality
    "area_a",                              # square footage
    "market_value_land", "market_value_building",
    "aexmtot", "apprtot",                  # total market & appraised value
    "tifmlnd", "tifmbld",                  # TIF assessed values
]

# ── ALIAS MAP (messy → tidy) ────────────────────────────────────────────
# Keys are raw header names AFTER load_and_clean() has lower-cased them and
# removed all whitespace; values are the canonical names used in CANON.
# Headers not listed here are left unchanged.
ALIASES = {
    # parcel id
    # NOTE: the "parcel id" key below contains a space and so can never match
    # a normalised header (whitespace is removed first); "parcelid" covers it.
    "parid":"parcel_id","parcelid":"parcel_id",
    "parcelnumber":"parcel_id","parcel id":"parcel_id",
    # tax district
    "taxdistrict":"tax_dist_code","tax_district":"tax_dist_code",
    "taxdist":"tax_dist_code",
    # land use + prop type
    "land_use":"landuse","landusecode":"landuse",
    "prop_type":"proptyp","proptype":"proptyp",
    # pclass often already lowercase
    # square footage
    "totbldgsqft":"area_a","tot_bldg_sqft":"area_a","tot_sf":"area_a",
    # market & appraised values
    # (entries that map a name to itself are no-ops kept for readability)
    "aexmlnd":"market_value_land","aexmbld":"market_value_building",
    "marketvalue_land":"market_value_land",
    "marketvaluebuilding":"market_value_building",
    "aexmtot":"aexmtot","marketvalue_total":"aexmtot",
    "apprtot":"apprtot","appr_total":"apprtot",
    # tif values
    "tifmlnd":"tifmlnd","tifmbld":"tifmbld",
}

# Matches "parcel_MM_YYYY.csv" (case-insensitive); group 1 is the two-digit
# month, group 2 the four-digit year. The pattern is not anchored.
MONTH_RE = re.compile(r"parcel_(\d{2})_(\d{4})\.csv", re.I)

In [33]:
# ── HELPERS ──────────────────────────────────────────────────────────────
def snapshot_from_fname(fname: pathlib.Path) -> pd.Period:
    """Derive the monthly snapshot period encoded in a raw file name.

    Parameters
    ----------
    fname : pathlib.Path
        Path whose final component contains ``parcel_MM_YYYY.csv``
        (matched case-insensitively by ``MONTH_RE``).

    Returns
    -------
    pd.Period
        Monthly (``freq="M"``) period for the year and month in the name.

    Raises
    ------
    ValueError
        If the file name does not match ``MONTH_RE``. ``pd.Period`` itself
        may additionally reject an out-of-range month such as ``13``.
    """
    match = MONTH_RE.search(fname.name)
    if match is None:
        # A glob such as "parcel_*.csv" also admits names without a MM_YYYY
        # stamp; fail with a readable message instead of an AttributeError
        # raised on ``None.groups()``.
        raise ValueError(
            f"cannot parse snapshot month from file name {fname.name!r}; "
            "expected 'parcel_MM_YYYY.csv'"
        )
    mm, yyyy = map(int, match.groups())
    return pd.Period(f"{yyyy}-{mm:02d}", freq="M")

def load_and_clean(fp: pathlib.Path) -> pd.DataFrame:
    """Load one monthly parcel CSV and return its Jefferson rows in canonical form.

    Parameters
    ----------
    fp : pathlib.Path
        Raw monthly CSV. Its name must be parseable by ``snapshot_from_fname``.

    Returns
    -------
    pd.DataFrame
        Rows whose ``tax_dist_code`` is in ``JT_CODES``, with exactly the
        columns of ``CANON`` in that order. Columns absent from the file are
        filled with NaN. An EMPTY frame is returned when no parcel-id column
        can be identified or when no row is in a Jefferson district; in those
        cases the columns are not canonical, so callers should check
        ``.empty`` before using the result.

    Raises
    ------
    Exception
        Errors from ``pd.read_csv`` and ``snapshot_from_fname`` propagate.
    """
    df = pd.read_csv(fp, dtype=str, low_memory=False)

    # 1 ▸ normalise header: lower-case, remove all whitespace, apply aliases
    tidy = (df.columns.str.lower()
                      .str.strip()
                      .str.replace(r"\s+", "", regex=True))
    df.columns = [ALIASES.get(c, c) for c in tidy]
    if df.columns.has_duplicates:
        # Two raw headers can alias to the same tidy name. Keep the first so
        # each canonical name selects exactly one column; otherwise the
        # output could gain extra columns and misalign the appended CSV.
        df = df.loc[:, ~df.columns.duplicated()].copy()

    # 2 ▸ ensure parcel_id
    if "parcel_id" not in df.columns:
        guess = [c for c in df.columns if re.fullmatch(r"par.*(id|cel)", c)]
        if not guess:
            return pd.DataFrame()
        df = df.rename(columns={guess[0]: "parcel_id"})

    # 3 ▸ normalise tax_dist_code, or derive it from the parcel id if missing
    if "tax_dist_code" in df.columns:
        # JT_CODES holds zero-padded 3-character strings; pad file-supplied
        # codes the same way as derived ones so e.g. "27" matches "027".
        df["tax_dist_code"] = df["tax_dist_code"].str.strip().str.zfill(3)
    else:
        df["tax_dist_code"] = (
            df["parcel_id"].fillna("")
                          .str.extract(r"^(\d{1,3})")[0]
                          .str.zfill(3)
        )

    # 4 ▸ Jefferson slice
    df = df[df["tax_dist_code"].isin(JT_CODES)]
    if df.empty:
        return df

    # 5 ▸ guarantee all canonical cols exist, in canonical order.
    #     reindex returns a fresh frame, so the assignments below do not
    #     write into a filtered slice (no SettingWithCopyWarning).
    body_cols = [c for c in CANON if c != "snapshot_month"]
    df = df.reindex(columns=body_cols)

    # 6 ▸ numeric coercions (unparseable values become NaN)
    num = [
        "area_a","market_value_land","market_value_building",
        "aexmtot","apprtot","tifmlnd","tifmbld",
        "yearblt","nostory"
    ]
    df[num] = df[num].apply(pd.to_numeric, errors="coerce")

    df.insert(0, "snapshot_month", snapshot_from_fname(fp))
    return df

In [34]:
# ── MAIN STREAM LOOP ────────────────────────────────────────────────────
def main():
    """Clean every monthly snapshot in RAW_DIR and stream the rows into OUT_CSV.

    The first non-empty chunk overwrites OUT_CSV and writes the header; every
    later chunk is appended without a header. A file that raises is reported
    through ``tqdm.write`` and skipped, so one bad snapshot does not abort
    the run. Prints a summary line at the end and returns ``None``.
    """
    monthly_files = sorted(RAW_DIR.glob("parcel_*.csv"))
    if len(monthly_files) == 0:
        print("No monthly CSVs in", RAW_DIR)
        return

    wrote_any = False   # becomes True once OUT_CSV has received its header
    for path in tqdm(monthly_files, desc="Processing", unit="file"):
        try:
            rows = load_and_clean(path)
            if not rows.empty:
                rows.to_csv(
                    OUT_CSV,
                    mode="a" if wrote_any else "w",
                    header=not wrote_any,
                    index=False,
                )
                wrote_any = True
        except Exception as err:
            # Best effort: report the failing file and move on to the next.
            tqdm.write(f"⚠ {path.name}: {err}")

    if not wrote_any:
        print("No Jefferson rows found.")
        return

    size_mb = OUT_CSV.stat().st_size / 1024**2
    print(f"✓ Clean file → {OUT_CSV}  ({size_mb:,.1f} MB)")

# Entry point: runs the full cleaning pass and reports wall-clock duration.
# The same guard also lets the code be run as a plain script.
if __name__ == "__main__":
    t0 = datetime.now()                         # start timestamp
    main()
    print("Finished in", datetime.now() - t0)   # elapsed time as a timedelta

Processing:   0%|          | 0/84 [00:00<?, ?file/s]

✓ Clean file → C:\Repositories\jefferson-township-run-forecasting\data\clean\parcels_jefferson_monthly.csv  (42.6 MB)
Finished in 0:15:34.256764
