In [6]:
import pandas as pd
import zipfile
from pathlib import Path

# ── Config ───────────────────────────────────────────────────────────────
# Tax-district prefixes (the part of `parcel_id` before the first "-") that
# the processing cell keeps when filtering to Jefferson Township.
JEFFERSON_CODES = {
    "027", "062", "067", "068",
    "170", "171", "172", "173", "175",
    "440",
}

# Input: zip archive holding the yearly parcel CSV files.
ZIP_PATH = Path(r"C:\Users\JosephWhite\Documents\GitHub\jefferson-township-run-forecasting\data\raw\parcel-data.zip")

# Output: combined, cleaned Jefferson Township panel written as CSV.
OUT_PATH = Path(r"C:\Users\JosephWhite\Documents\GitHub\jefferson-township-run-forecasting\data\clean\parcels_jefferson_cleaned.csv")

# ── Helper: Clean column names ───────────────────────────────────────────
def clean_column_names(df):
    """Return a new DataFrame whose column labels are normalised.

    Every label is converted to ``str``, stripped of leading/trailing
    whitespace, lower-cased, and has each remaining space replaced by an
    underscore. The input frame is not modified.
    """
    def _normalise(label):
        text = str(label)
        text = text.strip()
        text = text.lower()
        return text.replace(" ", "_")

    return df.rename(columns=_normalise)

# ── Helper: Convert numeric fields safely ────────────────────────────────
def to_float(series):
    """Parse a Series of number-like text into numeric values.

    Values are cast to ``str``, every comma is removed, surrounding
    whitespace is stripped, and the result is handed to ``pd.to_numeric``
    with ``errors="coerce"`` so anything unparseable becomes NaN instead of
    raising.
    """
    as_text = series.astype(str)
    without_commas = as_text.str.replace(",", "")
    trimmed = without_commas.str.strip()
    return pd.to_numeric(trimmed, errors="coerce")

In [7]:
# ── Process all yearly parcel files ──────────────────────────────────────
# Columns converted with `to_float` whenever a yearly file contains them.
NUMERIC_COLS = ["aexmlnd", "aexmbld", "aexmtot", "apprlnd", "apprbld", "apprtot", "acreage", "price", "yearblt"]
# Street-address pieces, in the order they are joined into `full_address`.
ADDRESS_COLS = ["sthnum", "stdire", "stname", "stsfx"]


def _load_jefferson_year(archive, member):
    """Read one yearly parcel CSV from an open zip and return its Jefferson rows.

    Parameters
    ----------
    archive : zipfile.ZipFile
        Open archive that contains ``member``.
    member : str
        Archive path of a CSV whose stem is ``Parcel`` followed by the year.

    Returns
    -------
    pandas.DataFrame
        Rows whose tax-district code is in ``JEFFERSON_CODES``, with cleaned
        column names plus ``year``, ``taxdist_code`` and ``full_address``.

    Raises
    ------
    ValueError
        If the text after ``Parcel`` in the file stem is not an integer.
    KeyError
        If the file has no ``parcel_id`` column after column-name cleaning.
    """
    year = int(Path(member).stem.replace("Parcel", ""))
    print(f"📄 Reading {member} for year {year}")

    # Only the read needs the member open; release the handle before the
    # in-memory transformations start.
    with archive.open(member) as handle:
        parcels = pd.read_csv(handle, dtype=str)

    parcels = clean_column_names(parcels)
    if "parcel_id" not in parcels.columns:
        raise KeyError(f"{member} has no 'parcel_id' column; cannot derive the tax district code")
    parcels["year"] = year

    # Extract tax district code (e.g., 010-123456 → "010"); zfill restores
    # leading zeros so the codes compare equal to JEFFERSON_CODES entries.
    parcels["taxdist_code"] = parcels["parcel_id"].str.split("-", n=1).str[0].str.zfill(3)

    # Filter to Jefferson Township; copy so later column assignments do not
    # write into a slice of the full-county frame.
    parcels = parcels[parcels["taxdist_code"].isin(JEFFERSON_CODES)].copy()

    # Convert numeric fields (each one only if this year's file has it).
    for col in NUMERIC_COLS:
        if col in parcels.columns:
            parcels[col] = to_float(parcels[col])

    # Construct the full address. Address pieces are optional in the same way
    # the numeric columns are: a file lacking one contributes a blank piece
    # instead of aborting the whole multi-year run with a KeyError.
    missing = [col for col in ADDRESS_COLS if col not in parcels.columns]
    if missing:
        print(f"⚠️ {member} is missing address columns {missing}; treating them as blank")
    blank = pd.Series("", index=parcels.index, dtype=object)
    pieces = [
        parcels[col].fillna("").str.strip() if col in parcels.columns else blank
        for col in ADDRESS_COLS
    ]
    address = pieces[0]
    for piece in pieces[1:]:
        address = address + " " + piece
    # Collapse the runs of spaces left behind by empty pieces.
    parcels["full_address"] = address.str.replace(r"\s+", " ", regex=True).str.strip()

    return parcels


dfs = []

with zipfile.ZipFile(ZIP_PATH, "r") as archive:
    parcel_files = sorted(
        name for name in archive.namelist()
        if "parcel-data/Parcel" in name and name.endswith(".csv")
    )

    for file in parcel_files:
        df = _load_jefferson_year(archive, file)
        dfs.append(df)

📄 Reading parcel-data/Parcel2018.csv for year 2018
📄 Reading parcel-data/Parcel2019.csv for year 2019
📄 Reading parcel-data/Parcel2020.csv for year 2020
📄 Reading parcel-data/Parcel2021.csv for year 2021
📄 Reading parcel-data/Parcel2022.csv for year 2022
📄 Reading parcel-data/Parcel2023.csv for year 2023
📄 Reading parcel-data/Parcel2024.csv for year 2024
📄 Reading parcel-data/Parcel2025.csv for year 2025


In [8]:
# ── Combine yearly Jefferson records ─────────────────────────────────────
# pd.concat on an empty list raises a generic "No objects to concatenate";
# raise the same exception type with a message that points at the input.
if not dfs:
    raise ValueError(f"No yearly parcel files were loaded from {ZIP_PATH}; nothing to combine")
combined = pd.concat(dfs, ignore_index=True)

# ── Save to CSV ──────────────────────────────────────────────────────────
# to_csv does not create missing folders, so make sure the output directory
# exists (a no-op when it is already there).
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
combined.to_csv(OUT_PATH, index=False)
print(f"✅ Saved cleaned Jefferson Township panel → {OUT_PATH}")

✅ Saved cleaned Jefferson Township panel → C:\Users\JosephWhite\Documents\GitHub\jefferson-township-run-forecasting\data\clean\parcels_jefferson_cleaned.csv
