In [11]:
from pathlib import Path
import zipfile, pandas as pd, numpy as np

# ── 1. Project paths ──────────────────────────────────────────────────────
# The project root is the nearest directory — starting at the current working
# directory and walking upward — that contains a ".gitignore" file.
_CWD = Path.cwd()
ROOT = next(
    (d for d in (_CWD, *_CWD.parents) if (d / ".gitignore").exists()),
    None,
)
if ROOT is None:
    # No marker anywhere up the tree. Previously the search silently ended at
    # the filesystem root, which made every path below resolve to "/data/...".
    # Fall back to the working directory and say so.
    ROOT = _CWD
    print(f"WARNING: no .gitignore found in {_CWD} or its parents; "
          f"using the working directory as project root")

RAW_DIR   = ROOT / "data" / "raw"
ZIP_PATH  = RAW_DIR / "parcel-data.zip"
CSV_DIR   = RAW_DIR / "parcel-data"           # if you have extracted the CSVs
EXTRACTED = CSV_DIR.exists()                  # auto‑detect

In [12]:
# ── 2. Config ─────────────────────────────────────────────────────────────
# Years to load; both bounds are inclusive (only keep 2018–2024).
FIRST_YEAR, LAST_YEAR = 2018, 2024
YEARS = range(FIRST_YEAR, LAST_YEAR + 1)

# Codes matched against the zero-padded, 3-character prefix of the parcel ID
# when filtering to Jefferson Township.
JT_CODES = {
    "027",
    "062", "067", "068",
    "170", "171", "172", "173", "175",
    "440",
}

# Columns read from every yearly CSV, spelled as in the raw header.
# STADDR (situs address) is included so we can normalize it.
KEEP_COLS = [
    "PARCEL ID",
    "STADDR",
    "USPS_CITY",
    "TAXDESI",
    "SCHOOL",
    "LANDUSE",
    "PROPTYP",
    "PCLASS",
    "AREA_A",
    "NOSTORY",
    "YEARBLT",
    "ROOMS",
    "BEDRMS",
    "BATHS",
    "APPRLND",
    "APPRBLD",
    "APPRTOT",
    "POINT_X",
    "POINT_Y",
]

In [13]:
# ── 3. Helper functions ───────────────────────────────────────────────────
def clean_names(cols: pd.Index) -> pd.Index:
    """Return snake_case versions of the given column labels.

    Each label is trimmed of surrounding whitespace, lower-cased, has every
    run of non-word characters collapsed into a single underscore, and is
    finally stripped of leading/trailing underscores.
    """
    trimmed = cols.str.strip()
    lowered = trimmed.str.lower()
    underscored = lowered.str.replace(r"[^\w]+", "_", regex=True)
    return underscored.str.strip("_")

def normalize_addr(s: pd.Series) -> pd.Series:
    """Normalize street-address strings for matching.

    Steps, applied in order to every element:
      1. upper-case; missing values become "";
      2. remove stand-alone directional words (NORTH/SOUTH/EAST/WEST/N/S/E/W)
         wherever they occur in the string;
      3. drop a trailing ZIP code (5 digits, optionally followed by "-NNNN"),
         ignoring any whitespace padding after it;
      4. replace ",", "." and "#" with a space;
      5. expand common street-type abbreviations (ST -> STREET, RD -> ROAD, ...);
      6. collapse runs of whitespace and trim both ends.

    Parameters
    ----------
    s : pd.Series
        Series of address strings (may contain missing values).

    Returns
    -------
    pd.Series
        Normalized addresses, same index as ``s``.
    """
    s = s.str.upper().fillna("")
    s = s.str.replace(r"\b(?:NORTH|SOUTH|EAST|WEST|N|S|E|W)\b", "", regex=True)
    # The ZIP pattern is anchored at the end of the string, so it must allow
    # trailing padding; otherwise "… 45432 " keeps its ZIP. ZIP+4 is handled too.
    s = s.str.replace(r"\s+\d{5}(?:-\d{4})?\s*$", "", regex=True)
    s = s.str.replace(r"[,\.\#]", " ", regex=True)
    suffix_map = {
        r"\bST\b": "STREET", r"\bRD\b": "ROAD",
        r"\bAVE\b": "AVENUE", r"\bBLVD\b": "BOULEVARD",
        r"\bDR\b": "DRIVE", r"\bLN\b": "LANE",
        r"\bCT\b": "COURT", r"\bPKWY\b": "PARKWAY",
        r"\bCTR\b": "CENTER", r"\bTER\b": "TERRACE"
    }
    for abbr, full in suffix_map.items():
        s = s.str.replace(abbr, full, regex=True)
    return s.str.replace(r"\s+", " ", regex=True).str.strip()

def load_year(year: int) -> pd.DataFrame:
    """Load one year's parcel CSV, keeping only ``KEEP_COLS`` as strings.

    The file ``Parcel{year}.csv`` is read from ``CSV_DIR`` when ``EXTRACTED``
    is true; otherwise it is read directly out of the archive at ``ZIP_PATH``
    (first member whose name ends with the file name, case-insensitively).

    Parameters
    ----------
    year : int
        Year whose file should be loaded.

    Returns
    -------
    pd.DataFrame
        The selected columns, all read with ``dtype=str``, plus a
        ``source_year`` column set to ``year``.

    Raises
    ------
    FileNotFoundError
        If the CSV is missing on disk or no matching member exists in the
        archive.
    """
    fname = f"Parcel{year}.csv"
    read_opts = dict(usecols=KEEP_COLS, dtype=str, low_memory=False)
    if EXTRACTED:
        df = pd.read_csv(CSV_DIR / fname, **read_opts)
    else:
        with zipfile.ZipFile(ZIP_PATH) as z:
            wanted = fname.lower()
            target = next(
                (n for n in z.namelist() if n.lower().endswith(wanted)),
                None,
            )
            if target is None:
                # A bare StopIteration here would give no hint about which
                # file or archive was involved.
                raise FileNotFoundError(f"{fname} not found in archive {ZIP_PATH}")
            with z.open(target) as f:
                df = pd.read_csv(f, **read_opts)
    df["source_year"] = year
    return df

In [14]:
# ── 4. Load & concatenate all years ───────────────────────────────────────
# One progress line per year: the label is printed first and "done" is
# appended to the same line once that year's file has been read.
frames = []
for y in YEARS:
    print(f"Loading parcels {y}…", end=" ")
    frames += [load_year(y)]
    print("done")

# Stack the yearly frames, then switch the headers to snake_case.
parcels = pd.concat(frames, ignore_index=True)
parcels.columns = clean_names(parcels.columns)

total_rows = len(parcels)
print(f"{total_rows:,} total rows loaded")

Loading parcels 2018… done
Loading parcels 2019… done
Loading parcels 2020… done
Loading parcels 2021… done
Loading parcels 2022… done
Loading parcels 2023… done
Loading parcels 2024… done
3,051,988 total rows loaded


In [15]:
# ── 5. Filter to Jefferson Township ───────────────────────────────────────
# Trim padding from the parcel ID, then derive the tax-district code: the text
# before the first "-", left-padded with zeros to three characters.
parcels["parcel_id"] = parcels["parcel_id"].str.strip()
district_prefix = parcels["parcel_id"].str.split(pat="-", n=1).str[0]
parcels["taxdist_code"] = district_prefix.str.zfill(3)

# Keep an independent copy of the rows whose code is one of JT_CODES.
in_jefferson = parcels["taxdist_code"].isin(JT_CODES)
jt = parcels.loc[in_jefferson].copy()

jefferson_share = len(jt) / len(parcels)
print(f"{len(jt):,} Jefferson rows ({jefferson_share:.1%} of all)")

45,257 Jefferson rows (1.5% of all)


In [16]:
# ── 6. Clean & cast types ─────────────────────────────────────────────────
# Everything was read as text; convert the measurement columns to numbers,
# turning unparseable entries into NaN.
num_cols = ["area_a","apprlnd","apprbld","apprtot",
            "point_x","point_y","yearblt","rooms","bedrms","baths","nostory"]
for c in num_cols:
    jt[c] = pd.to_numeric(jt[c], errors="coerce")

# Blank out implausible values (non-positive area, build year outside
# 1850–2025). Series.mask/where upcast the column when needed; assigning NaN
# through .loc into a column that to_numeric left as int64 (no missing values)
# would instead hit pandas' "incompatible dtype" deprecation.
jt["area_a"]  = jt["area_a"].mask(jt["area_a"] <= 0)
jt["yearblt"] = jt["yearblt"].where(jt["yearblt"].between(1850, 2025))

# Code-like columns become categoricals; landuse goes through nullable Int64
# first so its categories are integers.
jt["landuse"] = jt["landuse"].astype("Int64").astype("category")
for c in ["proptyp","pclass"]:
    jt[c] = jt[c].astype("category")

# standardize situs address
jt["staddr_norm"] = normalize_addr(jt["staddr"])
jt["usps_city"]   = jt["usps_city"].str.title().fillna("")

# volume proxy
jt["stories_x_sqft"] = jt["nostory"] * jt["area_a"]

In [17]:
# ── 7. Year‑over‑year Δsqft ────────────────────────────────────────────────
# Order rows by parcel and year so that, within each parcel, the preceding row
# is that parcel's previous available record.
jt = jt.sort_values(["parcel_id", "source_year"])

# Area on the parcel's previous row (NaN for its first row) and the change
# in area relative to it.
jt["sqft_prev"] = jt.groupby("parcel_id")["area_a"].shift()
jt["delta_sqft"] = jt["area_a"] - jt["sqft_prev"]

In [20]:
# ── 8. Save panel & latest snapshot ──────────────────────────────────────
# Output folder is created on demand.
OUT = ROOT / "data" / "clean"
OUT.mkdir(parents=True, exist_ok=True)

panel_csv  = OUT / "parcels_jefferson_panel.csv"
latest_csv = OUT / "parcels_jefferson_latest.csv"

# Full panel: every Jefferson row for every loaded year.
jt.to_csv(panel_csv, index=False)
print("✓ panel saved to", panel_csv)

# Latest snapshot: after ordering by year, keep only the last row seen for
# each parcel_id.
rows_by_year = jt.sort_values("source_year")
jt_latest = rows_by_year.drop_duplicates("parcel_id", keep="last")
jt_latest.to_csv(latest_csv, index=False)
print("✓ latest snapshot saved to", latest_csv)

✓ panel saved to C:\Users\JosephWhite\Documents\GitHub\jefferson-township-run-forecasting\data\clean\parcels_jefferson_panel.csv
✓ latest snapshot saved to C:\Users\JosephWhite\Documents\GitHub\jefferson-township-run-forecasting\data\clean\parcels_jefferson_latest.csv
