In [11]:
# ── 0. Imports ────────────────────────────────────────────────────────────
import pandas as pd, zipfile, sys
from pathlib import Path

In [12]:
# ── 1. Project paths ──────────────────────────────────────────────────────
# Start from the current working directory and climb towards the filesystem
# root, stopping at the first directory that contains a .gitignore (taken
# to be the repo root). If no .gitignore is found anywhere on the way up,
# ROOT ends at the filesystem root — in that case replace this loop with a
# manual assignment of ROOT.
ROOT = Path.cwd()
for ROOT in (ROOT, *ROOT.parents):
    if (ROOT / ".gitignore").exists():
        break

RAW_DIR = ROOT / "data" / "raw"
ZIP_PATH = RAW_DIR / "parcel-data.zip"     # zipped yearly CSVs
CSV_DIR = RAW_DIR / "parcel-data"          # folder of already-extracted CSVs
EXTRACTED = CSV_DIR.exists()               # True when that folder is present

# Snapshot years to load: 2014 through 2024 inclusive.
YEARS = range(2014, 2024 + 1)

# Columns read from every yearly CSV (raw header spelling).
KEEP_COLS = [
    "PARCEL ID",
    "AUDMAP", "AUDRTG", "TAXDESI", "SCHOOL", "USPS_CITY",
    "LANDUSE", "PROPTYP", "PCLASS",
    "AREA_A", "ACREA",
    "ROOMS", "BEDRMS", "BATHS",
    "YEARBLT", "DWELTYP", "COND", "GRADE",
    "APPRLND", "APPRBLD", "APPRTOT",
    "POINT_X", "POINT_Y",
]

def clean_names(cols: pd.Index) -> pd.Index:
    """Normalise column labels into lowercase, underscore-separated names.

    Each label is stripped of surrounding whitespace, lowercased, has every
    run of non-word characters replaced by a single underscore, and finally
    has leading/trailing underscores removed.

    Parameters
    ----------
    cols : pd.Index
        String labels to normalise (e.g. ``df.columns``).

    Returns
    -------
    pd.Index
        The normalised labels, in the same order.
    """
    trimmed = cols.str.strip()
    lowered = trimmed.str.lower()
    underscored = lowered.str.replace(r"[^\w]+", "_", regex=True)
    return underscored.str.strip("_")

def load_year(year: int) -> pd.DataFrame:
    """Load one year's parcel CSV, restricted to ``KEEP_COLS``.

    The file ``Parcel<year>.csv`` is read from ``CSV_DIR`` when the
    module-level ``EXTRACTED`` flag is true, otherwise from inside the
    archive at ``ZIP_PATH``. Every column is read as ``str``; a
    ``source_year`` column holding ``year`` is appended.

    Parameters
    ----------
    year : int
        Year used to build the file name ``Parcel<year>.csv``.

    Returns
    -------
    pd.DataFrame
        The ``KEEP_COLS`` columns of that file plus ``source_year``.

    Raises
    ------
    FileNotFoundError
        If the extracted CSV is missing, the zip archive is missing, or the
        archive has no member whose name ends with ``Parcel<year>.csv``.
    """
    fname = f"Parcel{year}.csv"
    if EXTRACTED:
        path = CSV_DIR / fname
        if not path.exists():
            raise FileNotFoundError(f"{path} missing—check EXTRACTED flag.")
        df = pd.read_csv(path, usecols=KEEP_COLS, dtype=str, low_memory=False)
    else:
        if not ZIP_PATH.exists():
            raise FileNotFoundError(f"{ZIP_PATH} not found.")
        # Open the archive once and close it deterministically; the same
        # handle serves both the member lookup and the read.
        with zipfile.ZipFile(ZIP_PATH) as archive:
            # Case-insensitive suffix match so members stored under a
            # sub-folder (e.g. "parcel-data/Parcel2020.csv") are found too.
            wanted = fname.lower()
            target = next(
                (n for n in archive.namelist() if n.lower().endswith(wanted)),
                None,
            )
            if target is None:
                raise FileNotFoundError(f"{fname} not in {ZIP_PATH.name}")
            with archive.open(target) as f:
                df = pd.read_csv(f, usecols=KEEP_COLS, dtype=str,
                                 low_memory=False)
    df["source_year"] = year
    return df

In [13]:
# ── 2. Load & concat ──────────────────────────────────────────────────────
# Read one frame per year in YEARS, echoing progress on a single line.
frames = []
for yr in YEARS:
    print(f"Loading {yr}…", end=" ")
    frames.append(load_year(yr))
print("done.")

# Stack the yearly frames into one long table with a fresh 0..n-1 index,
# then normalise the column labels.
parcels = pd.concat(frames, ignore_index=True)
parcels.columns = clean_names(parcels.columns)
print(f"{len(parcels):,} rows × {parcels.shape[1]} cols loaded.")

# ── 3. Basic type fixes ───────────────────────────────────────────────────
# Every column was read as text; convert the ones listed below.
num_cols = [
    "area_a", "acrea",
    "apprlnd", "apprbld", "apprtot",
    "point_x", "point_y",
    "yearblt", "rooms", "bedrms", "baths",
]
cat_cols = ["landuse", "pclass", "proptyp", "dweltyp", "cond", "grade"]

for col in num_cols:
    # errors="coerce": values that cannot be parsed become NaN.
    parcels[col] = parcels[col].pipe(pd.to_numeric, errors="coerce")

for col in cat_cols:
    parcels[col] = parcels[col].astype("category")

# Drop stray whitespace around the parcel identifier.
parcels["parcel_id"] = parcels["parcel_id"].str.strip()

Loading 2014… Loading 2015… Loading 2016… Loading 2017… Loading 2018… Loading 2019… Loading 2020… Loading 2021… Loading 2022… Loading 2023… Loading 2024… done.
4,645,689 rows × 24 cols loaded.


In [14]:
# ── 4. Filter to Jefferson‑area tax districts ─────────────────────────────
# District codes to keep, held as strings so the leading zeros survive.
JT_CODES = {
    "027", "062", "067", "068", "170",
    "171", "172", "173", "175", "440",
}

# Helper column: the text before the first "-" in parcel_id, left-padded
# with zeros to at least three characters.
parcels["taxdist_code"] = (
    parcels["parcel_id"]
    .str.split("-", n=1)
    .str.get(0)
    .str.zfill(3)
)

# Keep only rows whose code is in JT_CODES; .copy() detaches the subset
# from `parcels`.
jt_parcels = parcels.loc[parcels["taxdist_code"].isin(JT_CODES)].copy()

print(
    f"{len(jt_parcels):,} Jefferson‑area rows "
    f"({len(jt_parcels)/len(parcels):.2%} of all rows)"
)

66,556 Jefferson‑area rows (1.43% of all rows)


In [15]:
# ── 5. Save Jefferson‑only CSVs (uncompressed) ────────────────────────────
# Output folder under the repo root; created if it does not exist yet.
OUT_DIR = ROOT / "data" / "clean"
OUT_DIR.mkdir(parents=True, exist_ok=True)

latest_csv = OUT_DIR / "parcels_jefferson_latest.csv"
panel_csv = OUT_DIR / "parcels_jefferson_panel.csv"

# Panel file: every Jefferson-area row from every loaded year.
jt_parcels.to_csv(panel_csv, index=False)
print("✓ panel  →", panel_csv)

# Latest file: one row per parcel_id — after sorting by source_year, the
# last occurrence of each id is the one kept.
jt_latest = (
    jt_parcels
    .sort_values(by="source_year")
    .drop_duplicates(subset="parcel_id", keep="last")
)
jt_latest.to_csv(latest_csv, index=False)
print("✓ latest →", latest_csv)

✓ panel  → C:\Users\JosephWhite\Documents\GitHub\jefferson-township-run-forecasting\data\clean\parcels_jefferson_panel.csv
✓ latest → C:\Users\JosephWhite\Documents\GitHub\jefferson-township-run-forecasting\data\clean\parcels_jefferson_latest.csv
