In [1]:
# 1) Imports & Paths
import pandas as pd
from pathlib import Path

# Project root is taken to be the parent of the current working directory
# (NOTE(review): this presumes the kernel is started from a sub-folder one
# level below the root, e.g. notebooks/ -- confirm).
ROOT = Path(".").resolve().parents[0]
# Input and output folders both live under <ROOT>/data.
RAW_DIR = ROOT.joinpath("data", "raw")
CLEAN_DIR = ROOT.joinpath("data", "clean")

In [2]:
# 2) Load raw parcel data
# dtype=str reads every column as text, so no column type is inferred at load
# time; the numeric and date conversions are done explicitly further down.
raw_path = RAW_DIR.joinpath("parcels_jefferson_monthly.csv")
df = pd.read_csv(filepath_or_buffer=raw_path, dtype=str)

In [3]:
# 3) Rename columns to snake_case
# Mapping of raw header -> cleaned column name. Headers that already have
# their final name map to themselves, which rename() treats as a no-op.
COLUMN_RENAMES = dict(
    snapshot_month="snapshot_month",
    parcel_id="parcel_id",
    tax_dist_code="tax_dist_code",
    landuse="use_code",
    proptyp="property_type",
    pclass="property_class",
    dweltyp="dwelling_type",
    yearblt="year_built",
    nostory="num_stories",
    grade="grade",
    area_a="land_sqft",
    market_value_land="market_value_land",
    market_value_building="market_value_building",
    aexmtot="exemption_total",
    apprtot="appraised_total",
    tifmlnd="tif_value_land",
    tifmbld="tif_value_building",
)
df = df.rename(columns=COLUMN_RENAMES)

In [4]:
# 4) Parse snapshot_month ("YYYY-MM" text) into datetime (first of month)
# Parsing with "%Y-%m" leaves the day at its default of 1, which is the same
# result as appending "-01" and parsing "%Y-%m-%d". Unlike the string
# concatenation, it does not fail when this cell is re-run on a column that
# has already been converted to datetime.
df["snapshot_month"] = pd.to_datetime(df["snapshot_month"], format="%Y-%m")

In [5]:
# 5) Standardize tax district codes as zero-padded strings
# The float -> int hop lets both "27" and "27.0" end up as "027".
# Casting NaN to int raises, so only the populated rows are converted; rows
# with a missing code are restored by the reindex and stay missing instead of
# aborting the whole cell. Text that is present but not numeric still raises.
raw_codes = df["tax_dist_code"]
padded_codes = (
    raw_codes[raw_codes.notna()]
      .astype(float)
      .astype(int)
      .astype(str)
      .str.zfill(3)
)
df["tax_dist_code"] = padded_codes.reindex(raw_codes.index)

In [6]:
# 6) Cast numeric columns
# errors="coerce" turns any value that cannot be parsed as a number into NaN
# rather than raising.
num_cols = [
    "use_code",
    "dwelling_type",
    "year_built",
    "num_stories",
    "land_sqft",
    "market_value_land",
    "market_value_building",
    "exemption_total",
    "appraised_total",
    "tif_value_land",
    "tif_value_building",
]
df = df.assign(
    **{name: pd.to_numeric(df[name], errors="coerce") for name in num_cols}
)

In [7]:
# 7) Keep one row per (parcel_id, snapshot_month)
# Rows are compared on these two key columns only; the first occurrence is
# kept and later rows with the same key are dropped even if other columns differ.
dedupe_keys = ["parcel_id", "snapshot_month"]
df = df.drop_duplicates(subset=dedupe_keys, keep="first")

In [8]:
# 8) (Optional) Fill missing land_sqft with 0
# Only the land_sqft column is filled; missing values elsewhere are left as-is.
df = df.fillna({"land_sqft": 0})

In [9]:
# 9) Save cleaned dataset
output_path = CLEAN_DIR / "parcels_jefferson_monthly_clean.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)  # index=False: the row index is not written
print(f"Cleaned parcel rows: {len(df)}")
df.head()

Cleaned parcel rows: 549234


Unnamed: 0,snapshot_month,parcel_id,tax_dist_code,use_code,property_type,property_class,dwelling_type,year_built,num_stories,grade,land_sqft,market_value_land,market_value_building,exemption_total,appraised_total,tif_value_land,tif_value_building
0,2019-01-01,027-000004,27,640.0,,E,,,,,0.0,32000.0,0.0,32000.0,0.0,0.0,0.0
1,2019-01-01,027-000006,27,471.0,,C,1.0,1965.0,10.0,C+1,2986.0,0.0,0.0,0.0,258000.0,0.0,0.0
2,2019-01-01,027-000012,27,511.0,,R,1.0,1997.0,10.0,B,2462.0,0.0,0.0,0.0,330200.0,0.0,0.0
3,2019-01-01,027-000018,27,550.0,,R,1.0,2005.0,20.0,C,1236.0,0.0,0.0,0.0,111400.0,0.0,0.0
4,2019-01-01,027-000019,27,550.0,,R,1.0,2005.0,20.0,C,1236.0,0.0,0.0,0.0,107400.0,0.0,0.0
