In [18]:
# ── 0. Imports ────────────────────────────────────────────────────────────
from pathlib import Path
import zipfile, pandas as pd
from functools import reduce

In [19]:
# ── 1. Locate the project root & raw‑data zip ─────────────────────────────
# Search the working directory first, then each ancestor in turn. The tuple is
# built before the loop starts rebinding ROOT, so it is safe to reuse the name.
# If no directory holds a .gitignore the loop runs to completion and ROOT is
# left at the last candidate, i.e. the filesystem root.
ROOT = Path.cwd()
for ROOT in (ROOT, *ROOT.parents):
    if (ROOT / ".gitignore").exists():
        break                                # found the repo root

# Raw inputs live under <repo>/data/raw; the parcel CSVs ship as a single zip.
RAW = ROOT.joinpath("data", "raw")
ZIP = RAW.joinpath("parcel-data.zip")

# Fail early with the full path so a wrong working directory is easy to spot.
assert ZIP.exists(), f"❌  Can't find {ZIP}"

In [20]:
# ── 2. Helper: load one CSV straight from the zip ─────────────────────────
def load_parcel(year: int, *, zip_path=None, **read_csv_kwargs) -> pd.DataFrame:
    """Read ``ParcelYYYY.csv`` from *any* depth inside the parcel-data zip.

    Parameters
    ----------
    year
        Year used to build the member file name ``parcel{year}.csv``.
    zip_path
        Optional path to the archive to read. Defaults to the module-level
        ``ZIP`` when omitted.
    **read_csv_kwargs
        Forwarded to ``pandas.read_csv`` (e.g. ``nrows=0`` to read only the
        header). ``low_memory`` defaults to ``False`` but may be overridden.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the first matching member.

    Raises
    ------
    FileNotFoundError
        If no member's base name equals ``parcel{year}.csv``
        (case-insensitive).
    """
    archive = ZIP if zip_path is None else Path(zip_path)
    year_file = f"parcel{year}.csv".lower()

    # Keep low_memory=False as the default without colliding with a caller
    # that passes low_memory explicitly.
    read_csv_kwargs.setdefault("low_memory", False)

    with zipfile.ZipFile(archive) as z:
        # Match on the member's base name, not a bare suffix: a suffix test
        # would also accept entries such as "__MACOSX/.../._Parcel2014.csv"
        # or "OldParcel2014.csv". Zip member names always use "/" separators.
        target = next(
            (
                name
                for name in z.namelist()
                if name.rsplit("/", 1)[-1].lower() == year_file
            ),
            None,
        )
        if target is None:
            raise FileNotFoundError(f"{year_file} not found in {archive.name}")

        with z.open(target) as f:
            return pd.read_csv(f, **read_csv_kwargs)

In [21]:
# ── 3. Quick look: what files are inside? ─────────────────────────────────
with zipfile.ZipFile(ZIP) as archive:
    print("📦 files in parcel‑data.zip:")
    # One bullet per archive member (directories included), in archive order.
    for member in archive.infolist():
        print(f"  • {member.filename}")

📦 files in parcel‑data.zip:
  • parcel-data/
  • parcel-data/Parcel2014.csv
  • parcel-data/Parcel2015.csv
  • parcel-data/Parcel2016.csv
  • parcel-data/Parcel2017.csv
  • parcel-data/Parcel2018.csv
  • parcel-data/Parcel2019.csv
  • parcel-data/Parcel2020.csv
  • parcel-data/Parcel2021.csv
  • parcel-data/Parcel2022.csv
  • parcel-data/Parcel2023.csv
  • parcel-data/Parcel2024.csv


In [22]:
# ── 4. Inspect one year to eyeball columns & dtypes ───────────────────────
sample = load_parcel(2014)
display(sample.head())
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() only appends a stray "None" to the cell output.
sample.info()

Unnamed: 0,PARCEL ID,AEXMLND,AEXMBLD,AEXMTOT,APPRLND,APPRBLD,APPRTOT,AUDMAP,AUDRTG,LANDUSE,...,NOSTORY,YEARBLT,PROPTYP,WALL,TIFMLND,TIFMBLD,POINT_X,POINT_Y,HOMSTD,BANKCODE
0,000-000000,0.0,0.0,0.0,0.0,0.0,0.0,,,0,...,,0,,,,,,,,
1,010-000000,0.0,0.0,0.0,0.0,0.0,0.0,,,0,...,,0,,,,,,,,
2,010-000001,21100.0,19300.0,40400.0,11300.0,73300.0,84600.0,H001,71.8,520,...,20.0,-1,2.0,2.0,,,,,,
3,010-000002,0.0,0.0,0.0,50900.0,86800.0,137700.0,R044,94.0,510,...,15.0,1910,2.0,1.0,,,,,,
4,010-000003,0.0,0.0,0.0,37800.0,462200.0,500000.0,K037,191.0,330,...,,80,3.0,,,,,,,


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 368052 entries, 0 to 368051
Data columns (total 67 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   PARCEL ID   368052 non-null  object 
 1   AEXMLND     368052 non-null  float64
 2   AEXMBLD     368052 non-null  float64
 3   AEXMTOT     368052 non-null  float64
 4   APPRLND     368052 non-null  float64
 5   APPRBLD     368052 non-null  float64
 6   APPRTOT     368052 non-null  float64
 7   AUDMAP      368052 non-null  object 
 8   AUDRTG      368052 non-null  object 
 9   LANDUSE     368052 non-null  int64  
 10  CAUV        368052 non-null  int64  
 11  SCHOOL      368052 non-null  int64  
 12  MAILAD1     368052 non-null  object 
 13  MAILAD2     368052 non-null  object 
 14  MAILAD3     368052 non-null  object 
 15  MAILAD4     368052 non-null  object 
 16  TRANDT      368052 non-null  object 
 17  TRANYR      0 non-null       float64
 18  NAME1       368052 non-null  object 
 19  NA

None

In [23]:
# ── 5. Compare schemas across all years ───────────────────────────────────
years = range(2014, 2025)                    # Parcel2014.csv … Parcel2024.csv
# nrows=0 asks read_csv for the header only, so just the column names are
# collected for each year rather than the full table.
cols_by_year = {yr: set(load_parcel(yr, nrows=0).columns) for yr in years}
common_cols  = set.intersection(*cols_by_year.values())
# Columns a given year has beyond the shared core. These are NOT necessarily
# exclusive to that year: a column present in several (but not all) files
# shows up under each of those years.
extras       = {yr: cols - common_cols for yr, cols in cols_by_year.items()}

print(f"\n🟢 Columns present in *every* file ({len(common_cols)}):")
print(sorted(common_cols))

for yr, diff in extras.items():
    if diff:
        print(f"\n🔶 Columns in {yr} that are not in every file:")
        print(sorted(diff))


🟢 Columns present in *every* file (67):
['ACREA', 'AEXMBLD', 'AEXMLND', 'AEXMTOT', 'AIRCOND', 'ANN_TAX', 'APPRBLD', 'APPRLND', 'APPRTOT', 'AREA_A', 'AUDMAP', 'AUDRTG', 'BANKCODE', 'BATHS', 'BEDRMS', 'CAUV', 'CINBRHD', 'COND', 'DESCR1', 'DESCR2', 'DESCR3', 'DWELTYP', 'FIREPLC', 'FLOOD', 'GRADE', 'HBATHS', 'HEIGHT', 'HOMSTD', 'LANDUSE', 'MAILAD1', 'MAILAD2', 'MAILAD3', 'MAILAD4', 'NAME1', 'NAME2', 'NAME3', 'NBRHD', 'NOCARDS', 'NOSTORY', 'OWNER_ADD1', 'OWNER_ADD2', 'PARCEL ID', 'PCLASS', 'POINT_X', 'POINT_Y', 'PRICE', 'PROPTYP', 'ROOMS', 'SCHOOL', 'STADDR', 'STATE', 'STCONT', 'STDIRE', 'STHNUM', 'STHSFX', 'STNAME', 'STSFX', 'TAXDESI', 'TIFMBLD', 'TIFMLND', 'TRANDT', 'TRANYR', 'USPS_CITY', 'VALID', 'WALL', 'YEARBLT', 'ZIPCODE']
