In [1]:
# Cell 1: config and imports

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from datetime import datetime

DATA_PATH = "Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv"
TARGET_STATE = "GA"


In [2]:
# Cell 2: helper functions

def get_zhvi_columns(df):
    return df.columns[df.columns.str.match(r'^\d{4}-\d{2}-\d{2}$')]

def compute_yoy(series, year):
    """Compute YoY growth from Dec of previous year to Dec of current year."""
    try:
        prev = series.loc[f"{year-1}-12-31"]
        curr = series.loc[f"{year}-12-31"]
        if pd.notna(prev) and prev != 0:
            return 100 * (curr - prev) / prev
    except:
        return np.nan

def cagr(series):
    if len(series) < 2 or series.iloc[0] <= 0:
        return np.nan
    years = len(series) / 12
    return ((series.iloc[-1] / series.iloc[0]) ** (1 / years) - 1) * 100

def avg_monthly_growth(series):
    monthly_pct = series.pct_change()
    return 100 * monthly_pct.mean()

def volatility(series):
    monthly_pct = series.pct_change()
    return 100 * monthly_pct.std()

def longest_nan_streak(series):
    is_null = series.isnull()
    max_streak = current = 0
    for val in is_null:
        if val:
            current += 1
            max_streak = max(max_streak, current)
        else:
            current = 0
    return max_streak


In [3]:
# Cell 3: load data and clean metadata

df = pd.read_csv(DATA_PATH)
df = df[df["State"] == TARGET_STATE].copy()
zhvi_cols = get_zhvi_columns(df)
print(f"Loaded {len(df)} ZIPs from {TARGET_STATE}")

# Fill missing Metro using City → Metro
ambiguous_cities = {"Boston", "Tifton"}
safe_city_to_metro = (
    df[df["City"].notna() & df["Metro"].notna() & ~df["City"].isin(ambiguous_cities)]
    .groupby("City")["Metro"].agg(lambda x: x.mode().iloc[0]).to_dict()
)
df["Metro"] = df.apply(
    lambda row: safe_city_to_metro.get(row["City"], row["Metro"]) if pd.isna(row["Metro"]) else row["Metro"],
    axis=1
)

# Then County → Metro
county_to_metro = (
    df[df["Metro"].notna()][["CountyName", "Metro"]]
    .drop_duplicates()
    .groupby("CountyName")["Metro"].agg(lambda x: x.mode().iloc[0])
    .to_dict()
)
df["Metro"] = df.apply(
    lambda row: county_to_metro.get(row["CountyName"], row["Metro"]) if pd.isna(row["Metro"]) else row["Metro"],
    axis=1
)

# Drop rows still missing Metro or City
df = df[df["Metro"].notna() & df["City"].notna()].copy()
print(f"Remaining after drop: {len(df)} rows")

# Rename ZIP column for clarity
df.rename(columns={"RegionName": "ZIP"}, inplace=True)
df["ZIP"] = df["ZIP"].astype(str)


Loaded 665 ZIPs from GA
Remaining after drop: 558 rows


In [4]:
# Cell 4: Build rolling rows with per-year features and YoY target

long_rows = []
zhvi_cols = get_zhvi_columns(df)

for _, row in df.iterrows():
    series = row[zhvi_cols].astype(float)
    series.index = pd.to_datetime(series.index)

    # Drop if too few points or missing a lot
    if longest_nan_streak(series) > 12:
        continue

    # Interpolate ZHVI for consistency
    series = series.interpolate(method="linear", limit_direction="both")

    # Build per-year rolling rows (2001–2024)
    for year in range(2001, 2024):  # last usable year = 2023
        try:
            cutoff_date = f"{year}-12-31"
            future_date = f"{year + 1}-12-31"

            if cutoff_date not in series.index or future_date not in series.index:
                continue

            subset = series[series.index <= cutoff_date]
            if len(subset) < 24:
                continue

            final_value = subset.iloc[-1]
            first_value = subset.iloc[0]
            avg_yoy = subset.pct_change(periods=12).mean() * 100
            med_yoy = subset.pct_change(periods=12).median() * 100
            vol = volatility(subset)
            cagr_val = cagr(subset)
            neg_years = sum((subset.pct_change(12) < 0).fillna(False))

            yoy_target = compute_yoy(series, year + 1)
            if pd.isna(yoy_target):
                continue

            long_rows.append({
                "ZIP": row["ZIP"],
                "Year": year,
                "Metro": row["Metro"],
                "CountyName": row["CountyName"],
                "StateSizeRank": row["SizeRank"],
                "FinalZHVI": final_value,
                "CAGR": cagr_val,
                "AvgMonthlyGrowth": avg_monthly_growth(subset),
                "AvgYoYGrowth": avg_yoy,
                "MedianYoYGrowth": med_yoy,
                "Volatility": vol,
                "NegativeGrowthYears": neg_years,
                "YoY_target": yoy_target
            })

        except Exception as e:
            continue

rolling_df = pd.DataFrame(long_rows)
print(f"✅ Created long-format dataset: {rolling_df.shape[0]} rows")
display(rolling_df.head())


✅ Created long-format dataset: 9453 rows


Unnamed: 0,ZIP,Year,Metro,CountyName,StateSizeRank,FinalZHVI,CAGR,AvgMonthlyGrowth,AvgYoYGrowth,MedianYoYGrowth,Volatility,NegativeGrowthYears,YoY_target
0,30044,2001,"Atlanta-Sandy Springs-Alpharetta, GA",Gwinnett County,21,151866.001285,6.028174,0.510406,6.713024,6.906419,0.152679,0,4.382387
1,30044,2002,"Atlanta-Sandy Springs-Alpharetta, GA",Gwinnett County,21,158521.357544,5.476715,0.45818,5.798341,5.344689,0.147217,0,2.113305
2,30044,2003,"Atlanta-Sandy Springs-Alpharetta, GA",Gwinnett County,21,161871.397555,4.625617,0.385736,4.876579,4.941521,0.180493,0,1.930199
3,30044,2004,"Atlanta-Sandy Springs-Alpharetta, GA",Gwinnett County,21,164995.83761,4.08089,0.339717,4.158334,4.319463,0.188464,0,2.572523
4,30044,2005,"Atlanta-Sandy Springs-Alpharetta, GA",Gwinnett County,21,169240.393564,3.827964,0.318115,3.755184,2.913979,0.179771,0,2.979661


In [5]:
# Cell 5: merge in ACS ZIP-level features
import requests

API_KEY = "b06c64cd8eadc4fa292ce05e788144d98223fa3f"
acs_vars = [
    "B25003_001E", "B25003_003E", "B08201_001E", "B08201_002E",
    "B15003_001E", "B15003_022E", "B15003_023E", "B15003_024E", "B15003_025E",
    "B23025_003E", "B23025_005E", "B17001_001E", "B17001_002E", "B01003_001E",
    "B01001_003E", "B01001_027E", "B01001_004E", "B01001_028E", "B01001_005E", "B01001_029E",
    "B01001_006E", "B01001_030E", "B01001_007E", "B01001_031E", "B01001_008E", "B01001_032E",
    "B01001_009E", "B01001_033E", "B01001_010E", "B01001_034E", "B01001_020E", "B01001_044E",
    "B11001_001E", "B11001_008E", "B11016_001E", "B11016_010E"
]
url = f"https://api.census.gov/data/2023/acs/acs5?get=NAME,{','.join(acs_vars)}&for=zip%20code%20tabulation%20area:*&key={API_KEY}"
acs_data = requests.get(url).json()
columns = acs_data[0]
rows = acs_data[1:]
acs_df = pd.DataFrame(rows, columns=columns)

acs_df.rename(columns={"zip code tabulation area": "ZIP"}, inplace=True)
acs_df["ZIP"] = acs_df["ZIP"].astype(str)
for col in acs_df.columns:
    if col not in ["ZIP", "NAME"]:
        acs_df[col] = pd.to_numeric(acs_df[col], errors="coerce")

# engineer features
acs_df["Pct_Renter_Occupied"] = 100 * acs_df["B25003_003E"] / acs_df["B25003_001E"]
acs_df["Pct_Bachelors_Or_Higher"] = 100 * (
    acs_df["B15003_022E"] + acs_df["B15003_023E"] + acs_df["B15003_024E"] + acs_df["B15003_025E"]
) / acs_df["B15003_001E"]
acs_df["Unemployment_Rate"] = 100 * acs_df["B23025_005E"] / acs_df["B23025_003E"]
acs_df["Pct_Below_Poverty"] = 100 * acs_df["B17001_002E"] / acs_df["B17001_001E"]
acs_df["Pct_No_Vehicle"] = 100 * acs_df["B08201_002E"] / acs_df["B08201_001E"]
acs_df["Pct_One_Person_HH"] = 100 * acs_df["B11001_008E"] / acs_df["B11001_001E"]
acs_df["Pct_4plus_HH"] = 100 * acs_df["B11016_010E"] / acs_df["B11016_001E"]
acs_df["Pct_Age_0_17"] = 100 * (
    acs_df["B01001_003E"] + acs_df["B01001_027E"] + acs_df["B01001_004E"] + acs_df["B01001_028E"] +
    acs_df["B01001_005E"] + acs_df["B01001_029E"] + acs_df["B01001_006E"] + acs_df["B01001_030E"] +
    acs_df["B01001_007E"] + acs_df["B01001_031E"]
) / acs_df["B01003_001E"]
acs_df["Pct_Age_18_34"] = 100 * (
    acs_df["B01001_008E"] + acs_df["B01001_032E"] + acs_df["B01001_009E"] + acs_df["B01001_033E"] +
    acs_df["B01001_010E"] + acs_df["B01001_034E"]
) / acs_df["B01003_001E"]
acs_df["Pct_Age_65plus"] = 100 * (
    acs_df["B01001_020E"] + acs_df["B01001_044E"]
) / acs_df["B01003_001E"]

# select subset and merge
final_acs = acs_df[["ZIP", "Pct_Renter_Occupied", "Pct_Bachelors_Or_Higher", "Unemployment_Rate",
                    "Pct_Below_Poverty", "Pct_No_Vehicle", "Pct_One_Person_HH", "Pct_4plus_HH",
                    "Pct_Age_0_17", "Pct_Age_18_34", "Pct_Age_65plus"]]

rolling_df = rolling_df.merge(final_acs, on="ZIP", how="left")
print(f"✅ Final merged dataset: {rolling_df.shape}")


✅ Final merged dataset: (9453, 23)


In [6]:
# Cell 6: finalize and export dataset with one-hot encoding

# remove rows with missing target
rolling_df = rolling_df[rolling_df["YoY_target"].notna()].copy()

# group rare Metro and CountyName
def group_rare(series, min_count=2):
    counts = series.value_counts()
    return series.apply(lambda x: x if counts[x] >= min_count else "Other")

rolling_df["Metro"] = group_rare(rolling_df["Metro"])
rolling_df["CountyName"] = group_rare(rolling_df["CountyName"])

# one-hot encode categorical columns
rolling_df = pd.get_dummies(rolling_df, columns=["Metro", "CountyName"], drop_first=True)

# drop NA and reset index
rolling_df = rolling_df.dropna().reset_index(drop=True)

# export to CSV
rolling_df.to_csv("final_processed_dataset.csv", index=False)
print(f"📦 Exported cleaned dataset with {rolling_df.shape[0]} rows and {rolling_df.shape[1]} columns.")
display(rolling_df.head())


📦 Exported cleaned dataset with 9453 rows and 159 columns.


Unnamed: 0,ZIP,Year,StateSizeRank,FinalZHVI,CAGR,AvgMonthlyGrowth,AvgYoYGrowth,MedianYoYGrowth,Volatility,NegativeGrowthYears,...,CountyName_Upson County,CountyName_Walker County,CountyName_Walton County,CountyName_Ware County,CountyName_Washington County,CountyName_Wayne County,CountyName_Whitfield County,CountyName_Wilkes County,CountyName_Wilkinson County,CountyName_Worth County
0,30044,2001,21,151866.001285,6.028174,0.510406,6.713024,6.906419,0.152679,0,...,False,False,False,False,False,False,False,False,False,False
1,30044,2002,21,158521.357544,5.476715,0.45818,5.798341,5.344689,0.147217,0,...,False,False,False,False,False,False,False,False,False,False
2,30044,2003,21,161871.397555,4.625617,0.385736,4.876579,4.941521,0.180493,0,...,False,False,False,False,False,False,False,False,False,False
3,30044,2004,21,164995.83761,4.08089,0.339717,4.158334,4.319463,0.188464,0,...,False,False,False,False,False,False,False,False,False,False
4,30044,2005,21,169240.393564,3.827964,0.318115,3.755184,2.913979,0.179771,0,...,False,False,False,False,False,False,False,False,False,False
