In [1]:
import pandas as pd
import numpy as np

In [6]:
# ================================
# STEP 1 — Load & Basic Sanity Check
# ================================

CSV_PATH = "data/cleaned_data/redfin_cleaned.csv"

# 1) Load
df = pd.read_csv(CSV_PATH)
print(f"Loaded: {len(df):,} rows, {len(df.columns)} columns")

# 2) Normalize column names (lowercase) to avoid KeyError cases
df.columns = [c.lower() for c in df.columns]

# 3) (Optional) keep only zip-level rows if the column exists
if "region_type" in df.columns:
    before = len(df)
    df = df[df["region_type"].str.lower() == "zip code"]
    print(f"Filtered to region_type == 'zip code': {len(df):,} rows (from {before:,})")

# 4) Derive 'year' from period_begin (fallback to period_end)
date_col = "period_begin" if "period_begin" in df.columns else ("period_end" if "period_end" in df.columns else None)
if date_col is None:
    raise ValueError("Expected 'PERIOD_BEGIN' or 'PERIOD_END' to derive year.")

df[date_col] = pd.to_datetime(df[date_col], errors="coerce", utc=True)
df["year"] = df[date_col].dt.year

# 5) Identify key columns
zip_col   = "zip_code" if "zip_code" in df.columns else None
price_col = "median_sale_price" if "median_sale_price" in df.columns else None
assert zip_col is not None and price_col is not None, "Missing 'ZIP_CODE' or 'MEDIAN_SALE_PRICE' in CSV."

# 6) Coerce key fields to numeric & clean
df[zip_col]   = pd.to_numeric(df[zip_col], errors="coerce").astype("Int64")
df["year"]    = pd.to_numeric(df["year"], errors="coerce").astype("Int64")
df[price_col] = pd.to_numeric(df[price_col], errors="coerce")

core = df[[zip_col, "year", price_col]].copy()
before = len(core)
core = core.dropna(subset=[zip_col, "year", price_col])
core = core[core[price_col] > 0]
print(f"Core cleaned rows: {len(core):,} (dropped {before-len(core):,})")

# 7) Quick coverage stats
n_zip = core[zip_col].nunique()
y_min, y_max = int(core["year"].min()), int(core["year"].max())
print({"unique_zips": n_zip, "year_min": y_min, "year_max": y_max})

# 8) Missingness for some useful columns (if present)
maybe_cols = ["median_sale_price_yoy","median_ppsf","homes_sold","inventory","median_dom",
              "avg_sale_to_list","sold_above_list","off_market_in_two_weeks"]
present = [c for c in maybe_cols if c in df.columns]
miss = (df[present].isna().mean()*100).round(1).sort_values(ascending=False) if present else pd.Series(dtype=float)
print("\nMissingness (% of rows) for useful columns:")
display(miss)

# 9) Duplicates check at ZIP × YEAR (helps us plan aggregation)
dup = (core.groupby([zip_col, "year"]).size()
          .rename("rows_per_zip_year")
          .reset_index()
          .sort_values("rows_per_zip_year", ascending=False))
print("\nSample ZIP-year duplicate counts (rows > 1 means multiple records per zip-year):")
display(dup.head(10))

# 10) Peek at a few rows we’ll use downstream
keep_cols = [zip_col, "year", price_col] + [c for c in present if c in df.columns]
print("\nSample rows:")
display(df[keep_cols].head(5))


Loaded: 3,062,620 rows, 39 columns
Filtered to region_type == 'zip code': 3,062,620 rows (from 3,062,620)
Core cleaned rows: 3,062,620 (dropped 0)
{'unique_zips': 24496, 'year_min': 2012, 'year_max': 2025}

Missingness (% of rows) for useful columns:


avg_sale_to_list           4.1
off_market_in_two_weeks    3.0
inventory                  2.4
median_sale_price_yoy      1.4
median_ppsf                1.0
sold_above_list            0.9
homes_sold                 0.0
median_dom                 0.0
dtype: float64


Sample ZIP-year duplicate counts (rows > 1 means multiple records per zip-year):


Unnamed: 0,zip_code,year,rows_per_zip_year
301917,99901,2024,12
301916,99901,2023,12
301915,99901,2022,12
301914,99901,2021,12
301913,99901,2020,12
301912,99901,2019,12
301911,99901,2018,12
301893,99801,2021,12
301892,99801,2020,12
301890,99801,2018,12



Sample rows:


Unnamed: 0,zip_code,year,median_sale_price,median_sale_price_yoy,median_ppsf,homes_sold,inventory,median_dom,avg_sale_to_list,sold_above_list,off_market_in_two_weeks
0,501,2012,180000.0,,,1.0,,39.0,0.8,0.0,
1,501,2012,180000.0,,,1.0,1.0,39.0,0.8,0.0,
2,501,2012,180000.0,,,1.0,1.0,39.0,0.8,0.0,
3,501,2013,255000.0,,,1.0,1.0,19.0,0.944479,0.0,0.0
4,501,2013,255000.0,41.666667,,1.0,1.0,19.0,0.944479,0.0,0.0


Data is clean and we confirmed:

24,496 unique ZIPs,

years span 2012–2025,

but 12 rows per zip-year (monthly/quarterly granularity).

that means before we can engineer features, we need to aggregate ZIP×YEAR into one row each.

In [7]:
# ==========================
# STEP 2 — Aggregate ZIP × YEAR
# ==========================

zip_col = "zip_code"
price_col = "median_sale_price"

# define how each column should aggregate
agg_map = {
    price_col: "mean",                       # average annual price
    "median_sale_price_yoy": "last",         # YoY % change, last period
    "median_ppsf": "mean",
    "homes_sold": "sum",
    "inventory": "mean",
    "median_dom": "mean",
    "avg_sale_to_list": "mean",
    "sold_above_list": "mean",
    "off_market_in_two_weeks": "mean"
}

# filter to only existing columns
agg_map = {k:v for k,v in agg_map.items() if k in df.columns}

zip_year = (
    df.groupby([zip_col, "year"])
      .agg(agg_map)
      .reset_index()
)

print(f"Aggregated shape: {zip_year.shape}")
print(f"Sample ZIP-year rows: {zip_year.head(10)}")

Aggregated shape: (301919, 11)
Sample ZIP-year rows:    zip_code  year  median_sale_price  median_sale_price_yoy  median_ppsf  \
0       501  2012      180000.000000                    NaN          NaN   
1       501  2013      255000.000000              41.666667          NaN   
2       501  2015      239770.000000             -24.691087   195.665236   
3       501  2017      155367.000000             -24.579126    96.024104   
4       501  2018      325000.000000               1.091821          NaN   
5       501  2020      367000.000000              12.923077          NaN   
6       501  2022      485000.000000              32.152589   269.744160   
7       501  2023      565000.000000              16.494845          NaN   
8       501  2025      570000.000000               0.884956   584.016393   
9      1001  2012      157895.833333              -0.154321   110.597204   

   homes_sold  inventory  median_dom  avg_sale_to_list  sold_above_list  \
0         3.0   1.000000   39.00000

Now the data is nicely aggregated to ZIP × YEAR.

In [8]:
# ==========================
# STEP 3 — Engineer Features
# ==========================

def engineer_zip_features(subdf):
    subdf = subdf.sort_values("year")
    
    # latest price & year
    latest_year = subdf["year"].max()
    latest_price = subdf.loc[subdf["year"] == latest_year, "median_sale_price"].values[0]
    
    # CAGR (Compound Annual Growth Rate)
    first_price = subdf["median_sale_price"].iloc[0]
    n_years = subdf["year"].nunique() - 1
    if n_years > 0 and first_price > 0 and latest_price > 0:
        cagr = (latest_price / first_price) ** (1 / n_years) - 1
    else:
        cagr = np.nan
    
    # volatility of YoY price changes
    price_vol = subdf["median_sale_price_yoy"].std(skipna=True)
    
    # turnover: homes_sold ÷ inventory
    if "homes_sold" in subdf and "inventory" in subdf:
        turnover = (subdf["homes_sold"].sum() /
                    (subdf["inventory"].sum() + 1e-6))
    else:
        turnover = np.nan
    
    return pd.Series({
        "latest_year": latest_year,
        "latest_price": latest_price,
        "cagr": cagr * 100 if pd.notna(cagr) else np.nan,   # %
        "price_volatility": price_vol,
        "turnover": turnover,
        "median_dom": subdf["median_dom"].mean(),
        "avg_sale_to_list": subdf["avg_sale_to_list"].mean(),
        "sold_above_list": subdf["sold_above_list"].mean(),
        "off_market_2w": subdf["off_market_in_two_weeks"].mean(),
        "years_covered": subdf["year"].nunique()
    })

zip_features = (
    zip_year.groupby("zip_code")
    .apply(engineer_zip_features)
    .reset_index()
)

print("Engineered shape:", zip_features.shape)
display(zip_features.head(10))


Engineered shape: (24496, 11)


  .apply(engineer_zip_features)


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,years_covered
0,501,2025.0,570000.0,15.498221,24.004558,3.333333,59.555556,0.976128,0.444444,0.0,9.0
1,1001,2025.0,335200.0,5.961651,0.130767,9.07563,73.739881,0.988686,0.353539,0.232828,14.0
2,1002,2025.0,459650.0,4.560576,0.159315,7.59341,106.868452,0.985878,0.248882,0.229176,14.0
3,1003,2024.0,962500.0,14.618268,35.244997,3.227586,128.946429,0.972923,0.089286,0.057143,7.0
4,1004,2025.0,645000.0,82.20339,,4000000.0,49.5,1.006814,0.5,0.0,2.0
5,1005,2025.0,455900.0,9.022625,31.053433,7.643899,75.43869,0.988359,0.325645,0.33913,14.0
6,1007,2025.0,452250.0,5.40341,5.090364,6.261407,103.299405,0.990753,0.288283,0.202024,14.0
7,1008,2025.0,330491.8,4.077038,0.588158,4.444666,138.170238,0.955554,0.222462,0.129834,14.0
8,1009,2025.0,247450.0,10.492696,38.169686,5.63173,114.140909,0.992799,0.31131,0.207322,14.0
9,1010,2025.0,575160.0,9.420289,0.389384,6.334356,91.606548,0.979559,0.273568,0.190299,14.0


We now have a ZIP-level feature table with 24,496 rows and 11 engineered features.

**Scoring for First-Time Buyers**

Now let’s build a composite score (ftb_score) to rank ZIP codes for affordability + stability + growth.

we’ll define four sub-scores:

1. Affordability → cheaper ZIPs score higher

2. Growth → higher CAGR scores higher

3. Market favorability → shorter DOM, lower sale-to-list, fewer bidding wars score higher

4. Stability → lower volatility is better

each sub-score is normalized [0–1], then we blend them into a final ftb_score.

In [9]:
# ==========================
# STEP 4 — Scoring ZIPs
# ==========================

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# --- affordability: lower price = better
zip_features["afford_score"] = 1 - scaler.fit_transform(zip_features[["latest_price"]])

# --- growth: higher CAGR = better
zip_features["growth_score"] = scaler.fit_transform(zip_features[["cagr"]].fillna(0))

# --- market favorability
# we combine shorter DOM + lower sale-to-list (buyers not overpaying) + fewer bidding wars
market_vars = zip_features[["median_dom", "avg_sale_to_list", "sold_above_list"]].copy()

# normalize each
for col in market_vars.columns:
    if col == "median_dom":  # lower is better
        zip_features[f"{col}_score"] = 1 - scaler.fit_transform(market_vars[[col]].fillna(market_vars[col].max()))
    else:  # lower sale-to-list and fewer bidding wars = better
        zip_features[f"{col}_score"] = 1 - scaler.fit_transform(market_vars[[col]].fillna(market_vars[col].max()))

# combine into one market score
zip_features["market_score"] = zip_features[["median_dom_score","avg_sale_to_list_score","sold_above_list_score"]].mean(axis=1)

# --- stability: lower volatility is better
zip_features["stability_score"] = 1 - scaler.fit_transform(zip_features[["price_volatility"]].fillna(zip_features["price_volatility"].max()))

# --- final score
zip_features["ftb_score"] = (
    0.35 * zip_features["afford_score"] +
    0.25 * zip_features["growth_score"] +
    0.25 * zip_features["market_score"] +
    0.15 * zip_features["stability_score"]
)

print("Scoring complete. Top ZIPs:")
display(zip_features.sort_values("ftb_score", ascending=False).head(10))


Scoring complete. Top ZIPs:


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,years_covered,afford_score,growth_score,median_dom_score,avg_sale_to_list_score,sold_above_list_score,market_score,stability_score,ftb_score
6442,24738,2023.0,237000.0,3102.702703,,6000000.0,97.0,0.983673,0.0,0.0,2.0,0.997303,1.0,0.980645,0.63905,1.0,0.873232,0.0,0.817364
16520,63750,2023.0,310000.0,1027.272727,775.965846,6000000.0,56.0,0.902723,0.0,0.0,2.0,0.996469,0.351373,0.988911,0.69946,1.0,0.896124,0.999738,0.810599
17724,70556,2024.0,140500.0,545.977011,104.363723,5.999997,27.75,0.884783,0.0,0.166667,2.0,0.998406,0.200955,0.994607,0.712849,1.0,0.902485,0.999965,0.775297
18110,72168,2023.0,276250.0,578.5401,7.071068,8.999996,171.416667,0.910082,0.0,0.333333,3.0,0.996854,0.211132,0.965642,0.693968,1.0,0.886537,0.999998,0.773316
4393,16835,2024.0,245000.0,297.573284,395.395294,8000000.0,30.0,0.789502,0.0,0.666667,3.0,0.997211,0.123322,0.994153,0.783954,1.0,0.926036,0.999867,0.761343
9892,36773,2024.0,275000.0,2191.666667,,6000000.0,116.5,0.758486,0.0,0.0,2.0,0.996869,0.715277,0.976714,0.8071,1.0,0.927938,0.0,0.759708
9839,36556,2024.0,180000.0,300.0,227.886227,6000000.0,95.5,0.850902,0.0,0.0,2.0,0.997954,0.124081,0.980948,0.738133,1.0,0.90636,0.999923,0.756883
6543,25214,2022.0,122500.0,265.930571,177.413222,7.499996,68.416667,0.821498,0.0,0.0,4.0,0.998611,0.113433,0.986408,0.760076,1.0,0.915495,0.99994,0.756737
16753,64840,2025.0,210100.0,4.459714,7.665549,10.72608,72.895202,0.5,0.0,0.114213,12.0,0.99761,0.031716,0.985505,1.0,1.0,0.995168,0.999997,0.755884
8623,32654,2025.0,109000.0,176.570476,325.065317,3.857141,95.333333,0.718458,0.0,0.0,3.0,0.998766,0.085506,0.980981,0.836972,1.0,0.939318,0.99989,0.755757


Nice — we’ve got the first ranked list of ZIPs by ftb_score ✅.
But I notice something important:

Some cagr values are crazy high (3102%, 2000%+).

That’s because some ZIPs probably had very few sales or missing data in early years, which makes % change explode.

Similarly, turnover with 6,000,000 looks like a missing-data artifact (probably division by near-zero inventory or homes_sold).

#### Before we go to budget ranking, we should cap winsorize / clean extreme values to make the scores more realistic.

In [11]:
# ==========================
# STEP 5 — Handle Outliers
# ==========================

# Cap CAGR at a reasonable upper bound (e.g., 30% per year for housing)
zip_features["cagr"] = zip_features["cagr"].clip(lower=-0.5, upper=0.30)

# Cap turnover to avoid millions (reasonable: 0 to 2, i.e. 200% turnover)
zip_features["turnover"] = zip_features["turnover"].clip(lower=0, upper=2)

# Cap volatility — high values usually mean noisy data
zip_features["price_volatility"] = zip_features["price_volatility"].clip(upper=1)

print("After capping extremes:")
display(zip_features[["zip_code","latest_price","cagr","turnover","price_volatility"]].sample(10))


After capping extremes:


Unnamed: 0,zip_code,latest_price,cagr,turnover,price_volatility
3848,15241,418870.0,0.3,2.0,1.0
2853,12486,498375.0,0.3,2.0,1.0
23649,97114,502400.0,0.3,2.0,1.0
23269,95608,597900.0,0.3,2.0,1.0
21037,85204,428600.0,0.3,2.0,0.334132
19871,78610,327623.4,0.3,2.0,1.0
16637,64123,223100.0,0.3,2.0,0.474438
17821,70783,462120.6,0.3,2.0,0.50546
6932,27253,290000.0,0.3,2.0,1.0
9900,36830,354461.8,0.3,2.0,1.0


Now the data looks much more realistic.
The crazy 3000% CAGR and 6M turnover are gone, capped to meaningful ranges.

In [12]:
# ==========================
# STEP 6 — Recompute Scores
# ==========================

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# --- affordability: cheaper ZIPs score higher
zip_features["afford_score"] = 1 - scaler.fit_transform(zip_features[["latest_price"]])

# --- growth: higher CAGR better (now capped)
zip_features["growth_score"] = scaler.fit_transform(zip_features[["cagr"]].fillna(0))

# --- market favorability
market_vars = zip_features[["median_dom", "avg_sale_to_list", "sold_above_list"]].copy()
for col in market_vars.columns:
    if col == "median_dom":
        zip_features[f"{col}_score"] = 1 - scaler.fit_transform(market_vars[[col]].fillna(market_vars[col].max()))
    else:
        zip_features[f"{col}_score"] = 1 - scaler.fit_transform(market_vars[[col]].fillna(market_vars[col].max()))
zip_features["market_score"] = zip_features[["median_dom_score","avg_sale_to_list_score","sold_above_list_score"]].mean(axis=1)

# --- stability: lower volatility better
zip_features["stability_score"] = 1 - scaler.fit_transform(zip_features[["price_volatility"]].fillna(zip_features["price_volatility"].max()))

# --- final blended score
zip_features["ftb_score"] = (
    0.35 * zip_features["afford_score"] +
    0.25 * zip_features["growth_score"] +
    0.25 * zip_features["market_score"] +
    0.15 * zip_features["stability_score"]
)

print("New top ZIPs after cleaning:")
display(zip_features.sort_values("ftb_score", ascending=False).head(10))


New top ZIPs after cleaning:


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,years_covered,afford_score,growth_score,median_dom_score,avg_sale_to_list_score,sold_above_list_score,market_score,stability_score,ftb_score
3885,15379,2015.0,35000.0,0.3,0.0,2.0,144.333333,0.568282,0.0,0.0,3.0,0.999611,1.0,0.971102,0.949043,1.0,0.973382,1.0,0.993209
10380,38639,2016.0,30000.0,0.3,0.0,2.0,135.666667,0.845295,0.0,0.0,3.0,0.999669,1.0,0.972849,0.742317,1.0,0.905056,1.0,0.976148
13008,48620,2022.0,106000.0,0.3,0.0,2.0,120.0,0.905925,0.0,0.0,3.0,0.9988,1.0,0.976008,0.697071,1.0,0.891026,1.0,0.972337
3286,13649,2020.0,136000.0,0.3,0.0,2.0,104.0,0.945544,0.0,0.0,3.0,0.998457,1.0,0.979234,0.667505,1.0,0.882246,1.0,0.970022
7344,28353,2025.0,148000.0,0.3,0.0,2.0,315.333333,0.902972,0.0,0.0,3.0,0.99832,1.0,0.936626,0.699275,1.0,0.878634,1.0,0.96907
14906,56631,2023.0,335000.0,0.3,0.0,2.0,221.666667,0.935008,0.0,0.0,3.0,0.996183,1.0,0.955511,0.675367,1.0,0.876959,1.0,0.967904
19500,77453,2025.0,219500.0,0.3,0.0,2.0,14.0,1.0,0.0,0.0,4.0,0.997503,1.0,0.997379,0.626866,1.0,0.874748,1.0,0.967813
15217,59801,2025.0,530580.0,0.3,0.185271,2.0,75.852106,0.6,0.0,0.130338,13.0,0.993948,1.0,0.984909,0.925373,1.0,0.970094,0.814729,0.962614
19304,76905,2024.0,243979.125,0.3,0.057136,2.0,57.173077,0.934568,0.00135,0.093175,13.0,0.997223,1.0,0.988675,0.675696,0.99865,0.887673,0.942864,0.962376
20113,79765,2025.0,316138.0,0.3,0.098496,2.0,61.754167,0.942105,0.000195,0.069791,14.0,0.996398,1.0,0.987751,0.670071,0.999805,0.885876,0.901504,0.955434


Nice! That looks so much cleaner and more stable now.
The top ZIPs after capping are dominated by low-price, low-volatility, consistent markets — exactly what we wanted for the “best affordable” filter.

Now let’s create the three budget buckets ($200k, $350k, $500k) like your earlier tables.


In [13]:
# ==========================
# STEP 7 — Budget Filtering
# ==========================

def top_by_budget(df, budget, top_n=25):
    return (
        df[df["latest_price"] <= budget]
        .sort_values("ftb_score", ascending=False)
        .head(top_n)
        .reset_index(drop=True)
    )

best_200k = top_by_budget(zip_features, 200_000)
best_350k = top_by_budget(zip_features, 350_000)
best_500k = top_by_budget(zip_features, 500_000)

print("=== Cleaned Best Affordable ZIPs — Budget $200,000 ===")
display(best_200k)

print("\n=== Cleaned Best Affordable ZIPs — Budget $350,000 ===")
display(best_350k)

print("\n=== Cleaned Best Affordable ZIPs — Budget $500,000 ===")
display(best_500k)


=== Cleaned Best Affordable ZIPs — Budget $200,000 ===


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,years_covered,afford_score,growth_score,median_dom_score,avg_sale_to_list_score,sold_above_list_score,market_score,stability_score,ftb_score
0,15379,2015.0,35000.0,0.3,0.0,2.0,144.333333,0.568282,0.0,0.0,3.0,0.999611,1.0,0.971102,0.949043,1.0,0.973382,1.0,0.993209
1,38639,2016.0,30000.0,0.3,0.0,2.0,135.666667,0.845295,0.0,0.0,3.0,0.999669,1.0,0.972849,0.742317,1.0,0.905056,1.0,0.976148
2,48620,2022.0,106000.0,0.3,0.0,2.0,120.0,0.905925,0.0,0.0,3.0,0.9988,1.0,0.976008,0.697071,1.0,0.891026,1.0,0.972337
3,13649,2020.0,136000.0,0.3,0.0,2.0,104.0,0.945544,0.0,0.0,3.0,0.998457,1.0,0.979234,0.667505,1.0,0.882246,1.0,0.970022
4,28353,2025.0,148000.0,0.3,0.0,2.0,315.333333,0.902972,0.0,0.0,3.0,0.99832,1.0,0.936626,0.699275,1.0,0.878634,1.0,0.96907
5,66606,2025.0,105500.0,0.3,0.133541,2.0,80.251923,0.934897,0.003872,0.145139,13.0,0.998806,1.0,0.984022,0.67545,0.996128,0.8852,0.866459,0.950851
6,15553,2021.0,100000.0,0.3,0.0,2.0,374.6,0.892183,0.3,0.0,2.0,0.998869,1.0,0.924677,0.707326,0.7,0.777335,1.0,0.943938
7,77590,2025.0,167460.0,0.3,0.089408,2.0,61.377435,0.946982,0.161481,0.274296,14.0,0.998098,1.0,0.987827,0.666432,0.838519,0.830926,0.910592,0.943654
8,36330,2025.0,198380.0,0.3,0.092991,2.0,104.85,0.961542,0.136516,0.090974,14.0,0.997744,1.0,0.979062,0.655566,0.863484,0.832704,0.907009,0.943438
9,70563,2025.0,187796.0,0.3,0.122888,2.0,93.530952,0.958942,0.098604,0.112857,14.0,0.997865,1.0,0.981345,0.657506,0.901396,0.846749,0.877112,0.942507



=== Cleaned Best Affordable ZIPs — Budget $350,000 ===


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,years_covered,afford_score,growth_score,median_dom_score,avg_sale_to_list_score,sold_above_list_score,market_score,stability_score,ftb_score
0,15379,2015.0,35000.0,0.3,0.0,2.0,144.333333,0.568282,0.0,0.0,3.0,0.999611,1.0,0.971102,0.949043,1.0,0.973382,1.0,0.993209
1,38639,2016.0,30000.0,0.3,0.0,2.0,135.666667,0.845295,0.0,0.0,3.0,0.999669,1.0,0.972849,0.742317,1.0,0.905056,1.0,0.976148
2,48620,2022.0,106000.0,0.3,0.0,2.0,120.0,0.905925,0.0,0.0,3.0,0.9988,1.0,0.976008,0.697071,1.0,0.891026,1.0,0.972337
3,13649,2020.0,136000.0,0.3,0.0,2.0,104.0,0.945544,0.0,0.0,3.0,0.998457,1.0,0.979234,0.667505,1.0,0.882246,1.0,0.970022
4,28353,2025.0,148000.0,0.3,0.0,2.0,315.333333,0.902972,0.0,0.0,3.0,0.99832,1.0,0.936626,0.699275,1.0,0.878634,1.0,0.96907
5,56631,2023.0,335000.0,0.3,0.0,2.0,221.666667,0.935008,0.0,0.0,3.0,0.996183,1.0,0.955511,0.675367,1.0,0.876959,1.0,0.967904
6,77453,2025.0,219500.0,0.3,0.0,2.0,14.0,1.0,0.0,0.0,4.0,0.997503,1.0,0.997379,0.626866,1.0,0.874748,1.0,0.967813
7,76905,2024.0,243979.125,0.3,0.057136,2.0,57.173077,0.934568,0.00135,0.093175,13.0,0.997223,1.0,0.988675,0.675696,0.99865,0.887673,0.942864,0.962376
8,79765,2025.0,316138.0,0.3,0.098496,2.0,61.754167,0.942105,0.000195,0.069791,14.0,0.996398,1.0,0.987751,0.670071,0.999805,0.885876,0.901504,0.955434
9,75707,2025.0,345849.9,0.3,0.113513,2.0,35.975595,0.959051,0.010247,0.145793,14.0,0.996059,1.0,0.992948,0.657425,0.989753,0.880042,0.886487,0.951604



=== Cleaned Best Affordable ZIPs — Budget $500,000 ===


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,years_covered,afford_score,growth_score,median_dom_score,avg_sale_to_list_score,sold_above_list_score,market_score,stability_score,ftb_score
0,15379,2015.0,35000.0,0.3,0.0,2.0,144.333333,0.568282,0.0,0.0,3.0,0.999611,1.0,0.971102,0.949043,1.0,0.973382,1.0,0.993209
1,38639,2016.0,30000.0,0.3,0.0,2.0,135.666667,0.845295,0.0,0.0,3.0,0.999669,1.0,0.972849,0.742317,1.0,0.905056,1.0,0.976148
2,48620,2022.0,106000.0,0.3,0.0,2.0,120.0,0.905925,0.0,0.0,3.0,0.9988,1.0,0.976008,0.697071,1.0,0.891026,1.0,0.972337
3,13649,2020.0,136000.0,0.3,0.0,2.0,104.0,0.945544,0.0,0.0,3.0,0.998457,1.0,0.979234,0.667505,1.0,0.882246,1.0,0.970022
4,28353,2025.0,148000.0,0.3,0.0,2.0,315.333333,0.902972,0.0,0.0,3.0,0.99832,1.0,0.936626,0.699275,1.0,0.878634,1.0,0.96907
5,56631,2023.0,335000.0,0.3,0.0,2.0,221.666667,0.935008,0.0,0.0,3.0,0.996183,1.0,0.955511,0.675367,1.0,0.876959,1.0,0.967904
6,77453,2025.0,219500.0,0.3,0.0,2.0,14.0,1.0,0.0,0.0,4.0,0.997503,1.0,0.997379,0.626866,1.0,0.874748,1.0,0.967813
7,76905,2024.0,243979.125,0.3,0.057136,2.0,57.173077,0.934568,0.00135,0.093175,13.0,0.997223,1.0,0.988675,0.675696,0.99865,0.887673,0.942864,0.962376
8,79765,2025.0,316138.0,0.3,0.098496,2.0,61.754167,0.942105,0.000195,0.069791,14.0,0.996398,1.0,0.987751,0.670071,0.999805,0.885876,0.901504,0.955434
9,93634,2018.0,389729.0,0.3,0.103908,2.0,69.0,0.936971,0.0,0.0,3.0,0.995557,1.0,0.98629,0.673902,1.0,0.886731,0.896092,0.954542


Insights:

- 200k bucket → Clean list of ZIPs under 200k, mostly stable and capped, with very high ftb_score values.

- 350k bucket → Expands the list, includes ZIPs up to the mid-300k range, while still keeping good scores.

- 500k bucket → Similar structure but adds higher-price ZIPs (up to ~400k), still scored consistently.

This means your affordability + stability filters are successfully prioritizing steady, entry-level markets.

Right now, the tables only show zip_code. To make them more interpretable, we should merge back the location info (like city, state, metro) from your original Redfin dataset.

In [21]:
# ==========================
# STEP 8 — Geo labels per ZIP (latest by period_end) + merge
# ==========================
import pandas as pd

raw = pd.read_csv("data/cleaned_data/redfin_cleaned.csv")
raw.columns = [c.lower() for c in raw.columns]

# parse period_end to ensure we truly take the latest row per ZIP
raw["period_end"] = pd.to_datetime(raw["period_end"], errors="coerce", utc=True)

# keep only the columns we need for geo context
geo_cols = ["zip_code", "period_end", "state", "state_code", "parent_metro_region"]
geo = raw[geo_cols].dropna(subset=["zip_code"]).sort_values(["zip_code","period_end"])

# latest labels per ZIP (by period_end)
zip_geo_latest = (
    geo.groupby("zip_code", as_index=False, sort=False)
       .tail(1)[["zip_code","state","state_code","parent_metro_region"]]
)

# merge geo into each ranked table
best_200k_geo = best_200k.merge(zip_geo_latest, on="zip_code", how="left")
best_350k_geo = best_350k.merge(zip_geo_latest, on="zip_code", how="left")
best_500k_geo = best_500k.merge(zip_geo_latest, on="zip_code", how="left")

print("=== $200k — with geo ===")
display(best_200k_geo.head(10)[["zip_code","state","state_code","parent_metro_region","latest_price","ftb_score"]])


=== $200k — with geo ===


Unnamed: 0,zip_code,state,state_code,parent_metro_region,latest_price,ftb_score
0,15379,Pennsylvania,PA,"Pittsburgh, PA",35000.0,0.993209
1,38639,Mississippi,MS,"Clarksdale, MS",30000.0,0.976148
2,48620,Michigan,MI,"Midland, MI",106000.0,0.972337
3,13649,New York,NY,"Ogdensburg, NY",136000.0,0.970022
4,28353,North Carolina,NC,"Laurinburg, NC",148000.0,0.96907
5,66606,Kansas,KS,"Topeka, KS",105500.0,0.950851
6,15553,Pennsylvania,PA,"Somerset, PA",100000.0,0.943938
7,77590,Texas,TX,"Houston, TX",167460.0,0.943654
8,36330,Alabama,AL,"Enterprise, AL",198380.0,0.943438
9,70563,Louisiana,LA,"Lafayette, LA",187796.0,0.942507


Geo merge looks perfect. let’s keep rolling.

Now let's add a short “why it ranks” explanation + export CSVs

This makes the tables presentation-ready.

In [25]:
# ==========================
# STEP 9 — Explain + Export
# ==========================

def explain_row(r):
    """Build a simple human-readable explanation for why a ZIP ranks well."""
    bits = []
    # affordability
    if r.get("afford_score", 1) >= 0.8 or r["latest_price"] <= 0.6 * r.get("latest_price", r["latest_price"]):
        bits.append("well under budget")
    elif r.get("afford_score", 1) >= 0.6:
        bits.append("under budget")

    # growth
    if r.get("growth_score", 0) >= 0.75:
        bits.append("strong recent growth")
    elif r.get("growth_score", 0) >= 0.5:
        bits.append("moderate growth")

    # market
    if r.get("market_score", 0) >= 0.6:
        bits.append("healthy market (DOM/sale-to-list)")

    # stability
    if r.get("stability_score", 0) >= 0.6:
        bits.append("relatively stable")

    return ", ".join(bits)


# Export all ranking tables to organized folder
for name, tbl in {
    "best_200k_geo": best_200k_geo,
    "best_350k_geo": best_350k_geo,
    "best_500k_geo": best_500k_geo,
}.items():
    out = tbl.copy()

    # Bring over component scores if missing
    if "afford_score" not in out and "afford_score" in best_200k.columns:
        out = out.merge(
            best_200k[["zip_code", "afford_score", "growth_score", "market_score", "stability_score"]]
            .drop_duplicates("zip_code"),
            on="zip_code", how="left"
        )

    # Add textual explanation column
    out["why_it_ranks"] = out.apply(explain_row, axis=1)

    # Preview
    display(out.head(10))

    # Save in organized folder
    out.to_csv(f"outputs/rankings/{name}.csv", index=False)

print("✅ Rankings exported to outputs/rankings/")


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,...,median_dom_score,avg_sale_to_list_score,sold_above_list_score,market_score,stability_score,ftb_score,state,state_code,parent_metro_region,why_it_ranks
0,15379,2015.0,35000.0,0.3,0.0,2.0,144.333333,0.568282,0.0,0.0,...,0.971102,0.949043,1.0,0.973382,1.0,0.993209,Pennsylvania,PA,"Pittsburgh, PA","well under budget, strong recent growth, healt..."
1,38639,2016.0,30000.0,0.3,0.0,2.0,135.666667,0.845295,0.0,0.0,...,0.972849,0.742317,1.0,0.905056,1.0,0.976148,Mississippi,MS,"Clarksdale, MS","well under budget, strong recent growth, healt..."
2,48620,2022.0,106000.0,0.3,0.0,2.0,120.0,0.905925,0.0,0.0,...,0.976008,0.697071,1.0,0.891026,1.0,0.972337,Michigan,MI,"Midland, MI","well under budget, strong recent growth, healt..."
3,13649,2020.0,136000.0,0.3,0.0,2.0,104.0,0.945544,0.0,0.0,...,0.979234,0.667505,1.0,0.882246,1.0,0.970022,New York,NY,"Ogdensburg, NY","well under budget, strong recent growth, healt..."
4,28353,2025.0,148000.0,0.3,0.0,2.0,315.333333,0.902972,0.0,0.0,...,0.936626,0.699275,1.0,0.878634,1.0,0.96907,North Carolina,NC,"Laurinburg, NC","well under budget, strong recent growth, healt..."
5,66606,2025.0,105500.0,0.3,0.133541,2.0,80.251923,0.934897,0.003872,0.145139,...,0.984022,0.67545,0.996128,0.8852,0.866459,0.950851,Kansas,KS,"Topeka, KS","well under budget, strong recent growth, healt..."
6,15553,2021.0,100000.0,0.3,0.0,2.0,374.6,0.892183,0.3,0.0,...,0.924677,0.707326,0.7,0.777335,1.0,0.943938,Pennsylvania,PA,"Somerset, PA","well under budget, strong recent growth, healt..."
7,77590,2025.0,167460.0,0.3,0.089408,2.0,61.377435,0.946982,0.161481,0.274296,...,0.987827,0.666432,0.838519,0.830926,0.910592,0.943654,Texas,TX,"Houston, TX","well under budget, strong recent growth, healt..."
8,36330,2025.0,198380.0,0.3,0.092991,2.0,104.85,0.961542,0.136516,0.090974,...,0.979062,0.655566,0.863484,0.832704,0.907009,0.943438,Alabama,AL,"Enterprise, AL","well under budget, strong recent growth, healt..."
9,70563,2025.0,187796.0,0.3,0.122888,2.0,93.530952,0.958942,0.098604,0.112857,...,0.981345,0.657506,0.901396,0.846749,0.877112,0.942507,Louisiana,LA,"Lafayette, LA","well under budget, strong recent growth, healt..."


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,...,median_dom_score,avg_sale_to_list_score,sold_above_list_score,market_score,stability_score,ftb_score,state,state_code,parent_metro_region,why_it_ranks
0,15379,2015.0,35000.0,0.3,0.0,2.0,144.333333,0.568282,0.0,0.0,...,0.971102,0.949043,1.0,0.973382,1.0,0.993209,Pennsylvania,PA,"Pittsburgh, PA","well under budget, strong recent growth, healt..."
1,38639,2016.0,30000.0,0.3,0.0,2.0,135.666667,0.845295,0.0,0.0,...,0.972849,0.742317,1.0,0.905056,1.0,0.976148,Mississippi,MS,"Clarksdale, MS","well under budget, strong recent growth, healt..."
2,48620,2022.0,106000.0,0.3,0.0,2.0,120.0,0.905925,0.0,0.0,...,0.976008,0.697071,1.0,0.891026,1.0,0.972337,Michigan,MI,"Midland, MI","well under budget, strong recent growth, healt..."
3,13649,2020.0,136000.0,0.3,0.0,2.0,104.0,0.945544,0.0,0.0,...,0.979234,0.667505,1.0,0.882246,1.0,0.970022,New York,NY,"Ogdensburg, NY","well under budget, strong recent growth, healt..."
4,28353,2025.0,148000.0,0.3,0.0,2.0,315.333333,0.902972,0.0,0.0,...,0.936626,0.699275,1.0,0.878634,1.0,0.96907,North Carolina,NC,"Laurinburg, NC","well under budget, strong recent growth, healt..."
5,56631,2023.0,335000.0,0.3,0.0,2.0,221.666667,0.935008,0.0,0.0,...,0.955511,0.675367,1.0,0.876959,1.0,0.967904,Minnesota,MN,"Grand Rapids, MN","well under budget, strong recent growth, healt..."
6,77453,2025.0,219500.0,0.3,0.0,2.0,14.0,1.0,0.0,0.0,...,0.997379,0.626866,1.0,0.874748,1.0,0.967813,Texas,TX,"El Campo, TX","well under budget, strong recent growth, healt..."
7,76905,2024.0,243979.125,0.3,0.057136,2.0,57.173077,0.934568,0.00135,0.093175,...,0.988675,0.675696,0.99865,0.887673,0.942864,0.962376,Texas,TX,"San Angelo, TX","well under budget, strong recent growth, healt..."
8,79765,2025.0,316138.0,0.3,0.098496,2.0,61.754167,0.942105,0.000195,0.069791,...,0.987751,0.670071,0.999805,0.885876,0.901504,0.955434,Texas,TX,"Midland, TX","well under budget, strong recent growth, healt..."
9,75707,2025.0,345849.9,0.3,0.113513,2.0,35.975595,0.959051,0.010247,0.145793,...,0.992948,0.657425,0.989753,0.880042,0.886487,0.951604,Texas,TX,"Tyler, TX","well under budget, strong recent growth, healt..."


Unnamed: 0,zip_code,latest_year,latest_price,cagr,price_volatility,turnover,median_dom,avg_sale_to_list,sold_above_list,off_market_2w,...,median_dom_score,avg_sale_to_list_score,sold_above_list_score,market_score,stability_score,ftb_score,state,state_code,parent_metro_region,why_it_ranks
0,15379,2015.0,35000.0,0.3,0.0,2.0,144.333333,0.568282,0.0,0.0,...,0.971102,0.949043,1.0,0.973382,1.0,0.993209,Pennsylvania,PA,"Pittsburgh, PA","well under budget, strong recent growth, healt..."
1,38639,2016.0,30000.0,0.3,0.0,2.0,135.666667,0.845295,0.0,0.0,...,0.972849,0.742317,1.0,0.905056,1.0,0.976148,Mississippi,MS,"Clarksdale, MS","well under budget, strong recent growth, healt..."
2,48620,2022.0,106000.0,0.3,0.0,2.0,120.0,0.905925,0.0,0.0,...,0.976008,0.697071,1.0,0.891026,1.0,0.972337,Michigan,MI,"Midland, MI","well under budget, strong recent growth, healt..."
3,13649,2020.0,136000.0,0.3,0.0,2.0,104.0,0.945544,0.0,0.0,...,0.979234,0.667505,1.0,0.882246,1.0,0.970022,New York,NY,"Ogdensburg, NY","well under budget, strong recent growth, healt..."
4,28353,2025.0,148000.0,0.3,0.0,2.0,315.333333,0.902972,0.0,0.0,...,0.936626,0.699275,1.0,0.878634,1.0,0.96907,North Carolina,NC,"Laurinburg, NC","well under budget, strong recent growth, healt..."
5,56631,2023.0,335000.0,0.3,0.0,2.0,221.666667,0.935008,0.0,0.0,...,0.955511,0.675367,1.0,0.876959,1.0,0.967904,Minnesota,MN,"Grand Rapids, MN","well under budget, strong recent growth, healt..."
6,77453,2025.0,219500.0,0.3,0.0,2.0,14.0,1.0,0.0,0.0,...,0.997379,0.626866,1.0,0.874748,1.0,0.967813,Texas,TX,"El Campo, TX","well under budget, strong recent growth, healt..."
7,76905,2024.0,243979.125,0.3,0.057136,2.0,57.173077,0.934568,0.00135,0.093175,...,0.988675,0.675696,0.99865,0.887673,0.942864,0.962376,Texas,TX,"San Angelo, TX","well under budget, strong recent growth, healt..."
8,79765,2025.0,316138.0,0.3,0.098496,2.0,61.754167,0.942105,0.000195,0.069791,...,0.987751,0.670071,0.999805,0.885876,0.901504,0.955434,Texas,TX,"Midland, TX","well under budget, strong recent growth, healt..."
9,93634,2018.0,389729.0,0.3,0.103908,2.0,69.0,0.936971,0.0,0.0,...,0.98629,0.673902,1.0,0.886731,0.896092,0.954542,California,CA,"Fresno, CA","well under budget, strong recent growth, healt..."


✅ Rankings exported to outputs/rankings/


Here is what we've done in this notebook:

- ✅ Loaded + cleaned raw Redfin ZIP-level data

- ✅ Aggregated to ZIP-year to avoid duplicates

- ✅ Engineered features:

    * Latest price

    * CAGR (growth)

    * Price volatility

    * Market turnover

    * Market health (DOM, sale-to-list, sold_above_list, off_market)

    * Stability (# of years covered)

- ✅ Scored & ranked ZIPs for different affordability tiers ($200k, $350k, $500k)

- ✅ Exported ranked outputs + explanations