In [4]:
import pandas as pd
import numpy as np
DATA_DIR = "../data"   # adjust if needed

In [5]:

# Build a ZIP x year panel of year-over-year growth in vehicle_count.
#   by_zip_year  : every input row, plus prev_vehicle_count / base_ev / growth_rate
#   growth_panel : only the rows where growth_rate could be computed
by_zip_year = pd.read_csv(f"{DATA_DIR}/registration_ny_by_zip_year.csv")

# Sort by ZIP and year so that shift(1) within a ZIP looks at the preceding observation
by_zip_year = by_zip_year.sort_values(["ZIP", "year"]).reset_index(drop=True)
# print distinct value of zip
print(by_zip_year["ZIP"].nunique())

# Previous year's vehicle_count per ZIP.
# shift(1) returns the previous *row* of the ZIP, which is only the previous *year*
# when the ZIP has no missing years. Mask the lag wherever the preceding row is not
# exactly year - 1, so a multi-year jump is never reported as a one-year growth rate.
# The first row of every ZIP has no predecessor and stays NaN.
_by_zip = by_zip_year.groupby("ZIP")
_lag_count = _by_zip["vehicle_count"].shift(1)
_lag_year = _by_zip["year"].shift(1)
_is_consecutive = (by_zip_year["year"] - _lag_year) == 1
by_zip_year["prev_vehicle_count"] = _lag_count.where(_is_consecutive)

# We'll use prev_vehicle_count as the "base EV stock" for weighting later
by_zip_year["base_ev"] = by_zip_year["prev_vehicle_count"]

# Option 1: treat very small bases as too noisy and drop them
# (e.g., when EVs are just starting, growth can be crazy).
# NOTE: with min_base = 0 this only blanks negative bases, so for non-negative counts
# it filters nothing; raise min_base to actually exclude small bases.
min_base = 0
by_zip_year.loc[by_zip_year["base_ev"] < min_base, "base_ev"] = np.nan

# Safe denominator: if base_ev is NaN or 0, growth_rate will be NaN
denom = by_zip_year["base_ev"].replace({0: np.nan})

by_zip_year["growth_rate"] = (
    (by_zip_year["vehicle_count"] - by_zip_year["prev_vehicle_count"]) / denom
)

# We only keep rows where we can compute a meaningful growth rate, and print number of rows dropped
growth_panel = by_zip_year.dropna(subset=["growth_rate", "base_ev"]).copy()
rows_dropped = len(by_zip_year) - len(growth_panel)
print(f"Rows dropped due to insufficient data: {rows_dropped}")

print("Total rows in panel:", len(by_zip_year))
print("Rows with valid growth_rate:", len(growth_panel))
print("Distinct ZIP codes in panel:", by_zip_year["ZIP"].nunique())
print("Distinct ZIP codes with valid growth_rate:", growth_panel["ZIP"].nunique())

growth_panel

1761
Rows dropped due to insufficient data: 1761
Total rows in panel: 14383
Rows with valid growth_rate: 12622
Distinct ZIP codes in panel: 1761
Distinct ZIP codes with valid growth_rate: 1663


Unnamed: 0,ZIP,year,vehicle_count,lat,lon,prev_vehicle_count,base_ev,growth_rate
1,10001,2014,102,40.7484,-73.9967,92.0,92.0,0.108696
2,10001,2015,69,40.7484,-73.9967,102.0,102.0,-0.323529
3,10001,2016,23,40.7484,-73.9967,69.0,69.0,-0.666667
4,10001,2017,25,40.7484,-73.9967,23.0,23.0,0.086957
5,10001,2018,31,40.7484,-73.9967,25.0,25.0,0.240000
...,...,...,...,...,...,...,...,...
14378,14905,2018,8,42.0869,-76.8397,7.0,7.0,0.142857
14379,14905,2019,17,42.0869,-76.8397,8.0,8.0,1.125000
14380,14905,2020,12,42.0869,-76.8397,17.0,17.0,-0.294118
14381,14905,2021,32,42.0869,-76.8397,12.0,12.0,1.666667


# Growth rate global clustering

##### KMeans

In [6]:
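# NOTE: every line of this KMeans cell is commented out, so it does not run;
# the GMM cell below is the clustering step that actually executes.
# from sklearn.cluster import KMeans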

# # Keep only rows with valid growth & base_ev
# gp = growth_panel.dropna(subset=["growth_rate", "base_ev"]).copy()

# # ---- Trim extreme growth values ----
# # Reasonable bounds: >= -0.9 (cannot lose more than 90%) and <= 50 (5000% growth)
# lower_bound = -0.9
# upper_bound = 50.0
# gp = gp[(gp["growth_rate"] >= lower_bound) & (gp["growth_rate"] <= upper_bound)].copy()

# print("Rows after trimming extremes:", len(gp))

# # ------------- k-means clustering on trimmed growth_rate -------------
# X = gp["growth_rate"].values.reshape(-1, 1)
# k = 3  # slow / medium / fast
# kmeans = KMeans(n_clusters=k, random_state=0, n_init=10)
# gp["cluster"] = kmeans.fit_predict(X)

# # ------------- cluster-level stats & EV-weighted weights -------------
# cluster_stats = (
#     gp.groupby("cluster", as_index=False)
#       .agg(
#           avg_growth=("growth_rate", "mean"),
#           num_obs=("growth_rate", "size"),
#           total_base_ev=("base_ev", "sum"),
#       )
# )

# cluster_stats["weight"] = (
#     cluster_stats["total_base_ev"] / cluster_stats["total_base_ev"].sum()
# )

# print("Cluster stats (raw after trimming):")
# print(cluster_stats)

# # ------------- order clusters by avg_growth and relabel as scenario -------------
# cluster_stats = cluster_stats.sort_values("avg_growth").reset_index(drop=True)
# cluster_stats["scenario"] = cluster_stats.index + 1  # 1=slow, 2=medium, 3=fast

# cluster_to_scenario = dict(zip(cluster_stats["cluster"], cluster_stats["scenario"]))
# gp["scenario"] = gp["cluster"].map(cluster_to_scenario)

# print("\nCluster stats ordered by growth:")
# print(cluster_stats[["scenario", "cluster", "avg_growth", "weight", "num_obs"]])

# # ------------- merge scenario back to full growth_panel -------------
# growth_panel = growth_panel.merge(
#     gp[["ZIP", "year", "growth_rate", "scenario"]],
#     on=["ZIP", "year", "growth_rate"],
#     how="left",
# )

# cluster_stats.to_csv("scenario_growth_weights_trimmed.csv", index=False)
# growth_panel.to_csv("registration_ny_growth_with_scenarios_trimmed.csv", index=False)

# print("\nSaved:")
# print("- scenario_growth_weights_trimmed.csv")
# print("- registration_ny_growth_with_scenarios_trimmed.csv")


##### GMM

In [9]:
from sklearn.mixture import GaussianMixture

# Cluster growth_rate into scenarios with a 1-D Gaussian mixture, label the
# components 1..n by increasing mean growth, attach the label to growth_panel,
# and write both the scenario table and the labelled panel to DATA_DIR.

# --- Keep only valid growth values ---
gp = growth_panel.dropna(subset=["growth_rate", "base_ev"]).copy()

# Restrict to the modelling band. Note this is more than trimming outliers: only
# growth rates in [lower_bound, upper_bound] are clustered, so every row outside
# the band (including all negative growth) ends up with scenario = NaN below.
lower_bound = 0.1
upper_bound = 0.6
gp = gp[(gp["growth_rate"] >= lower_bound) & (gp["growth_rate"] <= upper_bound)].copy()

print("Rows after trimming extremes:", len(gp))

# --- Prepare data for GMM (single feature -> column vector) ---
X = gp["growth_rate"].values.reshape(-1, 1)

# --- Fit GMM (3 scenarios) ---
n_scenarios = 3
gmm = GaussianMixture(n_components=n_scenarios, random_state=0)
gp["gmm_cluster"] = gmm.fit_predict(X)

# Extract scenario-level parameters.
# weights_ are the fitted mixing proportions of the components; base_ev is not
# used in the fit.
# NOTE(review): if EV-stock-weighted scenario weights are intended, aggregate
# base_ev per scenario instead - confirm.
means = gmm.means_.flatten()
weights = gmm.weights_

cluster_stats = pd.DataFrame({
    "gmm_cluster": np.arange(n_scenarios),
    "avg_growth": means,
    "weight": weights
})

# --- Order by avg_growth & relabel as 1..n (1 = lowest mean growth) ---
cluster_stats = cluster_stats.sort_values("avg_growth").reset_index(drop=True)
cluster_stats["scenario"] = cluster_stats.index + 1

mapping = dict(zip(cluster_stats["gmm_cluster"], cluster_stats["scenario"]))
gp["scenario"] = gp["gmm_cluster"].map(mapping)

print("\nScenario stats:")
print(cluster_stats[["scenario", "avg_growth", "weight"]])

# --- Merge scenario labels back to full growth_panel ---
# growth_panel is reassigned here, so on a re-run it already carries a "scenario"
# column; merging again would then produce scenario_x / scenario_y. Dropping any
# existing label first keeps the cell safe to execute more than once.
growth_panel = growth_panel.drop(columns=["scenario"], errors="ignore").merge(
    gp[["ZIP", "year", "growth_rate", "scenario"]],
    on=["ZIP", "year", "growth_rate"],
    how="left"
)

cluster_stats.to_csv(f"{DATA_DIR}/scenario_growth_weights.csv", index=False)
growth_panel.to_csv(f"{DATA_DIR}/registration_ny_growth_with_scenarios.csv", index=False)

print("\nSaved:")
print("- scenario_growth_weights.csv")
print("- registration_ny_growth_with_scenarios.csv")


Rows after trimming extremes: 3025

Scenario stats:
   scenario  avg_growth    weight
0         1    0.186137  0.306058
1         2    0.336315  0.311044
2         3    0.506812  0.382898

Saved:
- scenario_growth_weights.csv
- registration_ny_growth_with_scenarios.csv
