In [None]:
# Cluster high schools by county outage probability and latitude-based climate zone
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
import os

# -----------------------------
# 1. Data path
# -----------------------------
base_path = r"D:\000database"

# All three input tables live directly under base_path.
schools_file = os.path.join(base_path, "all_high_schools_with_geocode.csv")
outage_file = os.path.join(base_path, "county_outage_probabilities_2024.csv")
school_count_file = os.path.join(base_path, "high_schools_per_county.csv")

# -----------------------------
# 2. Import data
# -----------------------------
# Read every column as text first; the numeric columns are cast explicitly
# below. The files are read in the order schools, outage, school_count.
schools, outage, school_count = (
    pd.read_csv(csv_path, dtype=str)
    for csv_path in (schools_file, outage_file, school_count_file)
)

# Outage probabilities -> float
prob_columns = ["prob_5h", "prob_24h", "prob_72h"]
outage[prob_columns] = outage[prob_columns].astype(float)

# Per-county school count -> int
school_count["SchoolCount"] = school_count["SchoolCount"].astype(int)

# School coordinates -> float
coord_columns = ["LAT", "LON"]
schools[coord_columns] = schools[coord_columns].astype(float)

# -----------------------------
# 3. Merge data
# -----------------------------
# Left joins keep every school row even when no outage / count row matches.
# TODO: if schools has a FIPS column, switch to left_on="FIPS"
schools = (
    schools
    .merge(outage, how="left", left_on="MCITY", right_on="county")
    .merge(school_count, how="left", left_on="MCITY", right_on="County")
)

# -----------------------------
# 4. Climate zone
# -----------------------------
def assign_climate_zone(lat):
    """Map a latitude to a coarse climate-zone label.

    The zone is chosen from latitude alone using fixed cut-offs:
    below 33 -> "2A", below 40 -> "3B", below 47 -> "5A", otherwise "7".

    Args:
        lat: Latitude as a number (presumably decimal degrees; confirm
            against the geocoded input), or a missing value (NaN / None).

    Returns:
        The zone label as a string, or None when ``lat`` is missing, so
        that a school without a latitude is not labelled as the coldest
        zone.
    """
    # Every comparison against NaN is False, which would otherwise fall
    # through to the final "Very Cold" branch.
    if pd.isna(lat):
        return None
    if lat < 33:
        return "2A"  # Hot
    if lat < 40:
        return "3B"  # Mixed
    if lat < 47:
        return "5A"  # Cold
    return "7"  # Very Cold

# Label each school with a climate zone derived from its latitude.
schools["ClimateZone"] = schools["LAT"].apply(assign_climate_zone)

# One-hot encode the climate zone: the "ClimateZone" column is replaced by
# one "ClimateZone_<label>" indicator column per label present in the data.
schools = pd.get_dummies(schools, columns=["ClimateZone"])

# -----------------------------
# 5. Features selected for clustering
# -----------------------------
features = ["prob_5h", "prob_24h", "prob_72h"] + \
           [col for col in schools.columns if col.startswith("ClimateZone_")]

# Schools whose left merge found no outage row have NaN probabilities;
# they enter the clustering as 0.
# NOTE(review): 0 reads as "no outage risk" rather than "unknown" — confirm
# this is the intended treatment for unmatched schools.
X = schools[features].fillna(0)

# Standardize each feature before clustering
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# -----------------------------
# 6. (KMeans)
# -----------------------------
# n_init is pinned explicitly: the scikit-learn default changed between
# releases (10, later "auto"), so leaving it implicit makes the cluster
# assignment depend on the installed version.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
schools["Cluster"] = kmeans.fit_predict(X_scaled)

# -----------------------------
# 7. Results
# -----------------------------
# Per-cluster mean of every clustering feature, and the cluster sizes.
cluster_summary = schools.groupby("Cluster")[features].mean()
cluster_counts = schools["Cluster"].value_counts()

print("===== 各 Cluster 特征均值 =====")
print(cluster_summary)
print("\n===== 每个 Cluster 的学校数量 =====")
print(cluster_counts)

# Output: write the merged table, including the cluster label, next to the inputs.
output_file = os.path.join(base_path, "high_schools_clustered.csv")
schools.to_csv(output_file, index=False)
print(f"\nsaved in: {output_file}")