In [None]:
import os
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# -----------------------------
# Run configuration
# -----------------------------
# Clustering / plotting knobs.
kmeans_clusters = 3      # number of KMeans clusters fitted per year
plot_sample_size = 5000  # upper bound on points drawn per scatter plot (speed)

# Survey years to process; note that 2018 is absent from this list.
years = [2013, 2014, 2015, 2016, 2017, 2019]

# Per-year inputs are read from `input_dir`; `output_dir` is created up front
# (a no-op when it already exists).
input_dir, output_dir = "data_by_year", "results_by_year"
os.makedirs(output_dir, exist_ok=True)

In [None]:

# -----------------------------
# Main loop
# -----------------------------
# For each survey year: load the merged CSV, project its numeric columns onto
# two principal components, cluster the rows with KMeans, and write the
# per-year tables and scatter plot into `output_dir`.
for year in years:
    print(f"\nProcessing year {year}...")

    # 1️⃣ Load data (CountyFIPS is read as text rather than parsed as a number)
    file_path = os.path.join(input_dir, f"merged_data_{year}.csv")
    df = pd.read_csv(file_path, dtype={'CountyFIPS': str})

    # 2️⃣ Remove fully empty columns
    df_clean = df.dropna(axis=1, how='all')

    # Keep numeric columns only; these are the features for PCA / KMeans
    numeric_cols = df_clean.select_dtypes(include='number').columns
    df_numeric = df_clean[numeric_cols]

    # 3️⃣ Impute missing values (median)
    imputer = SimpleImputer(strategy='median')
    X_imputed = imputer.fit_transform(df_numeric)

    # 4️⃣ Standardize
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_imputed)

    # 5️⃣ PCA (2 components, used for the scatter plot)
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    pca_df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
    pca_df['CountyFIPS'] = df_clean['CountyFIPS'].values
    pca_df['Survey_Year'] = year
    pca_df.to_csv(os.path.join(output_dir, f"pca_{year}.csv"), index=False)

    # 6️⃣ KMeans clustering (fitted on the standardized features, not on the PCs)
    kmeans = KMeans(n_clusters=kmeans_clusters, random_state=42)
    kmeans_labels = kmeans.fit_predict(X_scaled)
    clusters_df = pd.DataFrame({
        'CountyFIPS': df_clean['CountyFIPS'].values,
        'Survey_Year': year,
        'KMeans_Cluster': kmeans_labels
    })
    clusters_df.to_csv(os.path.join(output_dir, f"clusters_{year}.csv"), index=False)

    # 7️⃣ Visualization (downsample for speed)
    # pca_df and kmeans_labels are both derived from X_scaled, so they share the
    # same row order and the labels can be attached positionally. Merging on
    # (CountyFIPS, Survey_Year) instead would pair every row of a county with
    # every other row of that county whenever a FIPS code repeats within a
    # year, duplicating points and mismatching their cluster labels.
    plot_df = pca_df.assign(KMeans_Cluster=kmeans_labels)
    plot_df_sample = plot_df.sample(min(len(plot_df), plot_sample_size), random_state=42)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=plot_df_sample, x='PC1', y='PC2', hue='KMeans_Cluster',
                    palette='Set1', s=20, ax=ax)
    ax.set_title(f"{year} - KMeans Clusters")
    fig.tight_layout()
    # Write the figure before closing it; closing without saving or showing
    # would discard the plot.
    fig.savefig(os.path.join(output_dir, f"clusters_plot_{year}.png"))
    plt.close(fig)  # release the figure so memory does not grow across years

    print(f"Year {year} done: PCA + KMeans clusters saved.")