In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# Load the two input tables from the shared data directory.
# df1 is pivoted further down by 'zipcode' and 'facgroup'; df2 supplies the
# 'zip' and 'revised_market_value' columns used for the per-zip averages.
facilities_csv = '../Data/public_fac.csv'
values_csv = '../Data/prop_values.csv'
df1 = pd.read_csv(facilities_csv)
df2 = pd.read_csv(values_csv)

In [None]:
# Count rows of df1 per (zipcode, facgroup): every row gets a constant weight
# of 1, so summing 'score' inside the pivot yields a row count per cell.
df1['score'] = 1
df_p = df1.pivot_table(
    values='score',
    index='zipcode',
    columns='facgroup',
    aggfunc='sum',
).fillna(0)  # combinations with no rows become 0 instead of NaN

In [None]:
# Average 'revised_market_value' for each zip code; the result is a
# single-column frame indexed by 'zip'.
df2_g = df2.groupby('zip')[['revised_market_value']].mean()

In [None]:
# Expose the pivot's 'zipcode' index as a regular 'zip' column so it can be
# matched against the 'zip' index level of the per-zip average values.
df_p['zip'] = df_p.index
# Inner join (pandas default): only zip codes present in both tables survive.
df_m = df_p.merge(df2_g, on='zip')
# Exclude zip code 11201 from the analysis.
# NOTE(review): the reason for excluding 11201 is not recorded here
# (presumably an outlier) — please document it.
# .copy() makes df_m an independent frame: a later cell adds a "Cluster"
# column to it, and writing into a boolean-filtered result without copying
# triggers pandas' SettingWithCopyWarning.
df_m = df_m[df_m['zip'] != 11201.0].copy()

In [None]:
# Feature columns fed to scaling, K-Means and PCA: the per-facgroup counts
# plus 'revised_market_value'.
#
# The previous positional form (columns[:-1] + 'revised_market_value') selected
# every column of df_m, so the 'zip' identifier was standardised and clustered
# as if it were a numeric feature. It also depended on column position, so
# re-running this cell after "Cluster" had been added to df_m would pull the
# label in as a feature. Selecting by name avoids both problems.
non_feature_cols = ('zip', 'Cluster')
to_scale = [col for col in df_m.columns if col not in non_feature_cols]

In [None]:
# Standardise every selected column (StandardScaler: zero mean, unit variance)
# so that no single column dominates the distance-based steps below.
# df_s gets a fresh default index; its rows are in the same order as df_m.
scaler = StandardScaler().fit(df_m[to_scale])
df_s = pd.DataFrame(data=scaler.transform(df_m[to_scale]), columns=to_scale)

In [None]:
# Elbow analysis: fit K-Means for K = 1 .. n_max - 1 and record the
# within-cluster sum of squares (inertia) of each fit.
n_max = 15
k_values = range(1, n_max)
wcss = []
for k in k_values:
    # Fixed seed and n_init keep the curve reproducible between runs.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(df_s[to_scale])
    wcss.append(kmeans.inertia_)

# Plot inertia against K; the "elbow" of the curve suggests a cluster count.
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, wcss, marker="o", linestyle="-")
ax.set_xlabel("Number of Clusters (K)")
ax.set_ylabel("Within-Cluster Sum of Squares (WCSS)")
ax.set_title("Elbow Method for Optimal K")
ax.grid(True)
plt.show()

In [None]:
# Final clustering. NOTE(review): K = 4 is presumably read off the elbow plot
# above — confirm and record the rationale.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)

# Fit once and reuse the labels for both frames. Calling fit_predict twice
# refits the model needlessly and only yields matching labels because the
# seed is fixed.
cluster_labels = kmeans.fit_predict(df_s[to_scale])

# df_s was built row-for-row from df_m[to_scale], so the label array lines up
# positionally with both frames.
df_s["Cluster"] = cluster_labels
# assign() returns a new frame instead of writing into df_m in place, which
# avoids SettingWithCopyWarning when df_m is the result of a boolean filter.
df_m = df_m.assign(Cluster=cluster_labels)

In [None]:
import seaborn as sns

# Project the standardised features onto their first two principal
# components so the clusters can be drawn in 2-D.
pca = PCA(n_components=2)
pca_components = pca.fit_transform(df_s[to_scale])
df_s["PCA1"] = pca_components[:, 0]
df_s["PCA2"] = pca_components[:, 1]

# Loadings table: one row per component, one column per original feature.
pca_loadings = pd.DataFrame(
    pca.components_,
    index=["PCA1", "PCA2"],
    columns=to_scale,
)

# Scatter of the two components, coloured by the K-Means cluster label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=df_s,
    x="PCA1",
    y="PCA2",
    hue="Cluster",
    palette="viridis",
    alpha=0.7,
    ax=ax,
)
ax.set_title("Cluster Visualization using PCA")
ax.set_xlabel("PCA Component 1")
ax.set_ylabel("PCA Component 2")
ax.legend(title="Cluster")
ax.grid(True)
plt.show()

In [None]:
# Profile each cluster: express every cluster's column means as a z-score
# relative to the whole data set,
#     (cluster mean - overall mean) / overall standard deviation,
# using pandas' default (sample) standard deviation.
value_col = 'revised_market_value'

# Keep the "Cluster" label out of the overall statistics; otherwise the
# subtraction below aligns on a column the cluster means do not have and
# produces a spurious all-NaN "Cluster" column.
features = df_m.drop(columns='Cluster')
c_avg = df_m.groupby('Cluster').mean()
c_d = (c_avg - features.mean()) / features.std()

# c_diff holds only the per-facgroup columns; 'zip' and the housing value are
# reported separately (or not at all) below.
c_diff = c_d.drop(columns=['zip', value_col])

# Rows per cluster, computed once instead of on every loop iteration.
cluster_sizes = df_m.groupby('Cluster').size()

for c in range(optimal_k):
    print('Cluster: ', c, ' Number of zip codes: ', cluster_sizes.iloc[c])
    # Look the housing-value column up by name rather than by a hard-coded
    # position, which silently breaks when the number or order of columns
    # changes.
    print('z score of housing value:', c_d[value_col].iloc[c])
    profile = c_diff.iloc[c]
    # Five most over-represented columns for this cluster ...
    print(profile.sort_values(ascending=False).iloc[0:5])
    print('\n')
    # ... and the five most under-represented.
    print(profile.sort_values(ascending=True).iloc[0:5])
    print('\n')

In [None]:
import plotly.express as px
from urllib.request import urlopen
import json
# UNCOMMENT TO SEE PLOT
# The code below is disabled by wrapping it in a triple-quoted string; remove
# the surrounding quotes to run it. It loads a zip-code GeoJSON file into
# `nyc_zip`, which the (also disabled) choropleth cell passes as `geojson`.
# NOTE(review): this path uses '../data/' while the CSVs above are read from
# '../Data/' — confirm the directory name on case-sensitive filesystems.
'''
with open('../data/ny_new_york_zip_codes_geo.min.json', 'rb') as f:
    nyc_zip = json.load(f)
'''

In [None]:
# Disabled choropleth (wrapped in a string literal; remove the quotes to run).
# It colours each zip-code polygon in `nyc_zip` by the "Cluster" column of
# df_m, matching df_m['zip'] against the GeoJSON 'properties.ZCTA5CE10' field.
# Requires `nyc_zip` to be loaded first by un-commenting the GeoJSON cell.
# NOTE(review): df_m['zip'] is filtered against 11201.0 earlier, so it looks
# numeric; the GeoJSON property values are presumably strings — confirm the
# two actually match, otherwise no regions will be coloured.
'''
fig = px.choropleth_map(
    df_m,
    geojson=nyc_zip,
    featureidkey='properties.ZCTA5CE10',
    locations='zip',
    color='Cluster',
    map_style='outdoors',
    center={'lat': 40.71, "lon": -74.01},
    zoom = 8
)
fig.show()
'''