In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import os

# Remove Outliers and Scale Data

In [None]:
# Load the spatially joined table; the raw 'POP' column is renamed to 'POPULATION'
# so it matches the feature names used throughout the notebook.
df = pd.read_csv('spatialjoined70_v2.csv', index_col=0).rename(columns={'POP': 'POPULATION'})

#### REMOVE OUTLIERS ####
from scipy import stats

value_columns = ['SURFACE', 'HEIGHT', 'POPULATION', 'VOLUME', 'LAND']

# Derive the working frame from `df` explicitly (it was previously read before being
# defined, which fails on a fresh kernel). Rows with a missing feature are dropped first:
# zscore propagates NaN by default, which would turn a whole column into NaN and make
# the `< 3` filter below reject every row.
df_features = df.dropna(subset=value_columns)

# Keep only rows whose every feature lies within 3 standard deviations of its column mean.
z_scores = np.abs(stats.zscore(df_features[value_columns]))
df_clean = df_features[(z_scores < 3).all(axis=1)]

from sklearn.preprocessing import MinMaxScaler

# Rescale each feature to [0, 1] on a copy so df_clean keeps the original units.
df_scaled = df_clean.copy()
scaler = MinMaxScaler()
df_scaled[value_columns] = scaler.fit_transform(df_scaled[value_columns])

## Display Data (Violin Plot)

In [None]:
# Reshape to long form (one row per feature/value pair), which is what seaborn expects
df_melted = df_scaled.melt(value_vars=value_columns, var_name='Category', value_name='Scaled Value')

# Draw one violin per feature on an explicitly created axes
fig, ax = plt.subplots(figsize=(11, 8))
sns.violinplot(data=df_melted, x='Category', y='Scaled Value',
               hue='Category', palette='Set2', legend=False, ax=ax)
ax.set_xlabel('GHS Category', fontsize=12)
ax.set_ylabel('Scaled GHS Value', fontsize=12)
ax.set_title('Distribution of Scaled GHS Values by GHS Category', fontsize=14)

fig.tight_layout()
plt.show()

# Run K-Means for k=1,...,7 and Calculate Elbow Method and Silhouette Scores

In [None]:
# Feature columns fed to K-Means (restated here so this cell reads on its own)
value_columns = ['SURFACE', 'HEIGHT', 'POPULATION', 'VOLUME', 'LAND']
# Work on a copy: a 'cluster' column is added to sample_df later, leaving df_scaled untouched
sample_df = df_scaled.copy()
X = sample_df.loc[:, value_columns]

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import warnings

# Suppress warnings whose message starts with the KMeans memory-leak notice
warnings.filterwarnings("ignore", message="KMeans is known to have a memory leak")

# Candidate cluster counts: k = 1, ..., 7
k_values = list(range(1, 8))

# Per-k results; kmeans_models[i] holds the model fitted with k = i + 1
inertia = []
silhouette_scores = []
kmeans_models = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, init='k-means++', n_init=10, max_iter=100, random_state=42).fit(X)
    kmeans_models.append(kmeans)
    inertia.append(kmeans.inertia_)
    # The silhouette score needs at least two clusters; k = 1 gets a placeholder 0
    # that is skipped when plotting.
    silhouette_scores.append(silhouette_score(X, kmeans.labels_) if k > 1 else 0)

fig, ax1 = plt.subplots(figsize=(8, 4))

# Left axis: inertia (elbow method)
inertia_color = 'tab:blue'
ax1.plot(k_values, inertia, label='Inertia', color=inertia_color)
ax1.set_xlabel('Number of clusters')
ax1.set_ylabel('Inertia', color=inertia_color)
ax1.tick_params(axis='y', labelcolor=inertia_color)

# Right axis: silhouette score, starting at k = 2
silhouette_color = 'tab:red'
ax2 = ax1.twinx()
ax2.plot(k_values[1:], silhouette_scores[1:], label='Silhouette score', color=silhouette_color)
ax2.set_ylabel('Silhouette score', color=silhouette_color)
ax2.tick_params(axis='y', labelcolor=silhouette_color)

for axis in (ax1, ax2):
    axis.grid(False)

fig.legend(loc='upper right', bbox_to_anchor=(0.905, 0.89))
plt.title('Elbow Method and Silhouette Scores for K-Means Clustering')
plt.show()

In [None]:
from yellowbrick.cluster import SilhouetteVisualizer

fig, ax = plt.subplots(3, 2, figsize=(13, 10))

warnings.filterwarnings("ignore", message="X does not have valid feature names")

# ax.flat walks the 3x2 grid row by row, so k = 2 lands top-left and k = 7 bottom-right
for k, panel in zip(range(2, 8), ax.flat):
    # reuse the model fitted earlier for this k (stored at index k - 1)
    visualizer = SilhouetteVisualizer(kmeans_models[k - 1], colors='yellowbrick', ax=panel)
    visualizer.fit(X)

    panel.set_title(f'k = {k}')
    panel.set_xlabel('Silhouette Score')
    panel.set_ylabel('Observations')

plt.tight_layout()
plt.show()

# Visualize Clustering

In [None]:
# One fixed color per cluster. Later cells look colors up as cluster_colors[label - 1]
# for 1-based cluster labels, so this list covers at most five clusters.
cluster_colors = ['chocolate', 'darkcyan', 'darkmagenta', 'firebrick', 'olivedrab']

## Distributions

In [None]:
# Select the model with the desired k. kmeans_models[i] was fitted with k = i + 1, so
# index 0 is the single-cluster model; k = 5 matches the five-color palette and the
# pairplot title used in this notebook.
n_clusters = 5
selected_model = kmeans_models[n_clusters - 1]

# Reuse the labels from the earlier fit on X instead of refitting the model,
# and shift them so cluster labels start from 1 rather than 0.
sample_df['cluster'] = selected_model.labels_ + 1

# Subsets by the 'label' column
samp_unlabeled = sample_df[sample_df['label'] == 'Unlabeled']
samp_labeled = sample_df[sample_df['label'] == 'Labeled']

# Per-cluster counts, ordered by cluster label (1-based)
total_clusters = sample_df['cluster'].value_counts().sort_index()
unlabeled_clusters = samp_unlabeled['cluster'].value_counts().sort_index()
labeled_clusters = samp_labeled['cluster'].value_counts().sort_index()

panels = [
    ('Total Cluster Distribution', total_clusters),
    ('Unlabeled Cluster Distribution', unlabeled_clusters),
    ('Labeled Cluster Distribution', labeled_clusters),
]

fig, axes = plt.subplots(1, 3, figsize=(12, 5))

for panel_ax, (panel_title, counts) in zip(axes, panels):
    panel_ax.set_xlabel('Cluster')
    panel_ax.set_ylabel('Count')
    if counts.empty:
        # Nothing to draw for this subset; keep the titled, empty panel instead of failing
        panel_ax.set_title(panel_title)
        continue
    # Color each bar by its cluster label, not by its position, so a cluster keeps the
    # same color in every panel even when a subset is missing some clusters.
    bar_colors = [cluster_colors[int(label) - 1] for label in counts.index]
    counts.plot(kind='bar', ax=panel_ax, title=panel_title, color=bar_colors)
    panel_ax.set_xlabel('Cluster')
    panel_ax.set_ylabel('Count')
    panel_ax.set_xticklabels(counts.index, rotation=0)  # tick labels are the 1-based cluster ids

plt.tight_layout()
plt.show()

## Pairplot

In [None]:
# Build the palette and the title from the cluster labels actually present, so they
# always agree with the model that was selected. Each label maps to
# cluster_colors[label - 1], the same color it has in the other plots.
present_clusters = sorted(sample_df['cluster'].unique())
pair_palette = {label: cluster_colors[int(label) - 1] for label in present_clusters}

pairplot = sns.pairplot(sample_df, hue='cluster', palette=pair_palette,
                        vars=['SURFACE', 'HEIGHT', 'POPULATION', 'LAND', 'VOLUME'])
# `.figure` is the current accessor for the grid's Figure (`.fig` is the deprecated alias)
pairplot.figure.suptitle(f'Cluster Pairplot for k={len(present_clusters)}', y=1.03, x=0.54)

# Remove the grid lines
for ax in pairplot.axes.flatten():
    ax.grid(False)

plt.tight_layout()
plt.show()

## PCA Visualization in 3D

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# PCA was used below without ever being imported
from sklearn.decomposition import PCA

# Perform PCA to reduce the scaled features to 3 dimensions
pca_3 = PCA(n_components=3)
principal_components_3 = pca_3.fit_transform(sample_df[value_columns])

# Create a new DataFrame with the PCA results and the cluster labels.
# `.values` drops sample_df's index so labels line up with the new frame's rows by position.
pca3_df = pd.DataFrame(data=principal_components_3, columns=['PC1', 'PC2', 'PC3'])
pca3_df['cluster'] = sample_df['cluster'].values

# Map cluster labels to the custom colors (adjusting for 1-based indexing).
# This previously read from an undefined `pca_df`.
colors = [cluster_colors[int(label) - 1] for label in pca3_df['cluster']]

warnings.filterwarnings("ignore", message="Collection without array used. Make sure to ")

# Create a 3D scatter plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(pca3_df['PC1'], pca3_df['PC2'], pca3_df['PC3'], c=colors, s=10, alpha=0.3)

# Create a manual legend, sorted so the entries appear in cluster order rather than
# in order of first appearance in the data
legend_labels = sorted(sample_df['cluster'].unique())
legend_colors = [cluster_colors[int(label) - 1] for label in legend_labels]
patches = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10, label=f'Cluster {label}')
           for label, color in zip(legend_labels, legend_colors)]
ax.legend(handles=patches, title="Clusters")

ax.set_xlabel('Principal Component 1')
ax.set_ylabel('Principal Component 2')
ax.set_zlabel('Principal Component 3')
# NOTE(review): Axes3D.dist was deprecated in newer Matplotlib releases; confirm this
# still affects the camera distance on the installed version.
ax.dist = 11
ax.set_title('PCA of Clusters in 3D')
plt.tight_layout()
plt.show()

# Map Plot

In [None]:
import plotly.express as px

def cluster_geo_scatter(df, colors=None):
    """Plot every row of ``df`` on an OpenStreetMap base map, colored by cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'lat', 'lon' and 'cluster' columns. The frame is not modified.
    colors : list of str, optional
        Color for cluster 1, 2, ... in that order. Defaults to the five colors used
        for clusters elsewhere in this notebook.

    Returns
    -------
    The Plotly figure produced by ``px.scatter_mapbox``.
    """
    if colors is None:
        colors = ['chocolate', 'darkcyan', 'darkmagenta', 'firebrick', 'olivedrab']
    # Keys are strings because the cluster column is cast to str below so that Plotly
    # treats it as a discrete category; every cluster up to len(colors) gets a color.
    color_map = {str(label): color for label, color in enumerate(colors, start=1)}

    # Cast on a copy: converting the caller's column in place would turn the shared
    # frame's labels into strings and break numeric lookups in other cells.
    plot_df = df.assign(cluster=df['cluster'].astype(str))

    fig = px.scatter_mapbox(plot_df, lat="lat", lon="lon",
                            color_discrete_map=color_map,
                            color="cluster", zoom=4, mapbox_style='open-street-map', opacity=0.6)
    fig.update_layout(width=1000, height=1000)
    return fig

cluster_geo_scatter(sample_df)