In [11]:
"""
This notebook walks through the process of clustering the combined data.
"""

import pandas as pd
import umap
import matplotlib.pyplot as plt
import hdbscan
import numpy
import geopandas as gpd
import numpy as np

from sklearn.preprocessing import StandardScaler

In [8]:
# Load the merged dataset that the previous notebook saved as a pickle.
# (Pickle files should only ever be loaded from a trusted source.)
merged_pickle = "data/merged_data_2022-07-08_to_2025-07-07.pkl"
df = pd.read_pickle(merged_pickle)

# Preview the first rows to confirm the load worked
print(df.head())

  dispatch_date       dispatch_time            address_block        lat  \
0    2025-05-02 2025-07-08 23:44:00      1100 BLOCK S 4TH ST  39.934248   
2    2025-04-23 2025-07-08 18:32:00     1700 BLOCK N 32ND ST  39.983588   
3    2025-04-24 2025-07-08 00:47:00  1300 BLOCK W Venango St  40.007425   
4    2025-02-26 2025-07-08 20:16:00      400 BLOCK N 35TH ST  39.961642   
5    2025-04-13 2025-07-08 03:15:00      5100 BLOCK N 5TH ST  40.029310   

         lon  district_01  district_02  district_03  district_05  district_06  \
0 -75.150833        False        False         True        False        False   
2 -75.186199        False        False        False        False        False   
3 -75.149843        False        False        False        False        False   
4 -75.192723        False        False        False        False        False   
5 -75.132056        False        False        False        False        False   

   ...  max_temp_f  min_temp_f  pop_total  income_median  medi

The first significant step is to scale our data, as clustering is based on distance. Scaling prevents
features with naturally larger ranges from taking higher importance/precedence when clustered.

In [None]:
# Build the scaler; set_output returns the estimator itself, so the calls can
# be chained. With transform='pandas' the scaled result comes back as a DataFrame.
scaler = StandardScaler().set_output(transform='pandas')

# Every numeric column is a candidate for scaling (np.number excludes the
# boolean columns, so those are left untouched).
numeric_cols = list(df.select_dtypes(include=np.number).columns)

# NOTE: the sinusoidal features are skipped on purpose, as they are already
# in the desired range.
cols_to_scale = [
    name for name in numeric_cols
    if not ('_sin' in name or '_cos' in name)
]

# Learn the per-column mean / standard deviation, then overwrite the selected
# columns of df with their standardized values.
scaler.fit(df[cols_to_scale])
df[cols_to_scale] = scaler.transform(df[cols_to_scale])

# Sanity check: preview the scaled data
print(df.head())

  dispatch_date       dispatch_time            address_block       lat  \
0    2025-05-02 2025-07-08 23:44:00      1100 BLOCK S 4TH ST -1.407757   
2    2025-04-23 2025-07-08 18:32:00     1700 BLOCK N 32ND ST -0.323392   
3    2025-04-24 2025-07-08 00:47:00  1300 BLOCK W Venango St  0.200477   
4    2025-02-26 2025-07-08 20:16:00      400 BLOCK N 35TH ST -0.805715   
5    2025-04-13 2025-07-08 03:15:00      5100 BLOCK N 5TH ST  0.681454   

        lon  district_01  district_02  district_03  district_05  district_06  \
0 -0.066541        False        False         True        False        False   
2 -0.628440        False        False        False        False        False   
3 -0.050807        False        False        False        False        False   
4 -0.732081        False        False        False        False        False   
5  0.231798        False        False        False        False        False   

   ...  max_temp_f  min_temp_f  pop_total  income_median  median_age  \
0 

Now the data is ready for UMAP, a non-linear dimensionality reduction algorithm based on
topology (which allows it to capture non-linear patterns). We use it because clustering our
data directly with over 80 columns can easily run into the curse of dimensionality,
resulting in much higher computational time and overfitting.