In [1]:
"""
This notebook takes the clustering result from the previous notebook and visualizes the clusters,
alongside performing spatial hotspot analysis.
"""

import folium
import geopandas as gpd
import matplotlib.colors
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from folium.plugins import MarkerCluster


In [2]:
# Load the experimental data generated from previous notebooks
# NOTE(review): unpickling can execute arbitrary code, so these files should only ever be
# ones produced locally by the earlier notebooks - never pickles from an untrusted source.
# df_clustered: presumably the cluster-labeled crimes for the date range in the file name - TODO confirm
df_clustered = pd.read_pickle("experimental_data/labeled_merged_data_2022-07-21_to_2025-07-20.pkl")
# recent_crime: presumably the crimes for the single date in the file name - TODO confirm
recent_crime = pd.read_pickle("experimental_data/merged_data_2025-07-18.pkl")

Now that the data has been clustered, I want to make a map to visualize all the hard work and 
analysis done. This will involve three parts:
1. Plotting the clusters from the previous notebook, where I performed UMAP/HDBSCAN clustering.
2. Overlaying recent crimes
3. Performing some hotspot analysis on the clustered data (without cluster labels).

Effectively, I want each to act as a separate layer; combined, they will provide a comprehensive 
analysis of the crime distribution in Philadelphia (at least, up to the date when this data was
collected and clustered). Of course, plotting recent crimes will let anyone view the spatial
distribution of clusters. Plotting the clusters of crimes will help highlight any particular 
patterns that emerged. However, these patterns may not be statistically significant, so the hotspot
analysis aims to highlight which chunks of Philadelphia have statistically significant
concentrations of crime.

These three layers complement one another. For example, one could identify an area as a proper 
hotspot, and then perhaps see two clusters of crime in that hotspot. Finally, recent crimes can help
highlight the most recent observations and confirm the pattern is still active.

The first step is just plotting the recent crimes.

In [12]:
# Extract crime type from the OHE'd columns.
# The derived `crime_type` column also starts with 'crime_', so it is excluded explicitly;
# otherwise re-running this cell would treat the label column as one more indicator and
# coerce it to zeros.
crime_type_cols = [
    col for col in recent_crime.columns
    if col.startswith('crime_') and col != 'crime_type'
]
# Coerce each indicator to numeric; unparseable or missing values count as "not set" (0)
for col in crime_type_cols:
    recent_crime[col] = pd.to_numeric(recent_crime[col], errors='coerce').fillna(0)

# Use idxmax() for a fast, vectorized operation to find the crime type.
# idxmax() returns the first column for a row whose indicators are all 0, which would
# silently mislabel that row, so rows with no indicator set are labeled 'crime_unknown'.
crime_indicators = recent_crime[crime_type_cols]
recent_crime['crime_type'] = crime_indicators.idxmax(axis=1).where(
    crime_indicators.gt(0).any(axis=1), 'crime_unknown'
)

# Give every distinct crime type its own hex color, sampled from matplotlib's 'tab20'
# colormap in the order the types first appear in the data
unique_types = recent_crime['crime_type'].unique()
cmap = plt.get_cmap('tab20', len(unique_types))
hex_colors = [matplotlib.colors.rgb2hex(cmap(idx)) for idx in range(len(unique_types))]
color_map = dict(zip(unique_types, hex_colors))

# Load Philadelphia boundary GeoJSON (fetched over the network on every run)
philly_boundary_url = "https://raw.githubusercontent.com/blackmad/neighborhoods/master/philadelphia.geojson"
philly_gdf = gpd.read_file(philly_boundary_url)

# Get the bounding box of Philadelphia to lock the map view.
# total_bounds is (minx, miny, maxx, maxy), i.e. (lon, lat, lon, lat); cast to plain
# floats so the values render cleanly in the generated map.
min_lon, min_lat, max_lon, max_lat = (float(bound) for bound in philly_gdf.total_bounds)
map_bounds = [[min_lat, min_lon], [max_lat, max_lon]]

# Create Folium map of crime, centered at mean lat/lon.
# folium's `max_bounds` is only a boolean switch: the limits themselves come from
# min_lat/max_lat/min_lon/max_lon. Passing the bounds list as `max_bounds` merely enabled
# the default whole-world limits, so the view was never restricted to Philadelphia.
m_crime = folium.Map(
    location=[recent_crime['lat'].mean(), recent_crime['lon'].mean()],
    zoom_start=12,
    min_zoom=12,
    max_bounds=True,
    min_lat=min_lat,
    max_lat=max_lat,
    min_lon=min_lon,
    max_lon=max_lon,
)

# Add the Philadelphia boundary outline to the map: geometry only, drawn as an unfilled
# black line so it does not obscure the crime markers
folium.GeoJson(
    philly_gdf[['geometry']],
    style_function=lambda x: {'color': 'black', 'weight': 2, 'fillOpacity': 0.0},
    name="Philadelphia Boundary"
).add_to(m_crime)

# Two layers hold the same crime points: a MarkerCluster layer for the aggregated view and
# a plain FeatureGroup where every point is drawn individually.
# NOTE: This should reflect yesterday's crime in the actual production version
aggregated_view = MarkerCluster(name="Crime Type (Aggregated)").add_to(m_crime)
detailed_view = folium.FeatureGroup(name="Crime Type").add_to(m_crime)

# Draw one colored circle per crime in each layer (aggregated first, then detailed)
for _, crime in recent_crime.iterrows():
    type_color = color_map[crime['crime_type']]
    # The popup shows the crime type without its 'crime_' column prefix
    type_label = crime['crime_type'].replace('crime_', '')

    for layer in (aggregated_view, detailed_view):
        folium.CircleMarker(
            location=[crime['lat'], crime['lon']],
            radius=5,
            color=type_color,
            fill=True,
            fill_color=type_color,
            fill_opacity=0.7,
            popup=type_label,
        ).add_to(layer)

# Layer control lets the viewer toggle the layers added to the map so far
folium.LayerControl().add_to(m_crime)

# Build a fixed-position HTML legend: an opening wrapper, one row per crime type, and the
# closing tags. The inner div scrolls when the rows overflow the box.
legend_html_start = '''
     <div style="position: fixed; 
     bottom: 50px; left: 50px; width: 250px; height: 400px; 
     border:2px solid grey; z-index:9999; font-size:14px;
     background-color:white; padding: 10px;">
     <b>Crime Type Legend</b><br>
     <div style="height: 90%; overflow-y: auto;">
     '''
legend_html_end = '''
     </div>
     </div>
     '''

# One row per crime type: a colored dot followed by the type name without its 'crime_' prefix
legend_rows = []
for crime_type, color in color_map.items():
    clean_name = crime_type.replace('crime_', '')
    legend_rows.append(
        f'&nbsp; <i class="fa fa-circle" style="color:{color}"></i> &nbsp; {clean_name}<br>'
    )
legend_items = "".join(legend_rows)

# Attach the assembled legend to the map's root HTML so it floats above the map
full_legend_html = legend_html_start + legend_items + legend_html_end
m_crime.get_root().html.add_child(folium.Element(full_legend_html))

# Save as html
# m_crime.save("layered_crime_map.html")

# Last expression of the cell: renders the interactive map inline
m_crime

TODO: Add a layer for clusters and perform hotspot analysis, with a layer for that as well. The
clusters will probably need a different shape or outline to distinguish them from the recent
crimes.