In [8]:
import folium
import hdbscan
import numpy as np

def find_clusters(coordinates, min_cluster_size):
    X = np.radians(coordinates)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='haversine')
    labels = clusterer.fit_predict(X)
    return labels

def display_clusters_on_map(coordinates, labels):
    map_center = [np.mean([lat for lat, lon in coordinates]), np.mean([lon for lat, lon in coordinates])]
    m = folium.Map(location=map_center, zoom_start=10)

    colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'darkblue', 'lightblue', 'darkgreen',
              'lightgreen', 'gray', 'black', 'pink', 'darkpurple', 'lightpurple']

    for coord, label in zip(coordinates, labels):
        folium.CircleMarker(
            location=[coord[0], coord[1]],
            radius=5,
            color=colors[label % len(colors)],
            fill=True,
            fill_color=colors[label % len(colors)],
            fill_opacity=0.7,
            popup=f'Cluster {label}'
        ).add_to(m)

    display(m)

coordinates = [
    (37.758057, -122.43541),  # San Francisco
    (37.795388, -122.422453),  
    (40.745224, -73.978297),  # New York
    (40.755319, -73.993114),
    (40.75926, -73.98986),
    (40.765823, -73.987169),
    (40.745224, -73.978297),  # New York
    (40.755319, -73.993114),
    (40.75926, -73.98986),
    (40.765823, -73.987169)
    
]

min_cluster_size = 5  

labels = find_clusters(coordinates, min_cluster_size)

print(find_clusters(coordinates, min_cluster_size))

display_clusters_on_map(coordinates, labels)

[-1 -1 -1 -1 -1 -1 -1 -1 -1 -1]


In [14]:
import folium
import hdbscan
import numpy as np
from collections import Counter

def find_clusters(coordinates, min_cluster_size):
    """
    Find clusters in the given coordinates using HDBSCAN.
    
    Parameters:
        coordinates (list): List of tuples containing latitude and longitude.
        min_cluster_size (int): The minimum size of clusters.

    Returns:
        labels (np.array): Cluster labels for each point.
    """
    X = np.radians(coordinates)
    print('X ', X)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='haversine')
    print(clusterer)
    labels = clusterer.fit_predict(X)
    print('labels', labels)
    return labels

def get_largest_cluster_location(coordinates, labels):
    """
    Get the location (latitude, longitude) of the largest cluster.
    
    Parameters:
        coordinates (list): List of tuples containing latitude and longitude.
        labels (np.array): Cluster labels for each point.

    Returns:
        largest_cluster_location (tuple): Latitude and longitude of the largest cluster's centroid.
    """
    cluster_sizes = Counter(labels)
    largest_cluster_label = max(cluster_sizes, key=cluster_sizes.get)
    
    # Get points belonging to the largest cluster
    largest_cluster_points = [coord for coord, label in zip(coordinates, labels) if label == largest_cluster_label]

    # Calculate the centroid of the largest cluster
    largest_cluster_location = (
        np.mean([lat for lat, lon in largest_cluster_points]),
        np.mean([lon for lat, lon in largest_cluster_points])
    )
    
    return largest_cluster_location

def display_clusters_on_map(coordinates, labels, largest_cluster_location):
    """
    Display clusters on a map using Folium and highlight the largest cluster.
    
    Parameters:
        coordinates (list): List of tuples containing latitude and longitude.
        labels (np.array): Cluster labels for each point.
        largest_cluster_location (tuple): Latitude and longitude of the largest cluster's centroid.
    """
    map_center = [np.mean([lat for lat, lon in coordinates]), np.mean([lon for lat, lon in coordinates])]
    m = folium.Map(location=map_center, zoom_start=10)

    # Create a color palette for the clusters
    colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'darkblue', 'lightblue', 'darkgreen',
              'lightgreen', 'gray', 'black', 'pink', 'darkpurple', 'lightpurple']

    for coord, label in zip(coordinates, labels):
        folium.CircleMarker(
            location=[coord[0], coord[1]],
            radius=5,
            color=colors[label % len(colors)],
            fill=True,
            fill_color=colors[label % len(colors)],
            fill_opacity=0.7,
            popup=f'Cluster {label}'
        ).add_to(m)

    # Highlight the largest cluster
    folium.Marker(
        location=[largest_cluster_location[0], largest_cluster_location[1]],
        icon=folium.Icon(color='green'),
        popup='Largest Cluster'
    ).add_to(m)

    display(m)

coordinates = [
    (37.758057, -122.43541),  # San Francisco
    (37.795388, -122.422453),  
    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side
 
]
min_cluster_size = 5  # Adjust based on your data

# Find clusters
labels = find_clusters(coordinates, min_cluster_size)

# Get the location of the largest cluster
largest_cluster_location = get_largest_cluster_location(coordinates, labels)

print(largest_cluster_location)

# Display clusters on a map, highlighting the largest cluster
display_clusters_on_map(coordinates, labels, largest_cluster_location)

X  [[ 0.65900241 -2.13690103]
 [ 0.65965396 -2.13667488]
 [ 0.71057241 -1.29164837]
 [ 0.711377   -1.29128359]
 [ 0.71045373 -1.29169724]
 [ 0.71120073 -1.29129406]
 [ 0.71088831 -1.29132024]
 [ 0.71179588 -1.29093976]
 [ 0.71072251 -1.29164837]
 [ 0.71120945 -1.29138657]
 [ 0.71137875 -1.29127312]
 [ 0.71119374 -1.29129406]
 [ 0.711473   -1.29108462]
 [ 0.71156375 -1.29085599]
 [ 0.7113613  -1.29129057]
 [ 0.71066666 -1.29167804]
 [ 0.71165626 -1.29123298]
 [ 0.71107855 -1.29136562]
 [ 0.71120771 -1.29140053]
 [ 0.71137177 -1.29117189]
 [ 0.71113964 -1.29152968]
 [ 0.711658   -1.29095896]
 [ 0.71057241 -1.29164837]
 [ 0.711377   -1.29128359]
 [ 0.71045373 -1.29169724]
 [ 0.71120073 -1.29129406]
 [ 0.71088831 -1.29132024]
 [ 0.71179588 -1.29093976]
 [ 0.71072251 -1.29164837]
 [ 0.71120945 -1.29138657]
 [ 0.71137875 -1.29127312]
 [ 0.71119374 -1.29129406]
 [ 0.711473   -1.29108462]
 [ 0.71156375 -1.29085599]
 [ 0.7113613  -1.29129057]
 [ 0.71066666 -1.29167804]
 [ 0.71165626 -1.29123298

In [1]:
import folium
import hdbscan
import numpy as np
from scipy.spatial.distance import pdist, squareform
from geopy.distance import geodesic

def haversine(coord1, coord2):
    """
    Calculate the haversine distance between two coordinates in kilometers.
    """
    return geodesic(coord1, coord2).kilometers

def custom_metric(X, Y, **kwargs):
    """
    Custom metric considering haversine distance with minimum geographic size.
    """
    distances = pdist(np.radians(np.vstack([X, Y])), haversine)
    return squareform(distances)

def find_clusters(coordinates, min_cluster_size, min_geographic_size_km):
    X = np.radians(coordinates)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric=custom_metric, alpha=min_geographic_size_km)
    labels = clusterer.fit_predict(X)
    return labels

def display_clusters_on_map(coordinates, labels):
    map_center = [np.mean([lat for lat, lon in coordinates]), np.mean([lon for lat, lon in coordinates])]
    m = folium.Map(location=map_center, zoom_start=10)

    # Create a color palette for the clusters
    colors = ['red', 'blue', 'green', 'purple', 'orange', 'darkred', 'lightred', 'darkblue', 'lightblue', 'darkgreen',
              'lightgreen', 'gray', 'black', 'pink', 'darkpurple', 'lightpurple']

    for coord, label in zip(coordinates, labels):
        folium.CircleMarker(
            location=[coord[0], coord[1]],
            radius=5,
            color=colors[label % len(colors)],
            fill=True,
            fill_color=colors[label % len(colors)],
            fill_opacity=0.7,
            popup=f'Cluster {label}'
        ).add_to(m)

    display(m)

# Example usage
coordinates = [
    (37.758057, -122.43541),  # San Francisco
    (37.795388, -122.422453),  
    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side

    (40.7128, -74.0060),  # New York City
    (40.7589, -73.9851),  # Times Square
    (40.7060, -74.0088),  # Battery Park
    (40.7488, -73.9857),  # Grand Central Terminal
    (40.7309, -73.9872),  # Union Square
    (40.7829, -73.9654),  # Central Park
    (40.7214, -74.0060),  # Greenwich Village
    (40.7493, -73.9910),  # Rockefeller Center
    (40.7590, -73.9845),  # Bryant Park
    (40.7484, -73.9857),  # Chrysler Building
    (40.7644, -73.9737),  # Sutton Place
    (40.7696, -73.9606),  # Yorkville
    (40.7580, -73.9855),  # Theater District
    (40.7182, -74.0077),  # Tribeca
    (40.7749, -73.9822),  # Columbus Circle
    (40.7418, -73.9898),  # SoHo
    (40.7492, -73.9918),  # Midtown Manhattan
    (40.7586, -73.9787),  # United Nations Headquarters
    (40.7453, -73.9992),  # Chelsea
    (40.7750, -73.9665),  # Upper East Side
]

min_cluster_size = 2  # Adjust based on your data
min_geographic_size_km = 50  # Adjust based on your desired minimum geographic size

# Find clusters
labels = find_clusters(coordinates, min_cluster_size, min_geographic_size_km)

# Display clusters on a map
display_clusters_on_map(coordinates, labels)


ValueError: Alpha must be a positive float value greater than 0!