In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import matplotlib.patheffects as PathEffects
from matplotlib.patches import Circle
import folium
from folium.plugins import MarkerCluster
import random
import colorsys

# Load the data
def load_data(file_path):
    """Read a trip table from an Excel workbook or a CSV file.

    The reader is chosen from the file extension. The extension is compared
    case-insensitively (so ``TRIPS.CSV`` is accepted) and on ``str(file_path)``
    (so ``pathlib.Path`` objects work as well as plain strings).

    Parameters
    ----------
    file_path : str or path-like
        Location of a ``.xlsx`` or ``.csv`` file.

    Returns
    -------
    pandas.DataFrame
        The file contents as returned by ``pd.read_excel`` / ``pd.read_csv``
        with default options.

    Raises
    ------
    ValueError
        If the extension is neither ``.xlsx`` nor ``.csv``.
    """
    # Normalise only for the extension test; pandas receives the original value.
    path_text = str(file_path).lower()

    if path_text.endswith('.xlsx'):
        return pd.read_excel(file_path)
    if path_text.endswith('.csv'):
        return pd.read_csv(file_path)

    raise ValueError("Unsupported file format. Please use .xlsx or .csv")



In [2]:
def extract_stops(df, stop_reference="ad.xlsx"):
    """Build the table of unique bus stops that also appear in a reference list.

    Boarding stops (``ON_STOP_NAMEE``, ``ON_Lat``, ``ON_Long``) and alighting
    stops (``OFF_STOP_NAMEE``, ``OFF_Lat``, ``OFF_Long``) are stacked into one
    ``STOP_NAME`` / ``LAT`` / ``LONG`` table, de-duplicated, and inner-joined
    by name against the reference table so that every stop carries a
    ``STOP_ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Trip records containing the six ON_/OFF_ columns listed above.
    stop_reference : str, path-like or pandas.DataFrame, default "ad.xlsx"
        Reference stop list with at least ``STOP_ID`` and ``STOP_NAMEE``
        columns. A DataFrame is used as-is (and is not modified); anything
        else is passed to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        One row per ``STOP_ID`` with non-null ``LAT`` and ``LONG``, holding
        ``STOP_NAME``, ``LAT``, ``LONG`` plus every column of the reference
        table. Stops whose name has no match in the reference are dropped
        (inner join).
    """
    unified_columns = ['STOP_NAME', 'LAT', 'LONG']

    def _unique_stops(source_columns):
        # Select one side of the trip (boarding or alighting) and rename its
        # columns to the shared STOP_NAME / LAT / LONG schema.
        renamed = df[source_columns].drop_duplicates()
        renamed.columns = unified_columns
        return renamed

    origin_stops = _unique_stops(['ON_STOP_NAMEE', 'ON_Lat', 'ON_Long'])
    dest_stops = _unique_stops(['OFF_STOP_NAMEE', 'OFF_Lat', 'OFF_Long'])

    # A stop used for both boarding and alighting must appear only once.
    all_stops = (
        pd.concat([origin_stops, dest_stops])
        .drop_duplicates()
        .reset_index(drop=True)
    )

    if isinstance(stop_reference, pd.DataFrame):
        reference = stop_reference
    else:
        reference = pd.read_excel(stop_reference)

    # .assign returns a new frame, so neither the caller's DataFrame nor a
    # drop_duplicates() view is written to in place.
    reference = (
        reference
        .drop_duplicates(subset='STOP_ID')
        .assign(STOP_NAME=lambda ref: ref['STOP_NAMEE'])
    )

    matched = pd.merge(all_stops, reference, on='STOP_NAME', how='inner')

    # The same STOP_ID can match several (name, lat, long) rows; keep the first,
    # then drop anything that cannot be placed on a map.
    return (
        matched
        .drop_duplicates(subset='STOP_ID')
        .dropna(subset=['LAT', 'LONG'])
    )

def cluster_stops(stops_df, max_distance_meters=3000, min_samples=1, algorithm='ball_tree', metric='haversine'):
    """
    Cluster bus stops based on geographic proximity.

    Parameters:
    -----------
    stops_df : pandas DataFrame
        DataFrame containing bus stops with LAT and LONG columns

    max_distance_meters : float, default=100
        Maximum distance (in meters) for stops to be considered part of the same cluster.
        Larger values will create fewer, larger clusters.
        Smaller values will create more, smaller clusters.

    min_samples : int, default=1
        The minimum number of samples in a neighborhood for a point to be considered a core point.

    algorithm : {'auto', 'ball_tree', 'kd_tree', 'brute'}, default='ball_tree'
        The algorithm to compute the nearest neighbors.

    metric : str, default='haversine'
        Distance metric to use.

    Returns:
    --------
    pandas DataFrame
        Original DataFrame with an additional 'CLUSTER' column indicating cluster membership
    """
    # Ensure there are no NaN values in the coordinates
    stops_df = stops_df.dropna(subset=['LAT', 'LONG'])

    # Convert distance in meters to approximately equivalent in coordinates
    # This is much more precise than the previous approximation
    # 1 degree of latitude is approximately 111 km (varies slightly with latitude)
    # 1 degree of longitude varies with latitude: 111 km * cos(latitude)

    # Get average latitude to calculate appropriate longitude scaling
    avg_lat_radians = np.radians(stops_df['LAT'].mean())

    # Calculate epsilon in degrees for DBSCAN
    # More precise formula for the given geographic area
    latitude_epsilon = max_distance_meters / 111000  # for latitude
    longitude_epsilon = max_distance_meters / (111000 * np.cos(avg_lat_radians))  # for longitude

    # Use the average of the two as epsilon, or use a more sophisticated approach if needed
    epsilon = (latitude_epsilon + longitude_epsilon) / 2

    # For very small datasets, adjust epsilon to ensure reasonable clustering
    if len(stops_df) < 10:
        # Slightly reduce epsilon to avoid putting everything in one cluster
        epsilon *= 0.5

    # Prepare coordinates for clustering
    coords = stops_df[['LAT', 'LONG']].values

    # Apply DBSCAN clustering
    db = DBSCAN(
        eps=epsilon,
        min_samples=min_samples,
        algorithm=algorithm,
        metric=metric
    ).fit(np.radians(coords))

    # Get cluster labels
    cluster_labels = db.labels_

    # Log clustering results
    num_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    print(f"DBSCAN found {num_clusters} clusters")

    # If all points are in one cluster, try with a smaller epsilon
    if num_clusters == 1 and len(stops_df) > 3:
        print("All points were assigned to a single cluster. Trying with a smaller distance...")
        return cluster_stops(stops_df, max_distance_meters=max_distance_meters/2,
                         min_samples=min_samples, algorithm=algorithm, metric=metric)

    # Add cluster labels to the DataFrame
    stops_df['CLUSTER'] = cluster_labels

    return stops_df



In [9]:
def create_consolidated_stops(clustered_stops):
    """Collapse every cluster of stops into a single consolidated stop.

    For each value of ``CLUSTER`` one output row is produced with:

    * ``CONSOLIDATED_ID``     - the cluster label prefixed with ``C``
    * ``REPRESENTATIVE_STOP`` - the most frequent ``STOP_NAME`` in the cluster
    * ``STOPS_IN_CLUSTER``    - the member ``STOP_ID`` values joined with ``;``
    * ``NUM_STOPS``           - how many rows belong to the cluster
    * ``LAT`` / ``LONG``      - the arithmetic mean of the member coordinates

    The six largest clusters are printed as a quick sanity check.

    Returns
    -------
    tuple
        ``(result_df, clustered_stops)`` - the consolidated table and the
        input frame, passed through unchanged.
    """
    column_order = [
        'CONSOLIDATED_ID',
        'REPRESENTATIVE_STOP',
        'STOPS_IN_CLUSTER',
        'NUM_STOPS',
        'LAT',
        'LONG',
    ]

    # One record per cluster; groupby yields clusters in sorted label order.
    records = []
    for label, members in clustered_stops.groupby('CLUSTER'):
        records.append({
            'CONSOLIDATED_ID': f"C{label}",
            'REPRESENTATIVE_STOP': members['STOP_NAME'].mode()[0],
            'STOPS_IN_CLUSTER': ';'.join(members['STOP_ID'].astype(str)),
            'NUM_STOPS': len(members),
            'LAT': members['LAT'].mean(),
            'LONG': members['LONG'].mean(),
        })

    # Pivot the records into per-column lists so the frame is built the same
    # way regardless of how many clusters there are (including none).
    result_df = pd.DataFrame({
        column: [record[column] for record in records]
        for column in column_order
    })

    largest_clusters = result_df.sort_values(by='NUM_STOPS', ascending=False).head(6)
    print(largest_clusters)

    return result_df, clustered_stops

def generate_distinct_colors(n):
    """Return ``n`` RGB tuples whose hues are evenly spaced around the colour wheel.

    Colour ``k`` has hue ``k / n``. Saturation (0.7 +/- 0.2) and lightness
    (0.5 +/- 0.1) are jittered with the global ``random`` generator, so the
    result is reproducible only if the caller seeds ``random`` beforehand.
    """
    def _jittered_rgb(position):
        # Saturation is drawn before lightness: two draws per colour, in this order.
        sat = 0.7 + random.uniform(-0.2, 0.2)
        light = 0.5 + random.uniform(-0.1, 0.1)
        return colorsys.hls_to_rgb(position / n, light, sat)

    return [_jittered_rgb(position) for position in range(n)]

In [4]:
def visualize_clusters_matplotlib(original_stops, consolidated_stops, output_file='bus_stops_clusters.png'):
    """Save a static scatter plot of the stops, coloured by cluster.

    Parameters
    ----------
    original_stops : pandas.DataFrame
        Individual stops with ``LAT``, ``LONG`` and ``CLUSTER`` columns.
    consolidated_stops : pandas.DataFrame
        One row per cluster with ``CONSOLIDATED_ID`` (``C<label>``),
        ``REPRESENTATIVE_STOP``, ``NUM_STOPS``, ``LAT`` and ``LONG``.
    output_file : str, default 'bus_stops_clusters.png'
        Where the PNG is written (300 dpi).

    Notes
    -----
    Only clusters with more than one stop get a legend entry. Circle radii
    are in coordinate (degree) units and grow with ``NUM_STOPS``.
    """
    fig, ax = plt.subplots(figsize=(15, 10))

    # Close the figure even if plotting fails, so no open figures pile up.
    try:
        unique_clusters = original_stops['CLUSTER'].unique()

        # One colour per cluster, in order of first appearance.
        colors = generate_distinct_colors(len(unique_clusters))
        color_map = dict(zip(unique_clusters, colors))

        # groupby(sort=False) visits clusters in the same first-appearance order
        # as unique(), without re-filtering the whole frame for every cluster.
        for cluster, cluster_points in original_stops.groupby('CLUSTER', sort=False):
            ax.scatter(cluster_points['LONG'], cluster_points['LAT'],
                       color=color_map[cluster], marker='o', s=50,
                       label=f'Cluster {cluster}' if len(cluster_points) > 1 else None)

        # Overlay each cluster centre with a translucent circle and its name.
        for _, row in consolidated_stops.iterrows():
            cluster_id = int(row['CONSOLIDATED_ID'].replace('C', ''))
            circle = Circle((row['LONG'], row['LAT']),
                            0.0005 * (1 + row['NUM_STOPS'] * 0.5),  # Size based on number of stops
                            alpha=0.3,
                            color=color_map[cluster_id])
            ax.add_patch(circle)

            txt = ax.text(row['LONG'], row['LAT'], row['REPRESENTATIVE_STOP'],
                          fontsize=9, ha='center', va='center', fontweight='bold')
            # White outline keeps the label readable on top of the markers.
            txt.set_path_effects([PathEffects.withStroke(linewidth=3, foreground='white')])

        # A legend is only drawn when at least one multi-stop cluster exists.
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend(loc='best')

        ax.set_title('Bus Stops Clustering Results')
        ax.set_xlabel('Longitude')
        ax.set_ylabel('Latitude')
        ax.grid(True, linestyle='--', alpha=0.7)
        fig.tight_layout()

        fig.savefig(output_file, dpi=300)
    finally:
        plt.close(fig)

    print(f"Static visualization saved as '{output_file}'")

In [5]:
def visualize_clusters_interactive(original_stops, consolidated_stops, output_file='bus_stops_interactive_map.html'):
    """Build and save an interactive folium map of stops and their clusters.

    Two toggleable layers are created: "Clusters" (a translucent circle plus a
    centre marker per consolidated stop) and "Original Bus Stops" (one small
    marker per stop). A fixed legend, a layer control, a fullscreen button and
    a distance-measuring tool are added.

    Parameters
    ----------
    original_stops : pandas.DataFrame
        Individual stops with ``STOP_NAME``, ``LAT``, ``LONG`` and ``CLUSTER``.
    consolidated_stops : pandas.DataFrame
        One row per cluster with ``CONSOLIDATED_ID`` (``C<label>``),
        ``REPRESENTATIVE_STOP``, ``STOPS_IN_CLUSTER`` (``;``-separated),
        ``NUM_STOPS``, ``LAT`` and ``LONG``.
    output_file : str, default 'bus_stops_interactive_map.html'
        Where the HTML map is written.

    Returns
    -------
    folium.Map
        The map object that was saved.

    Notes
    -----
    Stop names and IDs are inserted into the popup HTML verbatim (no escaping).
    """
    # Create a map centered at the mean of all stops
    map_center = [original_stops['LAT'].mean(), original_stops['LONG'].mean()]
    mymap = folium.Map(location=map_center, zoom_start=13, tiles='OpenStreetMap')

    # One evenly spaced HSL hue per cluster, in order of first appearance.
    unique_clusters = original_stops['CLUSTER'].unique()
    cluster_colors = {
        cluster_id: f'hsl({int(i / len(unique_clusters) * 360)}, 70%, 50%)'
        for i, cluster_id in enumerate(unique_clusters)
    }

    # First add the cluster circles (so they appear behind the points)
    cluster_group = folium.FeatureGroup(name="Clusters")

    for _, stop in consolidated_stops.iterrows():
        cluster_id = int(stop['CONSOLIDATED_ID'].replace('C', ''))
        color = cluster_colors.get(cluster_id, 'gray')

        # STOPS_IN_CLUSTER is ';'-separated, so split on ';' to get one ID per line.
        stops_listing = stop['STOPS_IN_CLUSTER'].replace(';', '<br>')

        # Create popup content
        popup_text = f"<b>Cluster: {stop['CONSOLIDATED_ID']}</b><br>"
        popup_text += f"Representative Stop: {stop['REPRESENTATIVE_STOP']}<br>"
        popup_text += f"Number of Stops: {stop['NUM_STOPS']}<br>"
        popup_text += f"<hr><b>Stops in this cluster:</b><br>{stops_listing}"

        # Determine circle size based on number of stops (min 50, scales up with more stops)
        circle_radius = max(50, 30 * stop['NUM_STOPS'])

        # Create a larger, transparent circle to show the cluster area
        folium.Circle(
            location=[stop['LAT'], stop['LONG']],
            radius=circle_radius,  # Radius in meters
            popup=folium.Popup(popup_text, max_width=300),
            color=color,
            weight=2,
            fill=True,
            fill_opacity=0.2,
            fill_color=color
        ).add_to(cluster_group)

        # Add a marker for the cluster center
        folium.CircleMarker(
            location=[stop['LAT'], stop['LONG']],
            radius=8,  # Size of circle marker
            popup=folium.Popup(f"<b>Cluster Center:</b><br>{stop['REPRESENTATIVE_STOP']}", max_width=200),
            color=color,
            fill=True,
            fill_opacity=0.8,
            fill_color=color
        ).add_to(cluster_group)

    # Add the cluster group to the map
    cluster_group.add_to(mymap)

    # Create a feature group for original stops
    original_stops_group = folium.FeatureGroup(name="Original Bus Stops")

    # Add markers for original stops
    for _, stop in original_stops.iterrows():
        cluster_id = stop['CLUSTER']
        color = cluster_colors.get(cluster_id, 'blue')

        popup_text = f"<b>Stop:</b> {stop['STOP_NAME']}<br>"
        popup_text += f"<b>Cluster:</b> C{stop['CLUSTER']}<br>"
        popup_text += f"<b>Coordinates:</b> {stop['LAT']:.6f}, {stop['LONG']:.6f}"

        folium.CircleMarker(
            location=[stop['LAT'], stop['LONG']],
            radius=4,  # Smaller than cluster centers
            popup=folium.Popup(popup_text, max_width=200),
            color=color,
            fill=True,
            fill_opacity=0.7,
            fill_color=color,
            weight=1
        ).add_to(original_stops_group)

    # Add the original stops group to the map
    original_stops_group.add_to(mymap)

    # Add custom legend
    legend_html = '''
    <div style="position: fixed;
        bottom: 50px; right: 50px; width: 180px; height: auto;
        background-color: white; border:2px solid grey; z-index:9999;
        font-size:14px; padding: 10px; border-radius: 6px;">
        <p><b>Map Legend:</b></p>
        <p>• <span style="color:#555555">●</span> Original Bus Stops</p>
        <p>• <span style="color:#555555">◯</span> Cluster Boundaries</p>
        <p>• <span style="color:#555555">●</span> Cluster Centers</p>
        <hr style="margin: 5px 0;">
        <p style="font-size: 12px;">Toggle layers using the control panel in the top-right corner</p>
    </div>
    '''
    mymap.get_root().html.add_child(folium.Element(legend_html))

    # Add layer control to toggle between different layers
    folium.LayerControl(position='topright', collapsed=False).add_to(mymap)

    # Add fullscreen button
    folium.plugins.Fullscreen(
        position='topright',
        title='Expand to fullscreen',
        title_cancel='Exit fullscreen',
        force_separate_button=True
    ).add_to(mymap)

    # Add measure tool for distances
    folium.plugins.MeasureControl(
        position='topright',
        primary_length_unit='meters',
        secondary_length_unit='kilometers'
    ).add_to(mymap)

    # Save the interactive map
    mymap.save(output_file)

    print(f"Interactive map saved as '{output_file}'")

    return mymap


In [None]:




def export_to_csv(consolidated_stops, original_stops, output_file='consolidated_bus_stops.csv',
                  original_output_file='original_stops_with_clusters.csv'):
    """Write the consolidated stops and the labelled original stops to CSV.

    Parameters
    ----------
    consolidated_stops : pandas.DataFrame
        One row per cluster; written to ``output_file``.
    original_stops : pandas.DataFrame
        Individual stops with their cluster labels; written to
        ``original_output_file`` for reference.
    output_file : str, default 'consolidated_bus_stops.csv'
        Destination for the consolidated table.
    original_output_file : str, default 'original_stops_with_clusters.csv'
        Destination for the per-stop table.

    Both files are written without the DataFrame index.
    """
    # Export the consolidated stops to CSV
    consolidated_stops.to_csv(output_file, index=False)
    print(f"Consolidated stops exported to {output_file}")

    # Also export the original stops with cluster information for reference
    original_stops.to_csv(original_output_file, index=False)
    print(f"Original stops with cluster information exported to '{original_output_file}'")

def main(file_path='/content/final_with_districts.csv', max_distance=1.7):
    """Run the whole pipeline: load, extract, cluster, consolidate, export, plot.

    Parameters
    ----------
    file_path : str, default '/content/final_with_districts.csv'
        Trip file handed to ``load_data`` (.xlsx or .csv).
    max_distance : float, default 1.7
        Forwarded to ``cluster_stops`` as its ``max_distance_meters`` argument.
        NOTE(review): 1.7 m is far below typical stop spacing - confirm that
        this value matches the unit ``cluster_stops`` actually applies to it.

    Any exception raised along the way is reported (message plus traceback)
    instead of propagating, so a failing run does not abort the caller.
    """
    print(f"Loading data from {file_path}...")
    try:
        # Load data
        df = load_data(file_path)

        # Extract unique stops
        all_stops = extract_stops(df)
        print(f"Found {len(all_stops)} unique bus stops")

        # Cluster stops
        print(f"Clustering stops with a maximum distance of {max_distance} meters...")
        clustered_stops = cluster_stops(all_stops, max_distance)

        # Create consolidated stops
        print("Consolidating stops...")
        consolidated_stops, original_with_clusters = create_consolidated_stops(clustered_stops)

        # Count clusters and stops
        stops_per_cluster = consolidated_stops['NUM_STOPS']
        single_stop_clusters = int((stops_per_cluster == 1).sum())
        multi_stop_clusters = int((stops_per_cluster > 1).sum())

        print(f"Created {len(consolidated_stops)} consolidated stops:")
        print(f" - {single_stop_clusters} single-stop clusters")
        print(f" - {multi_stop_clusters} multi-stop clusters")
        print(f" - Reduced from {len(all_stops)} original stops to {len(consolidated_stops)} consolidated stops")

        # Export to CSV
        export_to_csv(consolidated_stops, original_with_clusters)

        # Generate visualizations
        print("Generating visualizations...")
        visualize_clusters_matplotlib(original_with_clusters, consolidated_stops)
        visualize_clusters_interactive(original_with_clusters, consolidated_stops)

        print("Processing complete!")

    except Exception as e:
        # Keep the best-effort behaviour (no re-raise) but show where it failed;
        # the message alone is rarely enough to locate a pandas/sklearn error.
        import traceback

        print(f"Error: {str(e)}")
        traceback.print_exc()

# Run the pipeline with its default settings when this code is executed
# directly rather than imported.
if __name__ == "__main__":
    main()

Loading data from /content/final_with_districts.csv...
Found 4396 unique bus stops
Clustering stops with a maximum distance of 1.7 meters...
DBSCAN found 2244 clusters
Consolidating stops...
    CONSOLIDATED_ID                                REPRESENTATIVE_STOP  \
222            C222  CENTRAL MARKET/<br>Central Market, Des Voeux R...   
142            C142              THE FAMILY PLANNING ASSOCIATION OF HK   
221            C221            Alexandra House, Des Voeux Road Central   
153            C153                                   ADMIRALTY CENTRE   
492            C492                                     Wah Fu (North)   
127            C127   ARSENAL STREET/<br>Arsenal Street, Hennessy Road   

                                      STOPS_IN_CLUSTER  NUM_STOPS        LAT  \
222  8001;8004;24;127;27;23;163;8002;8010;3273;886;...         30  22.285354   
142  4162;1323;10000119;9025;9026;818;4184;12713;10...         16  22.323344   
221  778;128;21;388;12473;258;647;129;22;589;257;8

  plt.tight_layout()
