In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import MarkerCluster
from sodapy import Socrata

# Set random seed for reproducibility
np.random.seed(42)

# Step 1: Load Real Data via Socrata API
# Initialize Socrata client
# Replace 'your_app_token' with your actual app token from NYC Open Data (optional)
# Get app token from: https://data.cityofnewyork.us/profile/edit/developer_settings
client = Socrata("data.cityofnewyork.us", "MTQfj5W0YCakhn6hZuqrYaVML")  # Use None for no token, or add your token

# Fetch NYC Bus Stop Shelters dataset (ID: qafz-7myz)
results = client.get("qafz-7myz", limit=10000)

# Convert to DataFrame
df = pd.DataFrame.from_records(results)

# Print column names to identify latitude and longitude fields
print("Dataset columns:", df.columns.tolist())

# Step 2: Data Preprocessing
# The dataset may use 'the_geom' (WKT format) or other column names
# Check if 'the_geom' exists and extract coordinates
if 'the_geom' in df.columns:
    # Extract latitude and longitude from 'the_geom' (e.g., POINT (-73.987 40.757))
    df['longitude'] = df['the_geom'].apply(lambda x: float(x.split('(')[1].split(' ')[0]) if pd.notnull(x) else np.nan)
    df['latitude'] = df['the_geom'].apply(lambda x: float(x.split(' ')[1].split(')')[0]) if pd.notnull(x) else np.nan)
else:
    # Adjust these names based on actual columns (e.g., 'lat', 'lon')
    # Update after checking df.columns
    df = df.rename(columns={'lat': 'latitude', 'lon': 'longitude'})  # Example, adjust as needed

# Add synthetic population density as a placeholder (replace with real data if available)
df['population_density'] = np.random.uniform(low=100, high=10000, size=len(df))

# Check for missing values
print("Missing values before cleaning:\n", df.isnull().sum())

# Drop rows with missing latitude or longitude
df = df.dropna(subset=['latitude', 'longitude'])

# Convert latitude and longitude to numeric, handle errors
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# Drop any rows with invalid coordinates
df = df.dropna(subset=['latitude', 'longitude'])
print("Missing values after cleaning:\n", df.isnull().sum())
print("Dataset shape:", df.shape)

# Step 3: Normalize Features
features = ['latitude', 'longitude', 'population_density']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[features])

# Step 4: Determine Optimal Number of Clusters (Elbow Method)
inertia = []
silhouette_scores = []
K = range(2, 11)

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)
    if k >= 2:  # Silhouette score requires at least 2 clusters
        score = silhouette_score(X_scaled, kmeans.labels_)
        silhouette_scores.append(score)

# Plot Elbow Curve
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')

# Plot Silhouette Scores
plt.subplot(1, 2, 2)
plt.plot(K[1:], silhouette_scores[1:], 'bo-')  # Align K[1:] (3-10) with silhouette_scores
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score vs k')
plt.tight_layout()
plt.show()

# Step 5: Train K-means Model (Choose k=5 based on elbow/silhouette analysis)
optimal_k = 5
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

# Step 6: Evaluate Clustering
sil_score = silhouette_score(X_scaled, df['cluster'])
print(f"Silhouette Score for k={optimal_k}: {sil_score:.4f}")

# Step 7: Visualize Clusters on Map
# Create a map centered on NYC
nyc_map = folium.Map(location=[40.7128, -74.0060], zoom_start=11)

# Add clustered points
marker_cluster = MarkerCluster().add_to(nyc_map)
for idx, row in df.iterrows():
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        popup=f"Cluster {row['cluster']}, Pop. Density: {row['population_density']:.0f}",
        icon=folium.Icon(color=['red', 'blue', 'green', 'purple', 'orange'][int(row['cluster'])])
    ).add_to(marker_cluster)

# Save map
nyc_map.save('nyc_bus_stops_map.html')
print("Map saved as 'nyc_bus_stops_map.html'. Open in a browser to view.")

# Step 8: Visualize Clusters in 2D
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='longitude', y='latitude', hue='cluster', size='population_density',
                palette='deep', sizes=(20, 200))
plt.title('Clustered Bus Stop Locations by Population Density')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()

# Step 9: Save Results
df.to_csv('clustered_bus_stops.csv', index=False)
print("Clustered data saved to 'clustered_bus_stops.csv'.")

# Step 10: Summary Statistics by Cluster
cluster_summary = df.groupby('cluster').agg({
    'latitude': ['mean', 'count'],
    'longitude': 'mean',
    'population_density': 'mean'
}).round(2)
print("Cluster Summary:\n", cluster_summary)



Dataset columns: []
Missing values before cleaning:
 population_density    0
dtype: int64


KeyError: ['latitude', 'longitude']