Generate 1 random point for every 100 jobs in each block group (US) or dissemination area (Canada), then apply HDBSCAN to identify clusters in each city. Finally, select the largest/most central cluster in each city to be its downtown. Do this for the US and Canada.

### Install/load packages

In [None]:
# pip install folium matplotlib mapclassify 
# import sys
# !{sys.executable} -m pip install contextily

In [None]:
import pandas as pd
import geopandas as gpd
import folium
import matplotlib
import mapclassify
import numpy as np
import shapely
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import hdbscan
import contextily as cx

### Combine US and Canada data

In [None]:
# Machine-specific folder that holds the input/output files. File names are
# appended to this string by plain concatenation, so keep the trailing slash.
data_path = "C:/Users/jpg23/data/downtownrecovery/lehd_new_downtowns/"

In [None]:
# canada = gpd.read_file(data_path + "canada_DA_jobs.geojson") # Canada
# us_bg = gpd.read_file(data_path + "cities_lehd_jobs_blockgroup.geojson") # US - block group

In [None]:
# canada.head()

In [None]:
# canada_sf = gpd.read_file(data_path + "reprojected_canada_DA.geojson")[['DAUID', 'geometry']]

In [None]:
# canada_sf.head()

In [None]:
# canada.id.nunique()

In [None]:
# canada_sf.DAUID.nunique()

In [None]:
# canada_newgeom = canada_sf.merge(canada.drop('geometry', axis=1), left_on='DAUID', right_on='id', how='inner').drop(columns = ['DAUID'])

In [None]:
# canada_newgeom.head()

In [None]:
# canada_newgeom.id.nunique()

In [None]:
# us_bg.head()

In [None]:
# canada_newgeom.crs

In [None]:
# us_bg.crs

In [None]:
# # reproject so they're in the same CRS
# us_bg_reproj = us_bg.to_crs('EPSG:4326')

In [None]:
# us_bg_reproj.crs

In [None]:
# us_bg_reproj.crs == canada_newgeom.crs

In [None]:
# # gut check - are jobs_per_sq_meter relatively similar for US & Canada?
# print(canada_newgeom.jobs_per_sq_meter.mean())
# us_bg_reproj.jobs_per_sq_meter.mean()

In [None]:
# print(canada_newgeom.jobs_per_sq_meter.median())
# us_bg_reproj.jobs_per_sq_meter.median()

In [None]:
# # stack the datasets
# canada_final = canada_newgeom.rename(columns={"CMANAME": "place"})[['id', 'total_jobs', 'jobs_per_sq_meter', 'place', 'geometry']]

# us_bg_final = us_bg_reproj.rename(columns={"block_group":"id", "city":"place"})[['id', 'total_jobs', 'jobs_per_sq_meter', 'place', 'geometry']]

In [None]:
# df = pd.concat([us_bg_final, canada_final])

In [None]:
# df.head()

In [None]:
# df.place.unique()

In [None]:
# df.crs

In [None]:
# # Drop the Quebec part of Ottawa
# df = df[df['place']!='Ottawa - Gatineau (partie du Québec / Quebec part)']

In [None]:
# df.place.unique()

In [None]:
# type(df)

In [None]:
# df.plot()

In [None]:
# # Look at Tampa as an example
# tampa = df[df['place']=='Tampa FL']

In [None]:
# tampa.explore(
#     tooltip="id",
#     tiles="CartoDB positron",
#     style_kwds=dict(color="black")
# )

### In each block group/dissemination area, randomly scatter 1 point for every 100 jobs

In [None]:
# # scatter random points in polygon

# def Random_Points_in_Bounds(polygon, number):   
#     minx, miny, maxx, maxy = polygon.bounds
#     x = np.random.uniform(minx, maxx, number*4)
#     y = np.random.uniform(miny, maxy, number*4)
#     gdf_poly = gpd.GeoDataFrame(index=["myPoly"], geometry=[polygon])
#     df = pd.DataFrame()
#     df['points'] = list(zip(x,y))
#     df['points'] = df['points'].apply(shapely.geometry.Point)
#     gdf_points = gpd.GeoDataFrame(df, geometry='points')
#     Sjoin = gpd.sjoin(gdf_points, gdf_poly, op="within", how='left')
#     pnts_in_poly = gdf_points[Sjoin.index_right=='myPoly']
#     return pnts_in_poly['points'].tolist()[0:number]

In [None]:
# # divide # of total jobs by 100
# df['jobs_hundreds'] = round((df['total_jobs']/100), 0)

# # filter to only block groups / dissemination areas where there are at least 100 jobs
# df_100 = df[df['jobs_hundreds']>0].copy()

# # create randomly scattered points in each block group / dissemination area
# df_100['points'] = df_100[['geometry', 'jobs_hundreds']].apply(lambda x: Random_Points_in_Bounds(x[0], int(x[1])), axis=1)

In [None]:
# df_100.head()

In [None]:
# # create a row for each set of points
# df_100_points = df_100.explode('points')

In [None]:
# df_100_points.head()

In [None]:
# type(df_100_points)

In [None]:
# # initialize format for dbscan
# df_dbscan = gpd.GeoDataFrame(df_100_points[['id', 'points', 'jobs_per_sq_meter', 'place']].copy(), geometry='points')
# df_dbscan['point_lon'] = df_dbscan['points'].x
# df_dbscan['point_lat'] = df_dbscan['points'].y

In [None]:
# df_dbscan.head()

In [None]:
# df_dbscan.to_file(data_path + 'blockgroup_random_pts.geojson', driver='GeoJSON')  

In [None]:
# Load the random job points written out by the (now commented-out) export step above.
df_dbscan = gpd.read_file(f"{data_path}blockgroup_random_pts.geojson")

### Apply HDBSCAN to determine clusters in each city

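See the [scikit-learn HDBSCAN documentation](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.HDBSCAN.html) and the [`hdbscan` package documentation on parameter selection](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html). Note that the code below uses the standalone `hdbscan` package, not the scikit-learn implementation.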

In [None]:
# Keep only rows that have a point longitude. The explicit copy makes this an
# independent frame, so adding the 'cluster' column later does not write into
# a slice of df_dbscan (which can trigger pandas' SettingWithCopyWarning).
df_dbscan_notnull = df_dbscan[df_dbscan['point_lon'].notna()].copy()

In [None]:
# share of rows with a non-missing point longitude
# (a fraction between 0 and 1, not a percentage)
len(df_dbscan_notnull) / len(df_dbscan)

In [None]:
# Standardize the longitude and latitude columns (each column scaled
# independently); the result X is what HDBSCAN is fitted on below.
X = StandardScaler().fit_transform(
    df_dbscan_notnull.loc[:, ['point_lon', 'point_lat']]
)

In [None]:
type(X)

In [None]:
X

In [None]:
# Baseline clusterer: only the minimum cluster size is set explicitly.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=25,
    core_dist_n_jobs=1,
)

In [None]:
clusterer.fit(X)

In [None]:
# Number of clusters found. Cluster labels start at 0 and -1 marks
# unclustered (noise) points, so the largest label is one less than the
# number of clusters.

np.max(clusterer.labels_) + 1

In [None]:
# append results to original database
# (X was built from df_dbscan_notnull, so the fitted labels are assigned to
# its rows as a new 'cluster' column; -1 marks points that were not clustered)

df_dbscan_notnull['cluster'] = clusterer.labels_
df_dbscan_notnull.head()

In [None]:
df_dbscan_notnull.cluster.unique()

In [None]:
df_dbscan_notnull.head()

### Explore points in specific cities

In [None]:
def explore_pts(city_name):
    """Return an interactive map of every random job point in one city.

    city_name is compared for exact equality against the 'place' column of
    df_dbscan_notnull.
    """
    in_city = df_dbscan_notnull['place'] == city_name
    city_points = df_dbscan_notnull[in_city].set_crs(4326)

    # Static alternative, kept for reference:
    #     ax = city_points.plot(figsize=(9, 9), alpha=0.5)
    #     cx.add_basemap(ax, source=cx.providers.CartoDB.Positron, crs=4326)

    point_style = dict(opacity=.5, fillOpacity=.5)
    return city_points.explore(tiles="CartoDB positron", style_kwds=point_style)

In [None]:
explore_pts('San Francisco CA')

In [None]:
explore_pts('Nashville-Davidson metropolitan government (balance) TN')

In [None]:
explore_pts('Portland OR')

### Explore clusters in specific cities

In [None]:
# Drop unclustered points (label -1) and tag the remainder as EPSG:4326.
clusters_only = df_dbscan_notnull.loc[df_dbscan_notnull["cluster"].ne(-1)].set_crs(4326)
# Store labels as strings (presumably so the maps colour them as categories).
clusters_only['cluster'] = clusters_only['cluster'].astype(str)

In [None]:
def explore_clusters(city_name):
    """Return an interactive map of the baseline clusters in one city.

    Rows of clusters_only whose 'place' equals city_name are drawn, coloured
    by their 'cluster' label. For other colour maps see
    https://matplotlib.org/stable/users/explain/colors/colormaps.html
    """
    city_mask = clusters_only['place'] == city_name
    point_style = dict(opacity=.5, fillOpacity=.5)
    return clusters_only[city_mask].explore(
        column="cluster",
        tiles="CartoDB positron",
        style_kwds=point_style,
        cmap='Spectral',
    )

In [None]:
explore_clusters('San Francisco CA')

In [None]:
explore_clusters('Toronto')

In [None]:
explore_clusters('Calgary')

In [None]:
explore_clusters('Albuquerque NM')

In [None]:
explore_clusters('New York NY')

In [None]:
explore_clusters('Philadelphia PA')

In [None]:
explore_clusters('Nashville-Davidson metropolitan government (balance) TN')

### Tune HDBSCAN parameters

See [this link](https://hdbscan.readthedocs.io/en/latest/parameter_selection.html) and [this Stack Overflow post](https://stackoverflow.com/questions/67898039/hdbscan-difference-between-parameters) for a helpful explanation of how the parameters work.

In [None]:
def create_clusters_tuned(which_min_cluster_size, which_min_samples, which_cluster_selection_epsilon):
    """Re-run HDBSCAN on the standardized points X with the given parameters.

    Parameters
    ----------
    which_min_cluster_size
        Passed to HDBSCAN as ``min_cluster_size``.
    which_min_samples
        Passed to HDBSCAN as ``min_samples``.
    which_cluster_selection_epsilon
        Passed to HDBSCAN as ``cluster_selection_epsilon``. The model is
        fitted on X, so this distance is in standardized units, not degrees.

    Returns
    -------
    A copy of the rows of df_dbscan_notnull that were assigned to a cluster
    (label != -1), with CRS set to EPSG:4326 and 'cluster' stored as str.
    """
    clusterer_tuned = hdbscan.HDBSCAN(
        min_cluster_size=which_min_cluster_size,
        min_samples=which_min_samples,
        cluster_selection_epsilon=which_cluster_selection_epsilon,
        core_dist_n_jobs=1,
    )
    clusterer_tuned.fit(X)

    # Work on a copy. Plain assignment would only alias df_dbscan_notnull, so
    # every tuning run would overwrite the baseline 'cluster' column of the
    # shared frame.
    df_dbscan_notnull_tuned = df_dbscan_notnull.copy()
    df_dbscan_notnull_tuned['cluster'] = clusterer_tuned.labels_

    # Filter out non-clustered points and set CRS
    is_clustered = df_dbscan_notnull_tuned["cluster"] != -1
    clusters_only_tuned = df_dbscan_notnull_tuned[is_clustered].set_crs(4326)
    clusters_only_tuned['cluster'] = clusters_only_tuned['cluster'].astype(str)

    return clusters_only_tuned

In [None]:
def explore_clusters_tuned(tuned_df, city_name):
    """Return an interactive map of one city's clusters from a tuning run.

    tuned_df is a frame as returned by create_clusters_tuned; rows whose
    'place' equals city_name are drawn, coloured by their 'cluster' label.
    For other colour maps see
    https://matplotlib.org/stable/users/explain/colors/colormaps.html
    """
    city_rows = tuned_df[tuned_df['place'] == city_name]
    point_style = dict(opacity=.5, fillOpacity=.5)
    return city_rows.explore(
        column="cluster",
        tiles="CartoDB positron",
        style_kwds=point_style,
        cmap='Spectral',
    )

In [None]:
# Tuning run: min_cluster_size 100, min_samples 25, cluster_selection_epsilon 0
df_100_25_0 = create_clusters_tuned(
    which_min_cluster_size=100,
    which_min_samples=25,
    which_cluster_selection_epsilon=0,
)

In [None]:
explore_clusters_tuned(df_100_25_0, 'San Francisco CA')

In [None]:
# Tuning run: min_cluster_size 100, min_samples 100, cluster_selection_epsilon 0
df_100_100_0 = create_clusters_tuned(
    which_min_cluster_size=100,
    which_min_samples=100,
    which_cluster_selection_epsilon=0,
)

In [None]:
explore_clusters_tuned(df_100_100_0, 'San Francisco CA')

In [None]:
# Tuning run: min_cluster_size 100, min_samples 100, cluster_selection_epsilon 0.00005
df_100_100_5 = create_clusters_tuned(
    which_min_cluster_size=100,
    which_min_samples=100,
    which_cluster_selection_epsilon=.00005,
)

In [None]:
explore_clusters_tuned(df_100_100_5, 'San Francisco CA')

In [None]:
# Tuning run: min_cluster_size 50, min_samples 50, cluster_selection_epsilon 0.00005
df_50_50_5 = create_clusters_tuned(
    which_min_cluster_size=50,
    which_min_samples=50,
    which_cluster_selection_epsilon=.00005,
)

In [None]:
explore_clusters_tuned(df_50_50_5, 'San Francisco CA')

### Choose largest/most central cluster in each city for downtown

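Map block groups/dissemination areas to clusters. (Open question: how should this be done? The checks below look at whether a single block group/dissemination area can contain points from more than one cluster.)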

In [None]:
# clusters_only['count'] = 1
# grouped_clust = clusters_only.groupby(['id','cluster'])['count'].sum().reset_index()

In [None]:
# # Merge the count data back into the original DataFrame
# merged_df = pd.merge(clusters_only.drop(columns=['count']), grouped_clust, on=['id', 'cluster'], how='left')
# merged_df.head()

In [None]:
# # are there IDs with multiple clusters?
# multiple_clust = merged_df.join(merged_df.groupby('id')['cluster'].nunique(), on='id', rsuffix='_r').sort_values(by=['cluster_r', 'id'], ascending=False)
# multiple_clust.head()

In [None]:
# # look at one in the US
# multiple_clust[multiple_clust['place'].str.contains('\s\w{2}$')].head()

**Remember to select two downtowns in the Toronto CMA — one for Toronto and one for Mississauga!**

### Combine all downtown polygons and export as a single shapefile