In [11]:
import pandas as pd 
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import folium
from geojson import Feature, FeatureCollection, Point
import json
from scipy.spatial import ConvexHull, convex_hull_plot_2d
from folium import IFrame
from pyproj import Proj
from shapely.geometry import Polygon

In [12]:
# Load the input tables from CSV (paths are relative to the notebook's working directory).
# bikeData: the records to cluster; runCluster reads its LATITUDE, LONGITUDE and STREETSEGID columns.
bikeData = pd.read_csv("data/bikes.csv")
# streets: looked up by STREETSEGID in runCluster to obtain a segment's FACILITY value.
streets = pd.read_csv("data/streets.csv")
# streetSegs: looked up by STREETSEGID in runCluster to obtain a segment's REGISTEREDNAME.
streetSegs = pd.read_csv("data/streetSegs.csv")

In [13]:
def _streetSegInfo(segId, streetSegsDf, streetsDf):
    """Resolve one street-segment id to a (street name, facility type, segment id) triple.

    Parameters
    ----------
    segId : number
        STREETSEGID value to resolve; it is converted with ``int()`` before
        being compared against the lookup tables.
    streetSegsDf : pandas.DataFrame
        Table with ``STREETSEGID`` and ``REGISTEREDNAME`` columns.
    streetsDf : pandas.DataFrame
        Table with ``STREETSEGID`` and ``FACILITY`` columns.

    Returns
    -------
    tuple
        ``(name, facility, segId)`` taken from the first matching row of each
        table. Missing pieces are reported with the string ``"None"``:

        * no row in ``streetSegsDf`` (or a name that is literally ``"None"``)
          -> ``("None", "None", "None")``;
        * a name but no row in ``streetsDf`` -> ``(name, "None", "None")``
          (the segment id is only reported when a facility row exists).
    """
    noInfo = ("None", "None", "None")
    segKey = int(segId)

    nameMatches = streetSegsDf.loc[streetSegsDf["STREETSEGID"] == segKey, "REGISTEREDNAME"]
    if len(nameMatches) == 0:
        # Unknown segment: report the sentinel instead of failing on an empty lookup.
        return noInfo
    streetName = nameMatches.iloc[0]
    if isinstance(streetName, str) and streetName == "None":
        # A street called "None" is indistinguishable from "no street", so the
        # facility and id are blanked as well.
        return noInfo

    facilityMatches = streetsDf.loc[streetsDf["STREETSEGID"] == segKey, "FACILITY"]
    if len(facilityMatches) == 0:
        return (streetName, "None", "None")
    return (streetName, facilityMatches.iloc[0], segId)


def runCluster(dataset, eps, minSamples):
    """Cluster rows by coordinate with DBSCAN and tag each row with its cluster's top streets.

    The ``LATITUDE``/``LONGITUDE`` columns are passed to DBSCAN as-is (no
    scaling or projection), so ``eps`` is expressed in the same units as those
    columns. For every resulting label, the two most frequent positive
    ``STREETSEGID`` values among that label's rows are resolved to a street
    name and a facility type using the module-level ``streetSegs`` and
    ``streets`` DataFrames.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``LATITUDE``, ``LONGITUDE`` and ``STREETSEGID`` columns.
        It is modified in place.
    eps : float
        DBSCAN ``eps`` (neighbourhood radius).
    minSamples : int
        DBSCAN ``min_samples``.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with these columns added or overwritten:
        ``CLUSTER``, ``CLUSTERPRIMARYSTREET``, ``CLUSTERSECONDARYSTREET``,
        ``CLUSTERprimaryFACILITYTYPE``, ``CLUSTERSECONDARYFACILITYTYPE``,
        ``CLUSTERPRIMARYSTREETSEGID``, ``CLUSTERSECONDARYSTREETSEGID``.
        Unavailable values are the string ``"None"`` (not Python ``None``).

    Notes
    -----
    * Depends on the globals ``streetSegs`` and ``streets`` being defined
      before the call.
    * The mixed-case column name ``CLUSTERprimaryFACILITYTYPE`` is kept
      unchanged because it is part of the returned schema.
    * NOTE(review): DBSCAN labels noise points ``-1``; they are summarised
      here exactly like a real cluster, so every noise row receives the most
      common streets among all noise rows. Confirm this is intended or filter
      ``CLUSTER == -1`` downstream.
    """
    # Build the (n, 2) coordinate matrix in one step; the row order matches the dataset.
    accPoints = dataset[['LATITUDE', 'LONGITUDE']].to_numpy(dtype=float)

    # A single fit_predict both fits the model and returns the labels.
    dataset['CLUSTER'] = DBSCAN(eps=eps, min_samples=minSamples).fit_predict(accPoints)

    noInfo = ("None", "None", "None")
    primaryInfo = {}
    secondaryInfo = {}
    for label in dataset['CLUSTER'].unique():
        # Rows with a missing or non-positive STREETSEGID do not take part in the ranking.
        members = dataset[(dataset["CLUSTER"] == label) & (dataset["STREETSEGID"] > 0)]
        # value_counts() sorts by descending frequency, so the first two are the top streets.
        topSegIds = members['STREETSEGID'].value_counts().index[:2]

        if len(topSegIds) > 0:
            primaryInfo[label] = _streetSegInfo(topSegIds[0], streetSegs, streets)
        else:
            primaryInfo[label] = noInfo
        if len(topSegIds) > 1:
            secondaryInfo[label] = _streetSegInfo(topSegIds[1], streetSegs, streets)
        else:
            secondaryInfo[label] = noInfo

    # Broadcast the per-cluster results back to every row. The columns are
    # assigned in the historical order so the resulting column layout is unchanged.
    rowLabels = list(dataset["CLUSTER"])
    dataset['CLUSTERPRIMARYSTREET'] = [primaryInfo[label][0] for label in rowLabels]
    dataset['CLUSTERSECONDARYSTREET'] = [secondaryInfo[label][0] for label in rowLabels]
    dataset['CLUSTERprimaryFACILITYTYPE'] = [primaryInfo[label][1] for label in rowLabels]
    dataset['CLUSTERSECONDARYFACILITYTYPE'] = [secondaryInfo[label][1] for label in rowLabels]
    dataset['CLUSTERPRIMARYSTREETSEGID'] = [primaryInfo[label][2] for label in rowLabels]
    dataset['CLUSTERSECONDARYSTREETSEGID'] = [secondaryInfo[label][2] for label in rowLabels]
    return dataset

In [14]:
# Preview the first rows of bikeData as loaded, before runCluster adds its CLUSTER* columns.
bikeData.head()

Unnamed: 0,TOTAL_BICYCLES,TOTAL_VEHICLES,TOTAL_PEDESTRIANS,LATITUDE,LONGITUDE,XCOORD,YCOORD,FATAL_BICYCLIST,MPDLATITUDE,MPDLONGITUDE,FROMDATE,STREETSEGID,ROUTEID,NEARESTINTSTREETNAME,NEARESTINTROUTEID,OFFINTERSECTION
0,1,1,0,38.909638,-77.043438,396232.503,137976.022,0,38.909402,-77.042662,2017-04-26 04:00:00+00:00,164.0,11059602,DUPONT CIR NW,11031202,3.74
1,1,1,0,38.900315,-77.021915,398099.004,136940.459,0,38.900315,-77.021947,2017-09-04 04:00:00+00:00,12836.0,11000702,H ST NW,11042442,55.71
2,1,1,0,38.920942,-77.022284,398067.533,139230.192,0,38.920769,-77.022258,2017-06-17 04:00:00+00:00,13081.0,11040042,BARRY PL NW,11013652,19.46
3,1,1,0,38.926518,-77.032418,397188.99,139849.461,0,38.92652,-77.032383,2017-10-25 04:00:00+00:00,3720.0,11001402,HARVARD ST NW,11043742,4.6
4,1,1,0,38.962703,-77.0164,398578.642,143865.924,0,38.962615,-77.016162,2019-06-04 04:00:00+00:00,8198.0,11000302,PEABODY ST NW,11069682,9.71


In [15]:
# Run DBSCAN with eps=0.0015 and min_samples=25.
# eps is in the raw LATITUDE/LONGITUDE units, because runCluster feeds those columns to DBSCAN unscaled.
# runCluster writes its columns into bikeData and returns that same object, so
# clusteredData and bikeData refer to one DataFrame after this cell runs.
clusteredData = runCluster(bikeData, 0.0015, 25)

In [16]:
# Preview the clustered result, including the CLUSTER label and the per-cluster street columns.
clusteredData.head()

Unnamed: 0,TOTAL_BICYCLES,TOTAL_VEHICLES,TOTAL_PEDESTRIANS,LATITUDE,LONGITUDE,XCOORD,YCOORD,FATAL_BICYCLIST,MPDLATITUDE,MPDLONGITUDE,...,NEARESTINTSTREETNAME,NEARESTINTROUTEID,OFFINTERSECTION,CLUSTER,CLUSTERPRIMARYSTREET,CLUSTERSECONDARYSTREET,CLUSTERprimaryFACILITYTYPE,CLUSTERSECONDARYFACILITYTYPE,CLUSTERPRIMARYSTREETSEGID,CLUSTERSECONDARYSTREETSEGID
0,1,1,0,38.909638,-77.043438,396232.503,137976.022,0,38.909402,-77.042662,...,DUPONT CIR NW,11031202,3.74,-1,14TH,29TH,Existing Bike Lane,,763.0,
1,1,1,0,38.900315,-77.021915,398099.004,136940.459,0,38.900315,-77.021947,...,H ST NW,11042442,55.71,2,7TH,H,Existing Bike Lane,,12493.0,
2,1,1,0,38.920942,-77.022284,398067.533,139230.192,0,38.920769,-77.022258,...,BARRY PL NW,11013652,19.46,-1,14TH,29TH,Existing Bike Lane,,763.0,
3,1,1,0,38.926518,-77.032418,397188.99,139849.461,0,38.92652,-77.032383,...,HARVARD ST NW,11043742,4.6,-1,14TH,29TH,Existing Bike Lane,,763.0,
4,1,1,0,38.962703,-77.0164,398578.642,143865.924,0,38.962615,-77.016162,...,PEABODY ST NW,11069682,9.71,-1,14TH,29TH,Existing Bike Lane,,763.0,
