# Prerequisite

In [26]:
import pandas as pd
import folium
from scipy.spatial.distance import cdist

# Load datasets into dataframe

In [18]:
bus_stops = pd.read_csv('data/Bus_Stops_Geocoded.csv')
employee_addresses = pd.read_csv('data/Employee_Addresses_Geocoded.csv')

In [19]:
bus_stops.head()

Unnamed: 0.1,Unnamed: 0,Street_One,Street_Two,address,address2,address3,longitude,latitude
0,0,MISSION ST,ITALY AVE,"MISSION ST and ITALY AVE , San Francisco, CA","ITALY AVE and MISSION ST, San Francisco, CA","ITALY AVE , San Francisco, CA",-122.439211,37.718696
1,1,MISSION ST,NEW MONTGOMERY ST,"MISSION ST and NEW MONTGOMERY ST , San Francis...","NEW MONTGOMERY ST and MISSION ST, San Francisc...","NEW MONTGOMERY ST , San Francisco, CA",-122.400821,37.787704
2,2,MISSION ST,01ST ST,"MISSION ST and 01ST ST , San Francisco, CA","01ST ST and MISSION ST, San Francisco, CA","01ST ST , San Francisco, CA",-122.397352,37.78947
3,3,MISSION ST,20TH ST,"MISSION ST and 20TH ST , San Francisco, CA","20TH ST and MISSION ST, San Francisco, CA","20TH ST , San Francisco, CA",-122.418988,37.758812
4,4,MISSION ST,FREMONT ST,"MISSION ST and FREMONT ST , San Francisco, CA","FREMONT ST and MISSION ST, San Francisco, CA","FREMONT ST , San Francisco, CA",-122.396762,37.790306


In [20]:
employee_addresses.head()

Unnamed: 0.1,Unnamed: 0,address,employee_id,longitude,latitude
0,0,"98 Edinburgh St, San Francisco, CA 94112, USA",206,-122.427311,37.727605
1,1,"237 Accacia St, Daly City, CA 94014, USA",2081,-122.415915,37.704391
2,2,"1835 Folsom St, San Francisco, CA 94103, USA",178,-122.415454,37.76795
3,3,"170 Cambridge St, San Francisco, CA 94134, USA",50,-122.419539,37.729672
4,4,"16 Roanoke St, San Francisco, CA 94131, USA",1863,-122.431277,37.736422


# Overview of the datasets

<img src="geocoding.png">

# Clustering addresses (Zoning)

In [21]:
coords = employee_addresses.as_matrix(columns=['latitude', 'longitude']) # should be employee address
kmeans = KMeans(n_clusters=10)
kmeans.fit(coords)

  """Entry point for launching an IPython kernel.


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=10, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [22]:
# Get centroids
employee_addresses_centroids = kmeans.cluster_centers_

In [23]:
employee_addresses_centroids

array([[  37.71230231, -122.44239652],
       [  37.76941676, -122.41325679],
       [  37.79238025, -122.44363024],
       [  37.74088998, -122.4261256 ],
       [  37.73568261, -122.40337384],
       [  37.70984887, -122.41246354],
       [  37.78271404, -122.39831101],
       [  37.74409401, -122.44901411],
       [  37.72818816, -122.42872848],
       [  37.76819869, -122.42986726]])

# Folium map

In [24]:
m = folium.Map(location=[37.75, -122.4], zoom_start=13)

# Employee address centroid markers
for i in range(10):
    folium.Marker([employee_addresses_centroids[i, 0], employee_addresses_centroids[i, 1]]).add_to(m)
    
display(m)

<img src="clustering.png">

# Key optimization metric

In reality, we should use a metric related to walk score or walk distance. Google Map API can provide that. As an alternative, we are using Manhattan distance for now.

In [27]:
A = np.column_stack([bus_stops['longitude'], bus_stops['latitude']])
B = np.column_stack([employee_addresses_centroids[:, 1], employee_addresses_centroids[:, 0]])
distance = cdist(A, B, metric='cityblock')
distance_sum = np.sum(distance, 1)

In [42]:
np.shape(distance)

(119, 10)

# Best 10 stops

The stops sorted by least sum distances are considered as the best 10 stops.

In [40]:
np.shape(distance)
index_10 = np.argmin(distance, 0)

#distance_sort = np.sort(distance_sum)
#index_10 = np.argsort(distance_sum)[0:10]
bus_stops.iloc[index_10, :]

Unnamed: 0.1,Unnamed: 0,Street_One,Street_Two,address,address2,address3,longitude,latitude
67,67,MISSION ST,CONCORD ST,"MISSION ST and CONCORD ST , San Francisco, CA","CONCORD ST and MISSION ST, San Francisco, CA","CONCORD ST , San Francisco, CA",-122.442172,37.710944
69,69,MISSION ST,10TH ST,"MISSION ST and 10TH ST , San Francisco, CA","10TH ST and MISSION ST, San Francisco, CA","10TH ST , San Francisco, CA",-122.408071,37.768955
34,34,MISSION ST,17TH ST,"MISSION ST and 17TH ST , San Francisco, CA","17TH ST and MISSION ST, San Francisco, CA","17TH ST , San Francisco, CA",-122.447702,37.761736
111,111,MISSION ST,BROOK ST,"MISSION ST and BROOK ST , San Francisco, CA","BROOK ST and MISSION ST, San Francisco, CA","BROOK ST , San Francisco, CA",-122.423451,37.740685
51,51,MISSION ST,EUGENIA AVE,"MISSION ST and EUGENIA AVE , San Francisco, CA","EUGENIA AVE and MISSION ST, San Francisco, CA","EUGENIA AVE , San Francisco, CA",-122.415787,37.740412
53,53,MISSION ST,FRANCE AVE,"MISSION ST and FRANCE AVE , San Francisco, CA","FRANCE AVE and MISSION ST, San Francisco, CA","FRANCE AVE , San Francisco, CA",-122.43067,37.71644
73,73,MISSION ST,SHAW ALY,"MISSION ST and SHAW ALY , San Francisco, CA","SHAW ALY and MISSION ST, San Francisco, CA","SHAW ALY , San Francisco, CA",-122.397562,37.78819
85,85,MISSION ST,25TH ST,"MISSION ST and 25TH ST , San Francisco, CA","25TH ST and MISSION ST, San Francisco, CA","25TH ST , San Francisco, CA",-122.440573,37.749297
17,17,MISSION ST,TRUMBULL ST,"MISSION ST and TRUMBULL ST , San Francisco, CA","TRUMBULL ST and MISSION ST, San Francisco, CA","TRUMBULL ST , San Francisco, CA",-122.429318,37.730611
18,18,MISSION ST,15TH ST,"MISSION ST and 15TH ST , San Francisco, CA","15TH ST and MISSION ST, San Francisco, CA","15TH ST , San Francisco, CA",-122.431055,37.766005


# Folium map for best 10 stops

In [44]:
m = folium.Map(location=[37.75, -122.4], zoom_start=13)

# Bus stops markers
for i in range(0,len(bus_stops)):
    folium.Marker([bus_stops.iloc[i]['latitude'], bus_stops.iloc[i]['longitude']]).add_to(m)

# Bus stops markers
for i in range(10):
    folium.Marker([bus_stops.iloc[index_10[i]]['latitude'], bus_stops.iloc[index_10[i]]['longitude']], icon = folium.Icon(color='red')).add_to(m)
    
# Employee address centroid markers
for i in range(10):
    folium.Marker([employee_addresses_centroids[i, 0], employee_addresses_centroids[i, 1]], icon = folium.Icon(color='green')).add_to(m)
    
m.save('results_final.html')
display(m)

<img src="results_final.png">

Blue points are the bus stop points, green points are the clustered employee addresses points, and red points are the selected best 10 stops. The red selected points are deviated from the Mission St due to OpenStreetMap search engine error. Better Map API (e.g. Google Map API) should fix the problem.