In [1]:
# import the library
%matplotlib inline
import pandas as pd
import numpy as np
import collections

from pyproj import Proj, transform
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

import bokeh
from bokeh.plotting import figure, show, ColumnDataSource, output_notebook
from bokeh.tile_providers import get_provider, Vendors


In [2]:
#Manage data
#read the file
df = pd.read_csv('Bixi_data/Stations/Stations_2019.csv', encoding='latin_1')
print(df.columns)

df.apply(lambda x: pd.to_numeric(x['longitude'], errors='ignore'), axis = 1)
print(df.head())
df.dtypes


Index(['Code', 'name', 'latitude', 'longitude'], dtype='object')
    Code                                     name   latitude  longitude
0  10002  MÃ©tro Charlevoix (Centre / Charlevoix)  45.478228 -73.569651
1   4000                   Jeanne-d'Arc / Ontario  45.549598 -73.541874
2   4001                      Graham / Brookfield  45.520075 -73.629776
3   4002                       Graham / Wicksteed  45.516937 -73.640483
4   5002                 St-Charles / Montarville  45.533682 -73.515261


Code           int64
name          object
latitude     float64
longitude    float64
dtype: object

In [3]:
#Conversion

#Converts from longitude,latitude to native map projection x,y coordinates
# adapt X and Y to the visualization
inProj = Proj(init='epsg:3857')
outProj = Proj(init='epsg:4326')

#make new dataframe with X and Y columns and conversion
df['X'] = df.apply(lambda x: transform(outProj, inProj, x['longitude'], x['latitude'])[1], axis=1)
df['Y'] = df.apply(lambda x: transform(outProj, inProj, x['longitude'], x['latitude'])[0], axis=1)

X_columns = ['X', 'Y']
df = df[X_columns]

df.head()


  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))
  return _prepare_from_string(" ".join(pjargs))
  projstring = _prepare_from_string(" ".join((projstring, projkwargs)))


Unnamed: 0,X,Y
0,5697126.0,-8189736.0
1,5708463.0,-8186644.0
2,5703772.0,-8196429.0
3,5703273.0,-8197621.0
4,5705934.0,-8183681.0


In [4]:
#Training Data

#number of clusters
#create cluster column
k = 10
model = KMeans(n_clusters=k).fit(df.values)

print(set(model.labels_))
print(collections.Counter(model.labels_))

df['cluster'] = model.labels_
print(df['cluster'].head())



{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
Counter({1: 122, 3: 113, 6: 106, 8: 84, 4: 69, 0: 38, 9: 37, 5: 23, 2: 19, 7: 8})
0    3
1    4
2    8
3    0
4    6
Name: cluster, dtype: int32


In [5]:
#plotting the latitude and longitude coordinates on geographical map

k = 10
p = figure(y_range=(5641788.0, 5751788.0), x_range=(-8152883, -8252883))
p.add_tile(get_provider(Vendors.CARTODBPOSITRON_RETINA))

#take the coordiantes as list
#get the colors palettes for the positive number of clusters
#plots the colors
latitude  = list(df[df['cluster']>-1]['X'].values)
longitude = list(df[df['cluster']>-1]['Y'].values)
colormap = list(bokeh.palettes.plasma(k))
colors = [colormap[x] for x in df[df['cluster']>-1]['cluster']]
p.circle(x=longitude, y=latitude, color=colors, fill_alpha=0.2, size=7)
output_notebook()
show(p)

In [6]:
# Model Evaluation

# Inter-Cluster
centroids = []

#checks if the cluster exist in dataframe
#get the avg for the x and y coordinates seperately
#add the coordiantes to centroid array
#calculate the euclidean distance between each centroid (different cluster)
for cluster in sorted(set(model.labels_)):
    centroids.append(df[df['cluster']==cluster][X_columns].mean().values)
distances = []
for c1 in centroids:
    for c2 in centroids:
        distances.append(euclidean_distances(c1.reshape(-1, 1), c2.reshape(-1, 1))[0][0])

#average of the distances
print('Inter Cluster distance', np.mean(distances))

# Intra-Cluster
#filters through the cluster
#calculate distance between the centroid and all the other coordiantes (same cluster)
distances = []
for cluster in sorted(set(model.labels_)):
    filter = df[df['cluster']==cluster]
    centroid = filter[X_columns].mean().values
    for k, v in filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(-1, 1), v.values.reshape(-1, 1))[0][0])
        
#average of the distances
print('Intra Cluster distance', np.mean(distances))

Inter Cluster distance 8650.307489751856
Intra Cluster distance 1417.682426089967
