In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns

from pyproj import Proj, transform

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import DBSCAN

import bokeh
import bokeh.plotting as plotting
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.tile_providers import CARTODBPOSITRON
plotting.output_notebook()

from sklearn import metrics

sns.set_style('whitegrid')

# Problem definition

Cluster regions based on crime data

http://donnees.ville.montreal.qc.ca/dataset/actes-criminels

# Load the data

In [2]:
#input
df = pd.read_csv('data/interventionscitoyendo.csv', encoding='latin_1')
df['DATE'] = pd.to_datetime(df['DATE'])
print(df.columns)
print(df['CATEGORIE'].value_counts())
df.head()

Index(['CATEGORIE', 'DATE', 'QUART', 'PDQ', 'X', 'Y', 'LAT', 'LONG'], dtype='object')
Introduction                        30472
Vol dans / sur véhicule à moteur    24465
Méfait                              20702
Vol de véhicule à moteur            12453
Vols qualifiés                       4913
Infractions entrainant la mort         73
Name: CATEGORIE, dtype: int64


Unnamed: 0,CATEGORIE,DATE,QUART,PDQ,X,Y,LAT,LONG
0,Introduction,2015-01-01,jour,23,302375.197993,5046522.0,45.558606,-73.53106
1,Introduction,2015-01-01,jour,23,302375.197993,5046522.0,45.558606,-73.53106
2,Introduction,2015-01-01,soir,13,295850.656,5031730.0,45.425443,-73.614364
3,Introduction,2015-01-01,nuit,8,289215.072,5036423.0,45.467564,-73.699308
4,Introduction,2015-01-01,soir,44,298915.433995,5046912.0,45.56209,-73.575381


In [3]:
df[(df['X']>0)]

Unnamed: 0,CATEGORIE,DATE,QUART,PDQ,X,Y,LAT,LONG
0,Introduction,2015-01-01,jour,23,302375.197993,5.046522e+06,45.558606,-73.531060
1,Introduction,2015-01-01,jour,23,302375.197993,5.046522e+06,45.558606,-73.531060
2,Introduction,2015-01-01,soir,13,295850.656000,5.031730e+06,45.425443,-73.614364
3,Introduction,2015-01-01,nuit,8,289215.072000,5.036423e+06,45.467564,-73.699308
4,Introduction,2015-01-01,soir,44,298915.433995,5.046912e+06,45.562090,-73.575381
5,Introduction,2015-01-01,soir,15,297494.204005,5.034926e+06,45.454221,-73.593408
6,Introduction,2015-01-01,jour,3,276368.349000,5.041011e+06,45.508444,-73.863862
7,Introduction,2015-01-01,jour,15,299192.515999,5.038122e+06,45.482999,-73.571731
8,Introduction,2015-01-01,jour,42,297210.375006,5.050180e+06,45.591480,-73.597273
9,Introduction,2015-01-01,jour,49,305327.121993,5.059317e+06,45.673742,-73.493234


# Feature Engineering 

In [4]:
# feature engineering

# select a period
df = df[df['DATE']>='2017-01-01']

# select the categories
df = df[df['CATEGORIE']==u'Vols qualifiés']

# remove lines with no location
df = df[(df['X']>0)&(df['Y']>0)]

# adapt X and Y to the visualization
df['X'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), 
                                       Proj(init='epsg:3857'), x['LONG'], x['LAT'])[1], axis=1)
df['Y'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), 
                                       Proj(init='epsg:3857'), x['LONG'], x['LAT'])[0], axis=1)

X_columns = ['X', 'Y']
df = df[X_columns]

# Model Training

In [5]:
model = DBSCAN(eps=3.0, min_samples=4)
model.fit(df[['X', 'Y']])

db=model.fit(df[['X', 'Y']])
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

cluster_labels = model.labels_
n_clusters = len(set(cluster_labels))
print(collections.Counter(cluster_labels))

df['cluster'] = cluster_labels

Estimated number of clusters: 65
Counter({-1: 1360, 14: 17, 30: 14, 3: 11, 29: 11, 58: 10, 11: 9, 21: 9, 24: 9, 26: 9, 5: 8, 13: 8, 2: 7, 4: 7, 7: 7, 15: 7, 33: 7, 45: 7, 60: 7, 64: 7, 1: 6, 18: 6, 32: 6, 36: 6, 37: 6, 39: 6, 41: 6, 44: 6, 52: 6, 9: 5, 10: 5, 17: 5, 22: 5, 27: 5, 28: 5, 31: 5, 35: 5, 42: 5, 43: 5, 46: 5, 54: 5, 0: 4, 6: 4, 8: 4, 12: 4, 16: 4, 19: 4, 20: 4, 23: 4, 25: 4, 34: 4, 38: 4, 40: 4, 47: 4, 48: 4, 49: 4, 50: 4, 51: 4, 53: 4, 55: 4, 56: 4, 57: 4, 59: 4, 61: 4, 62: 4, 63: 4})


In [6]:
df

Unnamed: 0,X,Y,cluster
59270,5.702623e+06,-8.202536e+06,0
59271,5.694535e+06,-8.189453e+06,1
59272,5.702644e+06,-8.189830e+06,2
59273,5.704330e+06,-8.188761e+06,-1
59274,5.700377e+06,-8.190031e+06,-1
59275,5.705695e+06,-8.190365e+06,-1
59276,5.699601e+06,-8.190806e+06,-1
59277,5.706918e+06,-8.188183e+06,-1
59342,5.703881e+06,-8.188084e+06,-1
59343,5.704226e+06,-8.188807e+06,-1


In [7]:
p = figure(y_range=(5641788.0, 5751788.0), x_range=(-8152883, -8252883))
p.add_tile(CARTODBPOSITRON)

latitude  = list(df[df['cluster']>-1]['X'].values)
longitude = list(df[df['cluster']>-1]['Y'].values)

colormap = list(bokeh.palettes.viridis(n_clusters))
colors = [colormap[x] for x in df[df['cluster']>-1]['cluster']]
source = ColumnDataSource(data=dict(longitude=longitude, latitude=latitude))
p.circle(x=longitude, y=latitude, color=colors, fill_alpha=0.2, size=5)
show(p)

# Model Evaluation

In [8]:
# Inter-Cluster
centroids = []
for cluster in sorted(set(model.labels_)):
    centroids.append(df[df['cluster']==cluster][X_columns].mean().values)
distances = []
for c1 in centroids:
    for c2 in centroids:
        distances.append(euclidean_distances(c1.reshape(-1, 1), c2.reshape(-1, 1))[0][0])
print('Inter Cluster distance', np.mean(distances))

# Intra-Cluster
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(-1, 1), v.values.reshape(-1, 1))[0][0])
print('Intra Cluster distance', np.mean(distances))

# Inertia
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(1, -1), v.values.reshape(1, -1), squared=True)[0][0])
print('Inertia', np.sum(distances))

Inter Cluster distance 8624.712333222149
Intra Cluster distance 5196.4774503545705
Inertia 183375163125.64062


In [9]:
print("Silhouette Coef: %0.3f" % metrics.silhouette_score(df[['X', 'Y']],labels))

Silhouette Coef: -0.478
