In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns

from pyproj import Proj, transform

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import DBSCAN

import bokeh
import bokeh.plotting as plotting
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.tile_providers import CARTODBPOSITRON
plotting.output_notebook()

sns.set_style('whitegrid')



# Problem definition

Cluster regions based on exterminations

# Load the data

In [2]:
# Load the data
df_base = pd.read_csv('declarations-exterminations-punaises-de-lit-1.csv')

In [3]:
print(df_base.columns)
print('')
print(df_base.dtypes)

Index(['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_PRIOR_INSP', 'EXT_FREQ',
       'DATE_FIRST_EXT', 'DATE_LAST_EXT', 'HOOD_NUM', 'HOOD_NAME', 'BORO_NAME',
       'MTM8_X', 'MTM8_Y', 'LONGITUDE', 'LATITUDE', 'LONG_LAT', 'MTM_X_Y',
       'DEC_MONTH', 'DEC_ISSUE', 'DATE_DIFF'],
      dtype='object')

NO_DECLARATION        int64
DATE_DECLARATION     object
DATE_PRIOR_INSP      object
EXT_FREQ            float64
DATE_FIRST_EXT       object
DATE_LAST_EXT        object
HOOD_NUM             object
HOOD_NAME            object
BORO_NAME            object
MTM8_X              float64
MTM8_Y              float64
LONGITUDE           float64
LATITUDE            float64
LONG_LAT             object
MTM_X_Y              object
DEC_MONTH             int64
DEC_ISSUE             int64
DATE_DIFF           float64
dtype: object


In [4]:
df_base['DATE_DECLARATION'] = pd.to_datetime(df_base['DATE_DECLARATION'])

# Feature Engineering

In [6]:
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS/blob/master/week7/clustering-dbscan-map.ipynb

# select a period
df = df_base[df_base['DATE_DECLARATION']>='2018-12-01']

# # remove lines with no location
df = df[(df['MTM8_X']>0)&(df['MTM8_Y']>0)]

# adapt X and Y to the visualization
df['X'] = df.apply(lambda x: transform(Proj(init='epsg:4326'),
                                       Proj(init='epsg:3857'), x['LONGITUDE'], x['LATITUDE'])[1], axis=1)

df['Y'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), 
                                       Proj(init='epsg:3857'), x['LONGITUDE'], x['LATITUDE'])[0], axis=1)

X_columns = ['X', 'Y']
df = df[X_columns]

In [7]:
df

Unnamed: 0,X,Y
457,5.706864e+06,-8.186702e+06
458,5.698034e+06,-8.190617e+06
884,5.707005e+06,-8.196105e+06
1373,5.707575e+06,-8.194833e+06
1376,5.711786e+06,-8.187659e+06
1380,5.692014e+06,-8.202311e+06
1386,5.707454e+06,-8.195545e+06
1854,5.718054e+06,-8.185781e+06
1861,5.699145e+06,-8.191181e+06
1863,5.728003e+06,-8.181268e+06


# Model Training

In [8]:
model = DBSCAN(eps=1.0, min_samples=100)
model.fit(df[['X', 'Y']])

cluster_labels = model.labels_
n_clusters = len(set(cluster_labels))
print(collections.Counter(cluster_labels))

df['cluster'] = cluster_labels

Counter({-1: 277})


In [9]:
df

Unnamed: 0,X,Y,cluster
457,5.706864e+06,-8.186702e+06,-1
458,5.698034e+06,-8.190617e+06,-1
884,5.707005e+06,-8.196105e+06,-1
1373,5.707575e+06,-8.194833e+06,-1
1376,5.711786e+06,-8.187659e+06,-1
1380,5.692014e+06,-8.202311e+06,-1
1386,5.707454e+06,-8.195545e+06,-1
1854,5.718054e+06,-8.185781e+06,-1
1861,5.699145e+06,-8.191181e+06,-1
1863,5.728003e+06,-8.181268e+06,-1


In [10]:
p = figure(y_range=(5641788.0, 5751788.0), x_range=(-8152883, -8252883))
p.add_tile(CARTODBPOSITRON)

latitude  = list(df[df['cluster']>-1]['X'].values)
longitude = list(df[df['cluster']>-1]['Y'].values)

colormap = list(bokeh.palettes.viridis(n_clusters))
colors = [colormap[x] for x in df[df['cluster']>-1]['cluster']]
source = ColumnDataSource(data=dict(longitude=longitude, latitude=latitude))
p.circle(x=longitude, y=latitude, color=colors, fill_alpha=0.2, size=5)
show(p)

# Model Evaluation

In [11]:
# Inter-Cluster
centroids = []
for cluster in sorted(set(model.labels_)):
    centroids.append(df[df['cluster']==cluster][X_columns].mean().values)
distances = []
for c1 in centroids:
    for c2 in centroids:
        distances.append(euclidean_distances(c1.reshape(-1, 1), c2.reshape(-1, 1))[0][0])
print('Inter Cluster distance', np.mean(distances))

# Intra-Cluster
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(-1, 1), v.values.reshape(-1, 1))[0][0])
print('Intra Cluster distance', np.mean(distances))

# Inertia
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(1, -1), v.values.reshape(1, -1), squared=True)[0][0])
print('Inertia', np.sum(distances))



Inter Cluster distance 0.0
Intra Cluster distance 5096.625891149231
Inertia 20742637496.046875


# Describe clusters

Describe your clusters (write a description/summary for each one of the clusters on the notebook)