In [1]:
# import the library
%matplotlib inline

import pandas as pd
import numpy as np
import collections
import matplotlib.pyplot as plt
import seaborn as sns

from pyproj import Proj, transform

from sklearn.metrics.pairwise import euclidean_distances
from sklearn.cluster import DBSCAN

from sklearn import metrics

import bokeh
import bokeh.plotting as plotting
from bokeh.plotting import figure, show, ColumnDataSource
from bokeh.tile_providers import CARTODBPOSITRON
plotting.output_notebook()

sns.set_style('whitegrid')

# For distrance calculations between long and lat intersections
import math # from haversine import haversine
# Ref.:https://janakiev.com/blog/gps-points-distance-python/

# Problem definition

Cluster regions based on exterminations

# Load the data

In [2]:
# Load the data
df_base = pd.read_csv('declarations-exterminations-punaises-de-lit-1.csv')

In [3]:
print(df_base.columns)
print('')
print(df_base.dtypes)

Index(['NO_DECLARATION', 'DATE_DECLARATION', 'DATE_PRIOR_INSP', 'EXT_FREQ',
       'DATE_FIRST_EXT', 'DATE_LAST_EXT', 'HOOD_NUM', 'HOOD_NAME', 'BORO_NAME',
       'MTM8_X', 'MTM8_Y', 'LONGITUDE', 'LATITUDE', 'LONG_LAT', 'MTM_X_Y',
       'DEC_MONTH', 'DEC_ISSUE', 'DATE_DIFF'],
      dtype='object')

NO_DECLARATION        int64
DATE_DECLARATION     object
DATE_PRIOR_INSP      object
EXT_FREQ            float64
DATE_FIRST_EXT       object
DATE_LAST_EXT        object
HOOD_NUM             object
HOOD_NAME            object
BORO_NAME            object
MTM8_X              float64
MTM8_Y              float64
LONGITUDE           float64
LATITUDE            float64
LONG_LAT             object
MTM_X_Y              object
DEC_MONTH             int64
DEC_ISSUE             int64
DATE_DIFF           float64
dtype: object


In [4]:
df_base['DATE_DECLARATION'] = pd.to_datetime(df_base['DATE_DECLARATION'])

In [None]:
# # Inspication for epicenter calculation:
# # https://towardsdatascience.com/transforming-categorical-data-for-usability-in-machine-learning-predictions-90459c3fc967?gi=2253c23cb822

# # Ref. : https://stackoverflow.com/questions/4913349/haversine-formula-in-python-bearing-and-distance-between-two-gps-points
# def haversine_np(lon1, lat1, lon2, lat2):
#     lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    
#     dlon = lon2 - lon1
#     dlat = lat2 - lat1    
    
#     a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2    
#     c = 2 * np.arcsin(np.sqrt(a))
    
#     km = 6367 * c
#     return km

# # Create distance column in dataframe, which returns km values
# df['EPI_DIST_1'] = haversine_np(
#     df['LONGITUDE'],df['LATITUDE'], -73.585636, 45.527404)

# df['EPI_DIST_2'] = haversine_np(
#     df['LONGITUDE'],df['LATITUDE'], -73.563652, 45.528809)

In [None]:
# #We have this for month and year:

# df['DEC_D'] = (pd.DatetimeIndex(df['DATE_DECLARATION']).year).map(str) + (
#     pd.DatetimeIndex(df['DATE_DECLARATION']).month).map(str)
    
# df['PRIOR_D'] = (pd.DatetimeIndex(df['DATE_PRIOR_INSP']).year).map(str) + (
#     pd.DatetimeIndex(df['DATE_PRIOR_INSP']).month).map(str)

# df['FIRST_D'] = (pd.DatetimeIndex(df['DATE_FIRST_EXT']).year).map(str) + (
#     pd.DatetimeIndex(df['DATE_FIRST_EXT']).month).map(str)

# df['LAST_D'] = (pd.DatetimeIndex(df['DATE_LAST_EXT']).year).map(str) + (
#     pd.DatetimeIndex(df['DATE_LAST_EXT']).month).map(str)

# # Create difference between LAST and FIRST Extermination
# df['D_DIFF_B'] = round( (df['DATE_LAST_EXT'] - df['DATE_FIRST_EXT'] ) 
#                                 / np.timedelta64(1,'D') )
#     # ['D_DIFF_B'] = ['DATE_LAST_EXT'] - ['DATE_FIRST_EXT']

# # Create difference between LAST and FIRST Extermination
# df['D_DIFF_C'] = round( (df['DATE_FIRST_EXT'] - df['DATE_PRIOR_INSP'] ) 
#                                 / np.timedelta64(1,'D') )
#     # ['D_DIFF_C'] = ['DATE_FIRST_EXT'] - ['DATE_PRIOR_INSP']

    
# # rename column
# df['D_DIFF_A'] = df['DATE_DIFF'].copy() # ['D_DIFF_A'] = ['DATE_DECLARATION'] - ['DATE_PRIOR_INSP']

In [None]:
# df.isnull().sum()

# Feature Engineering

In [5]:
# Ref.: https://github.com/arybressane/CEBD1260-BIG-DATA-ANALYTICS/blob/master/week7/clustering-dbscan-map.ipynb

# select a period
df = df_base[df_base['DATE_DECLARATION']>='2018-01-01']

# # remove lines with no location
df = df[(df['MTM8_X']>0)&(df['MTM8_Y']>0)]

# adapt X and Y to the visualization
df['MTM8_X'] = df.apply(lambda x: transform(Proj(init='epsg:4326'),
                                       Proj(init='epsg:3857'), x['LONGITUDE'], x['LATITUDE'])[1], axis=1)

df['MTM8_Y'] = df.apply(lambda x: transform(Proj(init='epsg:4326'), 
                                       Proj(init='epsg:3857'), x['LONGITUDE'], x['LATITUDE'])[0], axis=1)

X_columns = ['MTM8_X', 'MTM8_Y'] # only two input columns are used, regardless of date and distance from epicenter
df = df[X_columns]

In [6]:
df

Unnamed: 0,MTM8_X,MTM8_Y
388,5.705930e+06,-8.188897e+06
389,5.707264e+06,-8.187637e+06
390,5.709142e+06,-8.193115e+06
391,5.701257e+06,-8.195114e+06
392,5.707259e+06,-8.191682e+06
393,5.705930e+06,-8.188897e+06
394,5.706389e+06,-8.188774e+06
395,5.706583e+06,-8.192794e+06
404,5.719110e+06,-8.194897e+06
405,5.699442e+06,-8.190936e+06


# Model Training

In [7]:

model = DBSCAN(eps=8.0, min_samples=10)
model.fit(df[['MTM8_X', 'MTM8_Y']])

# Ref.: https://stackoverflow.com/questions/26666367/scikit-dbscan-eps-and-min-sample-value-determination
# Number of clusters in labels, ignoring noise if present.
db=model.fit(df[['MTM8_X', 'MTM8_Y']])
labels = db.labels_
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)


cluster_labels = model.labels_
n_clusters = len(set(cluster_labels))
print(collections.Counter(cluster_labels))

df['cluster'] = cluster_labels

Estimated number of clusters: 48
Counter({-1: 2790, 18: 60, 4: 49, 15: 39, 2: 37, 3: 31, 1: 28, 16: 23, 31: 23, 30: 22, 6: 21, 29: 21, 21: 20, 28: 20, 32: 19, 39: 18, 40: 16, 0: 15, 9: 15, 10: 15, 25: 15, 34: 15, 47: 15, 14: 14, 22: 14, 24: 14, 26: 14, 36: 13, 38: 13, 5: 12, 7: 12, 8: 12, 11: 12, 19: 12, 27: 12, 41: 12, 13: 11, 17: 11, 20: 11, 33: 11, 42: 11, 12: 10, 23: 10, 35: 10, 37: 10, 43: 10, 44: 10, 45: 10, 46: 10})


In [8]:
df

Unnamed: 0,MTM8_X,MTM8_Y,cluster
388,5.705930e+06,-8.188897e+06,-1
389,5.707264e+06,-8.187637e+06,0
390,5.709142e+06,-8.193115e+06,-1
391,5.701257e+06,-8.195114e+06,-1
392,5.707259e+06,-8.191682e+06,-1
393,5.705930e+06,-8.188897e+06,-1
394,5.706389e+06,-8.188774e+06,-1
395,5.706583e+06,-8.192794e+06,1
404,5.719110e+06,-8.194897e+06,-1
405,5.699442e+06,-8.190936e+06,2


In [9]:
p = figure(y_range=(5641788.0, 5751788.0), x_range=(-8152883, -8252883))
p.add_tile(CARTODBPOSITRON)

latitude  = list(df[df['cluster']>-1]['MTM8_X'].values)
longitude = list(df[df['cluster']>-1]['MTM8_Y'].values)

colormap = list(bokeh.palettes.viridis(n_clusters))
colors = [colormap[x] for x in df[df['cluster']>-1]['cluster']]
source = ColumnDataSource(data=dict(longitude=longitude, latitude=latitude))
p.circle(x=longitude, y=latitude, color=colors, fill_alpha=0.2, size=5)
show(p)

# Model Evaluation

In [10]:
# Inter-Cluster
centroids = []
for cluster in sorted(set(model.labels_)):
    centroids.append(df[df['cluster']==cluster][X_columns].mean().values)
distances = []
for c1 in centroids:
    for c2 in centroids:
        distances.append(euclidean_distances(c1.reshape(-1, 1), c2.reshape(-1, 1))[0][0])
print('Inter Cluster distance', np.mean(distances))

# Intra-Cluster
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(-1, 1), v.values.reshape(-1, 1))[0][0])
print('Intra Cluster distance', np.mean(distances))

# Inertia
distances = []
for cluster in sorted(set(model.labels_)):
    df_filter = df[df['cluster']==cluster]
    centroid = df_filter[X_columns].mean().values
    for k, v in df_filter[X_columns].iterrows():
        distances.append(euclidean_distances(centroid.reshape(1, -1), v.values.reshape(1, -1), squared=True)[0][0])
print('Inertia', np.sum(distances))



Inter Cluster distance 8783.130333778312
Intra Cluster distance 4067.2272863555218
Inertia 220143488071.59375


In [11]:
# Ref.: https://medium.com/@elutins/dbscan-what-is-it-when-to-use-it-how-to-use-it-8bd506293818
print("Silhouette Coef: %0.3f" % metrics.silhouette_score(df[['MTM8_X', 'MTM8_Y']],labels))

# "A silhouette score ranges from -1 to 1, with -1 being the worst score possible and 1 being the best score." from Ref.
# "Silhouette scores of 0 suggest overlapping clusters." from Ref.

Silhouette Coef: -0.439


# Describe clusters

###### Describe your clusters (write a description/summary for each one of the clusters on the notebook)

By increasing epsilon, the amount of noise is increased, and by decreasing the min_samples in tandem, the silhouette score improve from a negative float value to a positive value near 0.  

The result only considers the longitude and latitude for the inputs, without any consideration for the distance from epicenter and for the dates relative to the intersection. Although, these variable could have been introduced into the model and thereafter a filter could have been applied, in order to review only the exertinations within a radius or distance from an epicenter in the Plateau area.

Only the 2018 period was considered for analysis, in an effort to improve run time.