<IMG SRC="https://github.com/jacquesroy/byte-size-data-science/raw/master/images/Banner.png" ALT="BSDS Banner" WIDTH=1195 HEIGHT=200>

<table align="left">
    <tr><td>
<a rel="license" href="http://creativecommons.org/licenses/by/4.0/"><img alt="Creative Commons License" style="border-width:0" src="https://i.creativecommons.org/l/by/4.0/88x31.png" /></a></td><td>This work is licensed under a <a rel="license" href="http://creativecommons.org/licenses/by/4.0/">Creative Commons Attribution 4.0 International License</a>.</td>
    </tr>
    <tr><td>Jacques Roy, Byte Size Data Science</td><td> </td></tr>
    </table>

# Understanding / Modeling
We are trying two types of modeling:
- Hierarchical clustering
- DBSCAN (Density-based spatial clustering)

In [None]:
from IPython.display import IFrame

# Embed the YouTube player in the notebook; as the last expression of the cell,
# the IFrame object is rendered as the cell output.
IFrame(src="https://www.youtube.com/embed/NoZfnj4vbAg?rel=0&amp;controls=0&amp;showinfo=0", width=560, height=315)

## Import the appropriate libraries and set up needed connections

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import ibm_db
import ibm_db_dbi

from ftplib import FTP
import requests, zipfile, io
import math
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
!pip install folium 2>&1 >foliumpip.out
import folium

In [None]:
# Connection settings used to build the Db2 connection string in the next cell
# (only 'database', 'host', 'port', 'username' and 'password' are read there).
# NOTE(review): 'password' and 'host' look like redacted placeholders — supply real
# values before running, and prefer os.environ / getpass over storing a real
# password in the notebook.
credentials = {
    'username': 'bluadmin',
    'password': """PASSWORD""",
    'sg_service_url': 'https://sgmanager.ng.bluemix.net',
    'database': 'BLUDB',
    'host': 'dashdb-. . .',
    'port': '50001',
    'url': 'https://undefined'
}
# Schema used to qualify the table names in the SQL statements built later
schema="CHICAGO"

In [None]:
# Assemble the "KEY=value;" pairs of the connection string in the order the driver
# string was originally written.
dsn_fields = [
    ("DRIVER", "{IBM DB2 ODBC DRIVER}"),
    ("DATABASE", credentials['database']),
    ("HOSTNAME", credentials['host']),
    ("PORT", credentials['port']),
    ("PROTOCOL", "TCPIP"),
    ("SECURITY", "ssl"),
    ("UID", credentials['username']),
    ("PWD", credentials['password']),
]
dsn = "".join("{0}={1};".format(key, value) for key, value in dsn_fields)

# Low-level connection, then the DB-API wrapper that pandas.read_sql is given below
conn = ibm_db.connect(dsn, "", "")
pconn = ibm_db_dbi.Connection(conn)

## Chicago accident data
We are using the Chicago accident data that we discussed in videos 59 and 60.

In this notebook, we'll use a sampled subset of our entire dataset (157,852 rows).

## Categorical values distribution
Please review video 60 to find out what was done for the categorical values.

I should have followed the `<column_name>_ID` naming convention for the numerical categorical attributes.
That would have been more consistent and saved me some coding.

In [None]:
# Categorical attributes stored as lookup ids; every name ends in "_ID" and the
# query builder strips that suffix to find the matching "<prefix>_table".
categorical_columns = [
    prefix + '_ID'
    for prefix in (
        'TRAFFIC_CONTROL_DEVICE', 'DEVICE_CONDITION', 'WEATHER_CONDITION', 'LIGHTING_CONDITION',
        'FIRST_CRASH_TYPE', 'TRAFFICWAY_TYPE', 'ALIGNMENT', 'ROADWAY_SURFACE_COND', 'ROAD_DEFECT',
        'REPORT_TYPE', 'CRASH_TYPE', 'DAMAGE', 'PRIM_CONTRIBUTORY_CAUSE', 'SEC_CONTRIBUTORY_CAUSE',
        'WORK_ZONE_TYPE', 'MOST_SEVERE_INJURY',
    )
]

# Attributes without the "_ID" suffix that are also joined to a "<name>_table" lookup
other_cat_columns = [
    'POSTED_SPEED_LIMIT',
    'LANE_CNT',
    'NUM_UNITS',
    'INJURIES_TOTAL',
    'CRASH_HOUR',
    'CRASH_DAY_OF_WEEK',
    'CRASH_MONTH',
]

cat_all = [*categorical_columns, *other_cat_columns]

# Identify the columns we want to drop from the modeling
dropped_columns = {
    'RD_NO',
    'CRASH_DATE_EST_I',
    'CRASH_DATE',
    'POSTED_SPEED_LIMIT',
    'DATE_POLICE_NOTIFIED',
    'STREET_NO',
    'STREET_NAME',
}

## Get the data
We need to build the SQL statement that accesses all the referenced tables.<br/>
This applies the grouping we want for each categorical attribute.

In [None]:
# Read the column definitions of the accidents table from the database catalog,
# in column order, so the SELECT list can be generated in the next cell.
column_query_lines = [
    "",
    "SELECT NAME,COLTYPE,LENGTH,SCALE, NULLS",
    "FROM SYSIBM.SYSCOLUMNS",
    "WHERE TBNAME = 'CHICAGOACCIDENTS'",
    "AND   TBCREATOR = '{0}'",
    "ORDER BY COLNO;",
    "",
]
sql = "\n".join(column_query_lines).format(schema)
tabdef_pd = pd.read_sql(sql, pconn)

In [None]:
# Table sampling clause appended to the accidents table in the FROM list.
sampling = " TABLESAMPLE SYSTEM (2) "
# Optional row cap appended at the end of the statement, e.g. " LIMIT 2000 "
limitrows = ""

# Each clause is collected in a list and joined once, so no trailing separator has
# to be sliced off afterwards. (Slicing one character off the SELECT list left a
# dangling comma whenever the last selected column was a categorical one.)

# SELECT list: categorical columns are replaced by the "grouping" value of their
# lookup table, every other column is selected as is.
select_items = []
for name in tabdef_pd['NAME']:
    if name in dropped_columns:  # skip the columns we don't want to use in modeling
        continue
    if name in categorical_columns:
        # "<prefix>_ID" columns map to "<prefix>_table"
        select_items.append("{0}.{1}_table.grouping as {1}_ID".format(schema, name[:-3]))
    elif name in other_cat_columns:
        select_items.append("{0}.{1}_table.grouping as {1}_ID".format(schema, name))
    else:
        select_items.append(name)

# FROM list and join conditions: one lookup table per categorical column that is kept
from_items = ["{0}.ChicagoAccidents {1}".format(schema, sampling)]
join_conditions = []
for name in cat_all:
    if name in dropped_columns:  # skip the columns we don't want to use in modeling
        continue
    if name in other_cat_columns:
        from_items.append("{0}.{1}_table".format(schema, name))
        join_conditions.append(
            "{0}.ChicagoAccidents.{1} = {0}.{1}_table.id".format(schema, name))
    else:
        from_items.append("{0}.{1}_table".format(schema, name[:-3]))
        join_conditions.append(
            "{0}.ChicagoAccidents.{1}_id = {0}.{1}_table.id".format(schema, name[:-3]))

# Only emit a WHERE clause when there is at least one join condition
where_clause = ("\n WHERE " + "\nAND ".join(join_conditions)) if join_conditions else ""

sql = (
    "SELECT " + ",\n".join(select_items)
    + "\n FROM " + ",".join(from_items)
    + where_clause
    + " {0};".format(limitrows)
)

data_pd = pd.read_sql(sql, pconn)
print("Number of records: {0}".format(data_pd.shape[0]))
data_pd.head(5)

## Hierarchical Clustering

In [None]:
from sklearn.preprocessing import normalize
from sklearn.preprocessing import LabelEncoder #, OneHotEncoder
# from sklearn.preprocessing import StandardScaler

In [None]:
# Character columns that are replaced by integer codes (one LabelEncoder per column)
char_columns = ["INTERSECTION_RELATED_I","NOT_RIGHT_OF_WAY_I","HIT_AND_RUN_I",
                "STREET_DIRECTION","PHOTOS_TAKEN_I","STATEMENTS_TAKEN_I","DOORING_I",
                "WORK_ZONE_I","WORKERS_PRESENT_I"]

# Build the modeling frame in one chain: the coordinates are left out of the
# clustering input and the character columns are overwritten with their codes.
encoded_pd = (
    data_pd
    .drop(columns=["LATITUDE", "LONGITUDE"])
    .assign(**{
        name: LabelEncoder().fit_transform(data_pd[name])
        for name in char_columns
    })
)
encoded_pd.head(5)

In [None]:
# Scale the encoded features with scikit-learn's normalize(); the result is a NumPy
# array that is wrapped back into a DataFrame further down.
# NOTE(review): normalize() defaults to row-wise L2 scaling (axis=1), which also
# rescales the label-encoded categorical codes, so equal categories in different rows
# may no longer compare equal in the Gower distance computed later — confirm this is
# intended (per-column scaling of the numeric features only may be what was meant).
data_scaled = normalize(encoded_pd)

In [None]:
# Import objects assuming the k-means section was skipped
from scipy import ndimage
from scipy.cluster import hierarchy
from scipy.spatial import distance_matrix
from matplotlib import pyplot as plt
from sklearn import manifold, datasets
from sklearn.cluster import AgglomerativeClustering
# make_blobs is imported from its public location: the private
# sklearn.datasets.samples_generator module was deprecated in scikit-learn 0.22 and
# removed in 0.24, so the old import path fails on current installations.
from sklearn.datasets import make_blobs

## Gower Library
This library calculates the distance between records taking into account that categorical values are either the same or not.

In [None]:
# Library to deal with categorical values without onehot encoding
!pip install gower

### Thank you to Robert Uleman for providing the functions in the following two cells

In [None]:
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import linkage, fcluster
from gower import gower_matrix
def get_linkage(df, weights, cat_columns):
    '''
    Cluster the rows of a DataFrame hierarchically using Gower distances.

    Parameters:
        df          input DataFrame
        weights     list of weights, same length and in same order as df.columns
        cat_columns list of booleans, same length and in same order as df.columns,
                    indicating whether a column is categorical or not
    Returns:
        The hierarchical clustering encoded as a linkage matrix
        (complete linkage).
    '''
    # Square matrix of pairwise Gower distances between all rows
    gower_distances = gower_matrix(data_x=df, weight=weights, cat_features=cat_columns)
    # linkage() is fed the condensed form of the square distance matrix
    condensed_distances = squareform(gower_distances)
    return linkage(condensed_distances, method='complete')

In [None]:
from collections import OrderedDict
def make_weights(df, default=1, **kwargs):
    '''
    Build the per-column weights used for the Gower distance computation.

    The Gower helper takes its parameters as parallel lists, so the result keeps
    the column order of the DataFrame.

    Parameters:
        df        Pandas DataFrame; one weight is created per column
        default   weight given to every column that is not explicitly overridden
        kwargs    column_name=weight pairs replacing the default for those columns
    Returns:
        An OrderedDict of {column_name: weight} pairs for all columns in df
    '''
    # Start with the default weight for every column, in column order
    weights = OrderedDict((column, default) for column in df.columns)
    # Apply the explicit overrides one by one
    for column, weight in kwargs.items():
        weights[column] = weight
    return weights

In [None]:
# Wrap the scaled array back into a DataFrame carrying the encoded column names
df = pd.DataFrame(data_scaled, columns=encoded_pd.columns)

In [None]:
# Columns that the Gower distance must treat as categorical (equal / not equal).
# One name per line with a trailing comma: a missing comma between two adjacent
# string literals silently concatenates them into a single bogus column name, which
# previously kept SEC_CONTRIBUTORY_CAUSE_ID and STREET_DIRECTION out of this list.
# NOTE: fixing that changes the distances, so the dendrogram cutoff chosen below may
# need to be re-tuned.
w_categorical_columns = [
    'TRAFFIC_CONTROL_DEVICE_ID',
    'DEVICE_CONDITION_ID',
    'WEATHER_CONDITION_ID',
    'LIGHTING_CONDITION_ID',
    'FIRST_CRASH_TYPE_ID',
    'TRAFFICWAY_TYPE_ID',
    'ALIGNMENT_ID',
    'ROADWAY_SURFACE_COND_ID',
    'ROAD_DEFECT_ID',
    'REPORT_TYPE_ID',
    'CRASH_TYPE_ID',
    'INTERSECTION_RELATED_I',
    'NOT_RIGHT_OF_WAY_I',
    'HIT_AND_RUN_I',
    'DAMAGE_ID',
    'PRIM_CONTRIBUTORY_CAUSE_ID',
    'SEC_CONTRIBUTORY_CAUSE_ID',
    'STREET_DIRECTION',
    'PHOTOS_TAKEN_I',
    'STATEMENTS_TAKEN_I',
    'DOORING_I',
    'WORK_ZONE_I',
    'WORK_ZONE_TYPE_ID',
    'WORKERS_PRESENT_I',
    'MOST_SEVERE_INJURY_ID',
]


In [None]:
# Background on the gower package:
# https://www.thinkdatascience.com/post/2019-12-16-introducing-python-package-gower/
weights_dict = make_weights(df)

# Parallel per-column inputs, both in df.columns order
gower_weights = np.asarray(list(weights_dict.values()))
categorical_mask = df.columns.isin(w_categorical_columns)

Z = get_linkage(df, weights=gower_weights, cat_columns=categorical_mask)

In [None]:
from scipy.cluster.hierarchy import dendrogram

# Draw the full dendrogram of the linkage matrix Z so a cut height can be chosen.
plt.figure(figsize=(15, 7))  
plt.title("Dendrograms")  
dend = dendrogram(Z)

In [None]:
# Distance at which the dendrogram is cut: drawn as a horizontal line in the next
# plot and passed to fcluster() as the 'distance' threshold.
# NOTE(review): presumably picked by eye from the dendrogram above — the input is a
# random table sample, so re-check the value after re-running the query.
cutoff=0.58

In [None]:
# Same dendrogram again, with the chosen cutoff drawn as a dashed red line.
plt.figure(figsize=(15, 7))  
plt.title("Dendrograms")  
plt.axhline(y=cutoff, color='r', linestyle='--')
dend = dendrogram(Z)

In [None]:
# Flat cluster label for every row, obtained by cutting the tree at `cutoff`
vals = fcluster(Z, t=cutoff, criterion='distance')

# (labels, counts) pair; the counts are used for the map legend
clusters_counts = np.unique(vals, return_counts=True)
# The highest label is used as the number of clusters
nb_clusters = vals.max()
print("Number of clusters: {0}".format(nb_clusters))

In [None]:
# Attach the cluster label to the original rows; -1 marks rows without a label.
data_pd['hgroup'] = -1

# Rows that are complete once the coordinates are ignored receive the labels
labelled_index = data_pd.drop(columns=["LATITUDE", "LONGITUDE"]).dropna().index
data_pd.loc[labelled_index, ['hgroup']] = vals

# Keep only the rows that ended up in a cluster
pdata_pd = data_pd[data_pd['hgroup'] > -1]

## Display the clusters on a map
Display each address as a point with a specific color by cluster

### Prep the cluster colors

In [None]:
import matplotlib.cm as cm

# The colormap is called with nb_clusters + 1 evenly spaced samples in [0, 1] and
# returns one 4-component color per sample.
colors = cm.rainbow(np.linspace(0.0, 1.0, num=nb_clusters + 1))

In [None]:
# Turn each color into a '#rrggbb' string: the first three components are scaled to
# 0..255 and floored; the fourth component is ignored.
rgbcolors = [
    '#{0:02x}{1:02x}{2:02x}'.format(
        *(int(channel) for channel in np.floor(rgba * 255)[:3])
    )
    for rgba in colors
]

In [None]:
# Legend technique: https://medium.com/@bobhaffner/creating-a-legend-for-a-folium-map-c1e0ffc34373
# Icon names: https://fontawesome.com/v4.7.0/icons/

# The legend is collected as HTML fragments and joined once at the end:
# an opening <div>, one entry per cluster, and the closing tag.
legend_parts = ['''
<div style="position: fixed; 
     top: 50px; right: 50px; width: 150px; height: 150px; 
     border:2px solid grey; z-index:9999; font-size:14px;
     ">&nbsp; <u><b>CLUSTERS</b></u> <br/>
''']

# {0} = number of records, {1} = 1-based cluster number, {2} = cluster color
legend_entry = '''
    &nbsp; <i class="fa fa-square" style="color:{2}"></i>
    &nbsp; ({0}) Cluster-{1} &nbsp;<br/>
    '''
for idx in range(nb_clusters):
    legend_parts.append(
        legend_entry.format(clusters_counts[1][idx], (idx + 1), rgbcolors[idx]))

legend_parts.append('</div>')
legend_html = ''.join(legend_parts)
# print(legend_html)

### Display the map

In [None]:
# Calculate a center point for the map.
# The mean is a Series indexed by column name, so it is read by label; positional
# access such as latlong[0] on a label-indexed Series is deprecated in pandas.
latlong = pdata_pd[['LATITUDE','LONGITUDE']].mean()

chi_map = folium.Map(location=[latlong['LATITUDE'], latlong['LONGITUDE']],
                     zoom_start=10, width="100%", height="100%")

# One small circle per accident, colored by its cluster (labels start at 1, hence
# the "- 1"). Rows without coordinates are skipped: the coordinates were left out of
# the clustering input, so a clustered row may still lack a position, and folium
# rejects NaN locations.
for row in pdata_pd.dropna(subset=['LATITUDE','LONGITUDE']).itertuples() :
    cluster_color = rgbcolors[row.hgroup - 1]
    tooltip_content="BEAT_OF_OCCURRENCE: {0}<br/>Cluster: {1}<br/>".format(
        row.BEAT_OF_OCCURRENCE,row.hgroup)
    folium.Circle(
        radius=10,
        location=[row.LATITUDE, row.LONGITUDE],
        popup=row.hgroup,
        color=cluster_color,
        tooltip=tooltip_content,
        fill=True,
        fill_color=cluster_color
    ).add_to(chi_map)

# Overlay the legend built in the previous cell on top of the map
chi_map.get_root().html.add_child(folium.Element(legend_html))
chi_map

## DBSCAN
Density-based spatial clustering.<br/>
Locates regions of high density that are separated from one another by regions of low density.

info:
- https://scikit-learn.org/stable/modules/clustering.html#clustering
- https://scikit-learn.org/stable/modules/classes.html#module-sklearn.cluster
- https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html
- https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html

In [None]:
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

In [None]:
# Get the location information
sql = """
  SELECT LATITUDE, LONGITUDE
  FROM CHICAGO.ChicagoAccidents TABLESAMPLE SYSTEM(20)
  ;
"""

data_pd = pd.read_sql(sql, pconn)
print("Number of records: {0}".format(data_pd.shape[0]))
data_pd.head(5)

In [None]:
data_np = data_pd[['LATITUDE','LONGITUDE']].to_numpy()

# Metrics that could be tried: [‘cityblock’, ‘cosine’, ‘euclidean’, ‘l1’, ‘l2’, ‘manhattan’]
# All arguments are spelled out explicitly; eps and min_samples are the tuned ones.
db = DBSCAN(eps=0.001, min_samples=20, metric='euclidean', metric_params=None,
            algorithm='auto', leaf_size=30, p=None, n_jobs=None)
db.fit(data_np)

labels = db.labels_

# Boolean mask that is True for the core samples of the fitted model
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

# Number of clusters in labels, ignoring noise (label -1) if present.
unique_labels = set(labels)
n_clusters_ = len(unique_labels - {-1})
n_noise_ = int(np.count_nonzero(labels == -1))

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of noise points: %d' % n_noise_)

# Plot result: one Spectral color per label, except noise which is drawn in black.
colors = [plt.cm.Spectral(each)
          for each in np.linspace(0, 1, len(unique_labels))]
for k, col in zip(unique_labels, colors):
    marker_color = (0, 0, 0, 1) if k == -1 else tuple(col)

    # Only the core samples of each label are drawn
    xy = data_np[(labels == k) & core_samples_mask]
    plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=marker_color,
             markeredgecolor='k', markersize=14)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [None]:
import matplotlib.cm as cm

# n_clusters_ + 1 evenly spaced samples of the rainbow colormap
colors = cm.rainbow(np.linspace(0.0, 1.0, num=n_clusters_ + 1))

# Convert each color to a '#rrggbb' string: the first three components are scaled to
# 0..255 and floored; the fourth component is ignored.
rgbcolors = [
    '#{0:02x}{1:02x}{2:02x}'.format(
        *(int(channel) for channel in np.floor(rgba * 255)[:3])
    )
    for rgba in colors
]

In [None]:
# Display the average center of each group

# Keep only the points that belong to a cluster (noise is labelled -1) and append
# the label as a third column: [latitude, longitude, label].
clustered = db.labels_ > -1
all_recs = np.column_stack((data_np[clustered], db.labels_[clustered]))

# np.unique returns the labels sorted, with the matching number of points
unique_elements, counts_elements = np.unique(all_recs[...,2], return_counts=True)

# One [mean latitude, mean longitude, label] entry per cluster, in label order
results=[]
for label in unique_elements:
    members = all_recs[all_recs[...,2] == label]
    results.append([members[:, 0].mean(), members[:, 1].mean(), label])

# Center the map on the mean position of all clustered points
latlong = all_recs.mean(axis=0)[0:2]

chi_map = folium.Map(location=[latlong[0], latlong[1]], zoom_start=10, width="90%", height="90%")

# results and counts_elements are both in unique_elements order, so they are paired
# directly instead of using the label value as an index into the counts.
for (center_lat, center_lon, label), count in zip(results, counts_elements):
    cluster_id = int(label)
    # The labels kept here start at 0, so the label indexes the color list directly.
    # (With a "- 1" offset, label 0 wraps around to the last color.)
    cluster_color = rgbcolors[cluster_id]
    tooltip_content="Cluster: {0}, count: {1}".format(cluster_id, count)
    folium.Circle(radius=10,
                  location=[center_lat, center_lon],
                  color=cluster_color,
                  tooltip=tooltip_content,
                  fill=True,
                  fill_color=cluster_color
    ).add_to(chi_map)

chi_map

## Convert the points in each cluster to a polygon/multipolygon
see: https://shapely.readthedocs.io/en/stable/manual.html

In [None]:
!pip install geopandas 2>&1 >pipgeopandas.txt
import geopandas as gp

In [None]:
# Tag every sampled point with its DBSCAN label and a unit counter that is summed
# per cluster later on.
data_pd['cgroup'] = db.labels_
data_pd['cnt'] = 1

# Point geometries take x = longitude and y = latitude
accident_points = gp.points_from_xy(data_pd['LONGITUDE'], data_pd['LATITUDE'])
geo_gpd = gp.GeoDataFrame(data_pd, geometry=accident_points)
geo_gpd.head(5)

In [None]:
# Use the DBSCAN eps value for the buffer
group_gpd = geo_gpd[['cgroup','geometry','cnt']].dissolve(by='cgroup', aggfunc='sum').reset_index().drop([0])
# group2_gpd = gp.GeoDataFrame(group_gpd[['cgroup','cnt']],geometry=group_gpd.geometry.buffer(0.003))
group2_gpd = gp.GeoDataFrame(group_gpd[['cgroup','cnt']],geometry=group_gpd.geometry.convex_hull)
group2_gpd.head(5)

In [None]:
# Center the map on the mean position of all sampled points. The mean is a Series
# indexed by column name, so it is read by label; positional access such as
# latlong[0] on a label-indexed Series is deprecated in pandas.
latlong = geo_gpd[['LATITUDE','LONGITUDE']].mean(axis=0)

chi_map = folium.Map(location=[latlong['LATITUDE'], latlong['LONGITUDE']],
                     zoom_start=10, width="90%", height="90%")

# One GeoJson layer per cluster outline; the layer name is what LayerControl lists
for cluster in group2_gpd.itertuples(index=False) :
    folium.GeoJson(
        cluster.geometry,
        name="cluster-{0}".format(cluster.cgroup),
        tooltip="Cluster: {0}, count: {1}".format(cluster.cgroup, cluster.cnt)
    ).add_to(chi_map)

folium.LayerControl().add_to(chi_map)
chi_map