In [1]:
# Select modeling techniques: Determine which algorithms to try (e.g. regression, neural
# net).
# 
# Generate test design: Pending your modeling approach, you might need to split the data
# into training, test, and validation sets.
# 
# Build model: As glamorous as this might sound, this might just be executing a few
# lines of code like “reg = LinearRegression().fit(X, y)”.
# 
# Assess model: Generally, multiple models are competing against each other, and the
# data scientist needs to interpret the model results based on domain knowledge, the
# pre-defined success criteria, and the test design.

In [2]:
# Draft conclusions
# --------------------------------------------------------------------------------------

# - Big cities are more prone to accidents (ofc, more people, more cars, more accidents)
# - Some clusters could be merged, this is specially true for clusters in cities
# - Liked the idea of removing noise (-1) from the data, but not sure if it's actually
# noise (check original data). IMO these sparse points are not relevant for the analysis
# 
# - Filter our some results, so there are less to analyze
# - Minimum number of clusters: 15
# - Maximum number of clusters: sqrt(n_samples) = 140
# 
# - We want accidents close to each other: small eps
# - We want clusters with many accidents: high min_samples
# 
# - If min_samples is too high, too few cluster (only around big cities)
# - If min_samples is too low, too much clusters
# - If eps is too big, too big clusters
# - If eps is too small, too small clusters

In [1]:
import os
import itertools
import shutil

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from geopy.distance import great_circle
from sklearn import metrics
from sklearn.cluster import DBSCAN

pd.set_option("display.max_columns", None)
YEAR = "2021"

shutil.rmtree(f"../output/d-modeling")
os.makedirs(f"../output/d-modeling", exist_ok=True)
os.makedirs(f"../data/d-modeling", exist_ok=True)

def haversine(coords1, coords2):
    return great_circle(coords1, coords2).kilometers

In [2]:
# Group features
# --------------------------------------------------------------------------------------

# espaciais-1: latitude, longitude
# espaciais-2: uf, br, km, municipio
# caracteristicas: condicao_metereologica, causa_acidente, tipo_acidente, classificacao_acidente
# estatisticas: pessoas, mortos, feridos_leves, feridos_graves, ilesos, ignorados, feridos, veiculos
# 
# temporais: ano, mes, dia, hora, minuto, dia_semana, fase_dia
# pista: sentido_via, tipo_pista, tracado_via, uso_solo
# prf: regional, delegacia, uop

# Columns to consider for clustering
espaciais_1 = ["latitude", "longitude"]
espaciais_2 = ["uf", "br", "km", "municipio"]
caracteristicas = ["condicao_metereologica", "causa_acidente", "tipo_acidente", "classificacao_acidente"]
estatisticas = ["pessoas", "mortos", "feridos_leves", "feridos_graves", "ilesos", "ignorados", "feridos", "veiculos"]

# Columns to NOT consider for clustering
# temporais = ["ano", "mes", "dia", "hora", "minuto", "dia_semana", "fase_dia"]
# pista = ["sentido_via", "tipo_pista", "tracado_via", "uso_solo"]
# prf = ["regional", "delegacia", "uop"]

In [11]:
df.head()

Unnamed: 0,dia_semana,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop,ano,mes,dia,hora,minuto,cluster
0,sexta-feira,PR,277.0,51.3,SAO JOSE DOS PINHAIS,Pista Escorregadia,Saída de leito carroçável,Com Vítimas Feridas,Pleno dia,Decrescente,Garoa/Chuvisco,Dupla,Curva,Não,4,0,1,0,3,0,1,3,-25.59516,-48.907008,SPRF-PR,DEL01-PR,UOP05-DEL01-PR,2021,1,1,15,45,0
1,sexta-feira,SC,470.0,79.1,INDAIAL,Transitar na contramão,Colisão frontal,Com Vítimas Fatais,Pleno dia,Crescente,Nublado,Simples,Reta,Não,9,2,5,1,0,1,6,3,-26.951565,-49.306534,SPRF-SC,DEL04-SC,UOP01-DEL04-SC,2021,1,1,17,10,1
2,sexta-feira,SC,470.0,130.0,LONTRAS,Acessar a via sem observar a presença dos outr...,Colisão transversal,Com Vítimas Feridas,Plena Noite,Decrescente,Nublado,Simples,Não Informado,Sim,4,0,0,2,2,0,2,2,-27.160376,-49.55658,SPRF-SC,DEL04-SC,UOP02-DEL04-SC,2021,1,1,19,50,1
3,sexta-feira,PR,116.0,59.0,CAMPINA GRANDE DO SUL,Entrada inopinada do pedestre,Atropelamento de Pedestre,Com Vítimas Fatais,Plena Noite,Crescente,Chuva,Dupla,Não Informado,Sim,2,1,0,0,1,0,0,1,-25.303357,-48.943789,SPRF-PR,DEL01-PR,UOP02-DEL01-PR,2021,1,1,20,40,2
4,sábado,SC,101.0,112.0,ITAJAI,Trafegar com motocicleta (ou similar) entre as...,Colisão traseira,Com Vítimas Feridas,Pleno dia,Decrescente,Sol,Dupla,Não Informado,Não,4,0,1,1,2,0,2,3,-26.84589,-48.721313,SPRF-SC,DEL04-SC,UOP04-DEL04-SC,2021,1,2,9,25,3


In [3]:
# Plotting clusters on a map
# --------------------------------------------------------------------------------------

def plot(labels, dst, title):
    # Read original data
    file = f"../data/c-data-preparation/dataset.csv"
    df = pd.read_csv(file)

    num_clusters = len(np.unique(labels))
    color_scale = ["hsl(" + str(h) + ",50%" + ",50%)" for h in np.linspace(0, 360, num_clusters)]

    fig = go.Figure()

    for cluster_label, color in zip(np.unique(labels), color_scale):
        # Skip noise
        if cluster_label == -1:
            continue

        cluster_mask = labels == cluster_label
        cluster_data = df[cluster_mask]

        scatter = go.Scattermapbox(
            lat=cluster_data.latitude,
            lon=cluster_data.longitude,
            mode="markers",
            marker=dict(size=7.5, color=color),
            # text=f"({cluster_label}) {cluster_data.causa_acidente}",
            text=f"{cluster_label}",
            showlegend=False,
        )

        fig.add_trace(scatter)

    fig.update_layout(
        title=title,
        title_x=0.5,
        title_y=0.95,
        mapbox_style="open-street-map",
        mapbox=dict(center=dict(lat=-28, lon=-52), zoom=4.5),
        margin={"l": 0, "r": 0, "t": 0, "b": 0},
    )

    # fig.show()
    fig.write_html(f"{dst}.html")

In [4]:
# Build model (euclidean)
# --------------------------------------------------------------------------------------
# 
# - eps 0.75: too big clusters
# - eps 0.75: too few n_noise

file = "../data/c-data-preparation/prepared.csv"
df = pd.read_csv(file)

METRIC = "euclidean"
EPS = [0.25, 0.5, 0.75]
MIN_SAMPLES = [10, 25, 50]
COLUMNS = [espaciais_1, espaciais_1 + espaciais_2]

length = len(df)
minimum = 15
maximum = int(np.sqrt(length))

hyperparameters = itertools.product(EPS, MIN_SAMPLES, COLUMNS)
for i, (eps, min_samples, columns) in enumerate(hyperparameters):
    header = f"metric: {METRIC}, eps: {eps}, min_samples: {min_samples}"
    print(header)
    print("-" * len(header))

    X_train = df[columns].values

    model = DBSCAN(eps=eps, min_samples=min_samples, metric=METRIC)
    model.fit(X_train)

    # Number of clusters in labels, ignoring noise if present.
    labels = model.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    
    if not (minimum <= n_clusters <= maximum):
        print(f"Number of clusters is outside allowed range")
        print(f"Skipping, condition not met: {minimum} <= {n_clusters} <= {maximum}\n")
        continue

    score = metrics.silhouette_score(X_train, labels)

    dst = f"../output/d-modeling/{METRIC}-{eps:.2f}-{min_samples}-{i}"
    title = f"n_clusters: {n_clusters}, n_noise: {n_noise}, score: {score:.3f}"

    print(f"Columns: {columns}")
    print(f"Estimated number of clusters: {n_clusters}")
    print(f"Estimated number of noise points: {n_noise}")
    print(f"Silhouette coefficient: {score:.3f}\n")

    plot(labels, dst, title)

metric: euclidean, eps: 0.25, min_samples: 10
---------------------------------------------
Number of clusters is outside allowed range
Skipping, condition not met: 15 <= 2 <= 140

metric: euclidean, eps: 0.25, min_samples: 10
---------------------------------------------
Number of clusters is outside allowed range
Skipping, condition not met: 15 <= 290 <= 140

metric: euclidean, eps: 0.25, min_samples: 25
---------------------------------------------
Number of clusters is outside allowed range
Skipping, condition not met: 15 <= 3 <= 140

metric: euclidean, eps: 0.25, min_samples: 25
---------------------------------------------
Number of clusters is outside allowed range
Skipping, condition not met: 15 <= 183 <= 140

metric: euclidean, eps: 0.25, min_samples: 50
---------------------------------------------
Number of clusters is outside allowed range
Skipping, condition not met: 15 <= 3 <= 140

metric: euclidean, eps: 0.25, min_samples: 50
---------------------------------------------

In [7]:
# Build model (haversine)
# --------------------------------------------------------------------------------------
# 
# - eps < 5: too small/few clusters

file = "../data/c-data-preparation/dataset.csv"
df = pd.read_csv(file)

METRIC = haversine
EPS = [2.5, 5.0, 7.5]
MIN_SAMPLES = [25, 50, 75]
COLUMNS = [espaciais_1]

length = len(df)
minimum = 15
maximum = int(np.sqrt(length))

hyperparameters = itertools.product(EPS, MIN_SAMPLES, COLUMNS)
for i, (eps, min_samples, columns) in enumerate(hyperparameters):
    header = f"metric: haversine, eps: {eps}, min_samples: {min_samples}"
    print(header)
    print("-" * len(header))

    X_train = df[columns].values

    model = DBSCAN(eps=eps, min_samples=min_samples, metric=METRIC)
    model.fit(X_train)

    # Number of clusters in labels, ignoring noise if present.
    labels = model.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise = list(labels).count(-1)
    
    if not (minimum <= n_clusters <= maximum):
        print(f"Number of clusters is outside allowed range")
        print(f"Skipping, condition not met: {minimum} <= {n_clusters} <= {maximum}\n")
        continue

    score = metrics.silhouette_score(X_train, labels)

    dst = f"../output/d-modeling/haversine-{eps:.2f}-{min_samples}-{i}"
    title = f"n_clusters: {n_clusters}, n_noise: {n_noise}, score: {score:.3f}"

    print(f"Columns: {columns}")
    print(f"Estimated number of clusters: {n_clusters}")
    print(f"Estimated number of noise points: {n_noise}")
    print(f"Silhouette coefficient: {score:.3f}\n")

    plot(labels, dst, title)

metric: haversine, eps: 2.5, min_samples: 25
--------------------------------------------
Columns: ['latitude', 'longitude']
Estimated number of clusters: 78
Estimated number of noise points: 7811
Silhouette coefficient: -0.013

metric: haversine, eps: 2.5, min_samples: 50
--------------------------------------------
Columns: ['latitude', 'longitude']
Estimated number of clusters: 30
Estimated number of noise points: 11707
Silhouette coefficient: -0.198

metric: haversine, eps: 2.5, min_samples: 75
--------------------------------------------
Columns: ['latitude', 'longitude']
Estimated number of clusters: 24
Estimated number of noise points: 13629
Silhouette coefficient: -0.313

metric: haversine, eps: 5.0, min_samples: 25
--------------------------------------------
Columns: ['latitude', 'longitude']
Estimated number of clusters: 73
Estimated number of noise points: 4438
Silhouette coefficient: 0.015

metric: haversine, eps: 5.0, min_samples: 50
--------------------------------------

In [9]:
# Build final model
# --------------------------------------------------------------------------------------

FILE = "../data/c-data-preparation/prepared.csv"
METRIC = "euclidean"
EPS = 0.5
MIN_SAMPLES = 25
COLUMNS = espaciais_1 + espaciais_2

df = pd.read_csv(FILE)
length = len(df)

header = f"metric: {METRIC}, eps: {EPS}, min_samples: {MIN_SAMPLES}"
print(header)
print("-" * len(header))

X_train = df[COLUMNS].values

model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, metric=METRIC)
model.fit(X_train)

# Number of clusters in labels, ignoring noise if present
labels = model.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
score = metrics.silhouette_score(X_train, labels)

title = f"n_clusters: {n_clusters}, n_noise: {n_noise}, score: {score:.3f}"

print(f"Columns: {columns}")
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")
print(f"Silhouette coefficient: {score:.3f}\n")

metric: euclidean, eps: 0.5, min_samples: 25
--------------------------------------------
Columns: ['latitude', 'longitude']
Estimated number of clusters: 128
Estimated number of noise points: 1368
Silhouette coefficient: 0.499



In [10]:
# Save dataset with cluster column to csv
# --------------------------------------------------------------------------------------

df = pd.read_csv("../data/c-data-preparation/dataset.csv")
df["cluster"] = model.labels_
df.to_csv("../data/d-modeling/clustered.csv", index=False)