In [None]:
# MODELING
# --------------------------------------------------------------------------------------

# Select modeling techniques: Determine which algorithms to try (e.g. regression, neural
# net).
# 
# Generate test design: Depending on your modeling approach, you might need to split the
# data into training, test, and validation sets.
# 
# Build model: As glamorous as this might sound, this might just be executing a few
# lines of code like “reg = LinearRegression().fit(X, y)”.
# 
# Assess model: Generally, multiple models are competing against each other, and the
# data scientist needs to interpret the model results based on domain knowledge, the
# pre-defined success criteria, and the test design.

In [None]:
# CONCLUSIONS
# --------------------------------------------------------------------------------------

# - Big cities are more prone to accidents (ofc, more people, more cars, more accidents)
# - Some clusters could be merged; this is especially true for clusters in cities
# - Liked the idea of removing noise (-1) from the data, but not sure if it's actually
# noise (check original data). IMO these sparse points are not relevant for the analysis
# 
# - Filter out some results, so there are fewer to analyze
# - Minimum number of clusters: 15
# - Maximum number of clusters: sqrt(n_samples) = 140
# 
# - We want accidents close to each other: small eps
# - We want clusters with many accidents: high min_samples
# 
# - If min_samples is too high, too few clusters (only around big cities)
# - If min_samples is too low, too many clusters
# - If eps is too big (0.75), clusters are too big + too few n_noise
# - If eps is too small, clusters are too small

In [10]:
import os
import itertools
import shutil

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn import metrics
from sklearn.cluster import DBSCAN

# Notebook configuration, output directories and input data
# --------------------------------------------------------------------------------------

pd.set_option("display.max_columns", None)
YEAR = "2021"

# Start every run from an empty output directory. On a fresh checkout the directory
# does not exist yet, which is fine: only that specific error is ignored so that any
# other failure to clean up (e.g. permissions) still surfaces.
try:
    shutil.rmtree("../output/d-modeling")
except FileNotFoundError:
    pass

os.makedirs("../output/d-modeling", exist_ok=True)
os.makedirs("../data/d-modeling", exist_ok=True)

# `raw` is what gets plotted and exported; `preprocessed` is what the models are fit on.
# NOTE(review): later cells apply labels computed from `preprocessed` to `raw`, which
# assumes both files have the same rows in the same order - confirm in data preparation.
file = f"../data/c-data-preparation/{YEAR}-raw.csv"
raw = pd.read_csv(file)

file = f"../data/c-data-preparation/{YEAR}-preprocessed.csv"
preprocessed = pd.read_csv(file)

In [12]:
# Group features
# --------------------------------------------------------------------------------------

# geograficas: latitude, longitude, uf, br, km, municipio
# caracteristicas: condicao_metereologica, causa_acidente, tipo_acidente, classificacao_acidente
# estatisticas: pessoas, mortos, feridos_leves, feridos_graves, ilesos, ignorados, feridos, veiculos
# temporais: ano, mes, dia, hora, minuto, dia_semana, fase_dia
# pista: sentido_via, tipo_pista, tracado_via, uso_solo
# prf: regional, delegacia, uop

# Spatial features available for clustering. Kept as a named list for reference; the
# feature sets actually evaluated are the hand-picked ones in `combinations` below.
espaciais = ["latitude", "longitude", "br", "km"]

# Feature sets to evaluate. Every set contains both "latitude" and "longitude".
# (An earlier version generated every subset of `espaciais` with itertools and then
# overwrote the result with this list, so only this list ever took effect.)
combinations = [
    ["latitude", "longitude"],
    ["latitude", "longitude", "municipio"],
    ["latitude", "longitude", "br", "km", "municipio"],
]

for i, item in enumerate(combinations, start=1):
    print(f"{i}: {item}")

1: ['latitude', 'longitude']
2: ['latitude', 'longitude', 'br', 'km', 'municipio']


In [3]:
# Plotting clusters on a map
# --------------------------------------------------------------------------------------

def plot(df, labels, dst, title):
    """Render clustered points on an OpenStreetMap figure and save it as HTML.

    One ``Scattermapbox`` trace is added per cluster label; points labelled ``-1``
    (noise) are not drawn.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns. It is filtered with
            boolean masks derived from ``labels``, so it is assumed to hold one row
            per label, in the same order.
        labels: NumPy array of cluster labels, one per row of ``df``.
        dst: Output path without extension; the figure is written to ``{dst}.html``.
        title: Text used as the figure title.
    """
    # Noise is never drawn, so it should not consume a colour either.
    cluster_labels = [label for label in np.unique(labels) if label != -1]

    # Hue 0 and hue 360 are the same colour; excluding the endpoint guarantees that
    # no two clusters end up with an identical colour.
    hues = np.linspace(0, 360, len(cluster_labels), endpoint=False)

    fig = go.Figure()

    for cluster_label, hue in zip(cluster_labels, hues):
        cluster_data = df[labels == cluster_label]

        fig.add_trace(
            go.Scattermapbox(
                lat=cluster_data.latitude,
                lon=cluster_data.longitude,
                mode="markers",
                marker=dict(size=7.5, color=f"hsl({hue},50%,50%)"),
                # Hover text shows which cluster the point belongs to.
                text=f"{cluster_label}",
                showlegend=False,
            )
        )

    fig.update_layout(
        title=title,
        title_x=0.95,
        title_y=0.1,
        mapbox_style="open-street-map",
        mapbox=dict(center=dict(lat=-28, lon=-52), zoom=5.5),
        margin={"l": 0, "r": 0, "t": 0, "b": 0},
    )

    fig.write_html(f"{dst}.html")

In [13]:
# Build models
# --------------------------------------------------------------------------------------
# Fits one DBSCAN model per (eps, min_samples, feature set) combination. A combination
# is only scored and plotted when its number of clusters falls inside
# [minimum, maximum]; otherwise it is reported and skipped.

METRIC = "euclidean"
EPS = [0.1]
MIN_SAMPLES = [100, 250]
COLUMNS = combinations

# Accepted number of clusters: at least 15, at most sqrt(n_samples).
length = len(preprocessed)
minimum = 15
maximum = int(np.sqrt(length))

hyperparameters = itertools.product(EPS, MIN_SAMPLES, COLUMNS)
for i, (eps, min_samples, columns) in enumerate(hyperparameters):
    header = f"metric: {METRIC}, eps: {eps}, min_samples: {min_samples}"
    print(header, "-" * len(header), f"Columns: {columns}", sep="\n")

    X_train = preprocessed[columns].values
    model = DBSCAN(eps=eps, min_samples=min_samples, metric=METRIC).fit(X_train)

    # DBSCAN marks noise with the label -1; it is counted separately and does not
    # count as a cluster.
    labels = model.labels_
    is_noise = labels == -1
    n_noise = int(is_noise.sum())
    n_clusters = len(np.unique(labels[~is_noise]))

    if n_clusters < minimum or n_clusters > maximum:
        print("Number of clusters is outside allowed range")
        print(f"Skipping, condition not met: {minimum} <= {n_clusters} <= {maximum}\n")
        continue

    score = metrics.silhouette_score(X_train, labels)

    print(f"Estimated number of clusters: {n_clusters}")
    print(f"Estimated number of noise points: {n_noise}")
    print(f"Silhouette coefficient: {score:.3f}\n")

    # The run index `i` keeps file names unique across feature sets that share the
    # same eps / min_samples.
    dst = f"../output/d-modeling/{METRIC}-{eps:.2f}-{min_samples}-{i}"
    title = (
        f"n_clusters: {n_clusters}, n_noise: {n_noise}, score: {score:.3f}<br>"
        f"columns: {columns}"
    )

    plot(raw, labels, dst, title)

metric: euclidean, eps: 0.1, min_samples: 100
---------------------------------------------
Columns: ['latitude', 'longitude']
Number of clusters is outside allowed range
Skipping, condition not met: 15 <= 1 <= 140

metric: euclidean, eps: 0.1, min_samples: 100
---------------------------------------------
Columns: ['latitude', 'longitude', 'br', 'km', 'municipio']
Estimated number of clusters: 48
Estimated number of noise points: 7645
Silhouette coefficient: 0.244

metric: euclidean, eps: 0.1, min_samples: 250
---------------------------------------------
Columns: ['latitude', 'longitude']
Number of clusters is outside allowed range
Skipping, condition not met: 15 <= 1 <= 140

metric: euclidean, eps: 0.1, min_samples: 250
---------------------------------------------
Columns: ['latitude', 'longitude', 'br', 'km', 'municipio']
Number of clusters is outside allowed range
Skipping, condition not met: 15 <= 13 <= 140



In [None]:
# Build final model: trechos
# --------------------------------------------------------------------------------------
# Rationale for the chosen hyperparameters:
# - adding br, km and municipio helps the diversity of clusters
# - eps around 0.1 looks like a good choice
# - min_samples around 50 looks like a good choice

METRIC = "euclidean"
EPS = 0.1
MIN_SAMPLES = 50
COLUMNS = ["latitude", "longitude", "br", "km", "municipio"]

length = len(preprocessed)

header = f"metric: {METRIC}, eps: {EPS}, min_samples: {MIN_SAMPLES}"
print(header, "-" * len(header), sep="\n")

X_train = preprocessed[COLUMNS].values
model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, metric=METRIC).fit(X_train)

# DBSCAN marks noise with the label -1; it is counted separately and does not count
# as a cluster.
labels = model.labels_
is_noise = labels == -1
n_noise = int(is_noise.sum())
n_clusters = len(np.unique(labels[~is_noise]))
score = metrics.silhouette_score(X_train, labels)

title = f"n_clusters: {n_clusters}, n_noise: {n_noise}, score: {score:.3f}"

print(f"Columns: {COLUMNS}")
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")
print(f"Silhouette coefficient: {score:.3f}\n")

# Save dataset with cluster column to csv
# --------------------------------------------------------------------------------------
# The labels are attached to `raw` in place, then the whole frame is exported.

raw["cluster"] = labels
raw.to_csv(f"../data/d-modeling/{YEAR}-trechos.csv", index=False)

In [None]:
# Build final model: regioes
# --------------------------------------------------------------------------------------

# NOTE: avoid duplicated code here
# TODO: the hyperparameters for this model have not been chosen yet; the None values
# below are placeholders.

EPS = None
MIN_SAMPLES = None
COLUMNS = None
METRIC = "euclidean"

# Fail early with an actionable message: with the placeholders left in, the cell would
# otherwise stop at `preprocessed[COLUMNS]` with an error that does not point at the
# real cause.
if EPS is None or MIN_SAMPLES is None or COLUMNS is None:
    raise ValueError(
        "EPS, MIN_SAMPLES and COLUMNS must be set before building the 'regioes' model"
    )

length = len(preprocessed)

header = f"metric: {METRIC}, eps: {EPS}, min_samples: {MIN_SAMPLES}"
print(header)
print("-" * len(header))

X_train = preprocessed[COLUMNS].values

model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, metric=METRIC)
model.fit(X_train)

# Number of clusters in labels, ignoring noise (label -1) if present
labels = model.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
score = metrics.silhouette_score(X_train, labels)

title = f"n_clusters: {n_clusters}, n_noise: {n_noise}, score: {score:.3f}"

# Report the columns this model was fit on (COLUMNS), not the `columns` loop variable
# left over from the hyperparameter search cell.
print(f"Columns: {COLUMNS}")
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")
print(f"Silhouette coefficient: {score:.3f}\n")

# Save dataset with cluster column to csv
# --------------------------------------------------------------------------------------

raw["cluster"] = model.labels_
raw.to_csv(f"../data/d-modeling/{YEAR}-regioes.csv", index=False)