In [None]:
# MODELING
# --------------------------------------------------------------------------------------

# Select modeling techniques: Determine which algorithms to try (e.g. regression, neural
# net).
# 
# Generate test design: Depending on your modeling approach, you might need to split
# the data into training, test, and validation sets.
# 
# Build model: As glamorous as this might sound, this might just be executing a few
# lines of code like “reg = LinearRegression().fit(X, y)”.
# 
# Assess model: Generally, multiple models are competing against each other, and the
# data scientist needs to interpret the model results based on domain knowledge, the
# pre-defined success criteria, and the test design.

In [None]:
# CONCLUSIONS
# --------------------------------------------------------------------------------------

# - ...

In [2]:
import os
import itertools
import shutil

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn import metrics
from sklearn.cluster import DBSCAN

# Show every column when a DataFrame is displayed
pd.set_option("display.max_columns", None)
YEAR = "2021"

# Start every run from an empty output directory. The existence check keeps a
# fresh checkout (where the directory was never created) from failing in rmtree.
output_dir = "../output/d-modeling"
if os.path.isdir(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir, exist_ok=True)
os.makedirs("../data/d-modeling", exist_ok=True)

# Load the raw and the preprocessed datasets for YEAR from the
# c-data-preparation data folder
file = f"../data/c-data-preparation/{YEAR}-raw.csv"
raw = pd.read_csv(file)

file = f"../data/c-data-preparation/{YEAR}-preprocessed.csv"
preprocessed = pd.read_csv(file)

In [28]:
# Plotting clusters on a map
# --------------------------------------------------------------------------------------

def plot(df, labels, dst, title):
    """Draw clustered points on an OpenStreetMap layer and save the map as HTML.

    One marker trace is added per cluster, each in its own colour. The figure is
    written to disk only; it is not displayed inline.

    Args:
        df: DataFrame with ``latitude`` and ``longitude`` columns, one row per
            entry of ``labels``.
        labels: Cluster label for each row of ``df`` (array-like). The label
            ``-1`` is treated as noise and is not drawn.
        dst: Output path without extension; ``.html`` is appended.
        title: Title placed on the figure.
    """
    # Accept plain lists too: the element-wise comparison below needs an ndarray
    labels = np.asarray(labels)

    # Noise is never drawn, so it must not use up a colour either
    cluster_ids = [label for label in np.unique(labels) if label != -1]

    # Hue is circular (0 and 360 are the same colour), so the end point is
    # excluded to keep the first and last cluster distinguishable
    hues = np.linspace(0, 360, len(cluster_ids), endpoint=False)

    fig = go.Figure()

    for cluster_label, hue in zip(cluster_ids, hues):
        cluster_data = df[labels == cluster_label]

        scatter = go.Scattermapbox(
            lat=cluster_data.latitude,
            lon=cluster_data.longitude,
            mode="markers",
            marker=dict(size=7.5, color=f"hsl({hue:.1f},50%,50%)"),
            text=f"{cluster_label}",
            showlegend=False,
        )

        fig.add_trace(scatter)

    fig.update_layout(
        title=title,
        title_x=0.95,
        title_y=0.1,
        mapbox_style="open-street-map",
        mapbox=dict(center=dict(lat=-28, lon=-52), zoom=5.5),
        margin={"l": 0, "r": 0, "t": 0, "b": 0},
    )

    fig.write_html(f"{dst}.html")

In [11]:
# Elbow method to determine the optimal number of clusters
# --------------------------------------------------------------------------------------

import plotly.graph_objects as go
import plotly.subplots as sp
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Features used for clustering
columns = ["latitude", "longitude"]
X = preprocessed[columns]

# Candidate cluster counts: from kmin up to (but not including) the integer
# square root of the number of rows
length = len(preprocessed)
kmin = 2
kmax = int(np.sqrt(length))
ks = range(kmin, kmax)

# One inertia value and one silhouette score is collected per candidate k
inertia, silhouette = [], []

for step, k in enumerate(ks, start=1):
    print(f"[{step}] n_clusters: {k}")

    model = KMeans(n_clusters=k, random_state=42).fit(X)

    inertia.append(model.inertia_)
    silhouette.append(silhouette_score(X, model.labels_))

[1] n_clusters: 2
[2] n_clusters: 3
[3] n_clusters: 4
[4] n_clusters: 5
[5] n_clusters: 6
[6] n_clusters: 7
[7] n_clusters: 8
[8] n_clusters: 9
[9] n_clusters: 10
[10] n_clusters: 11
[11] n_clusters: 12
[12] n_clusters: 13
[13] n_clusters: 14
[14] n_clusters: 15
[15] n_clusters: 16
[16] n_clusters: 17
[17] n_clusters: 18
[18] n_clusters: 19
[19] n_clusters: 20
[20] n_clusters: 21
[21] n_clusters: 22
[22] n_clusters: 23
[23] n_clusters: 24
[24] n_clusters: 25
[25] n_clusters: 26
[26] n_clusters: 27
[27] n_clusters: 28
[28] n_clusters: 29
[29] n_clusters: 30
[30] n_clusters: 31
[31] n_clusters: 32
[32] n_clusters: 33
[33] n_clusters: 34
[34] n_clusters: 35
[35] n_clusters: 36
[36] n_clusters: 37
[37] n_clusters: 38
[38] n_clusters: 39
[39] n_clusters: 40
[40] n_clusters: 41
[41] n_clusters: 42
[42] n_clusters: 43
[43] n_clusters: 44
[44] n_clusters: 45
[45] n_clusters: 46
[46] n_clusters: 47
[47] n_clusters: 48
[48] n_clusters: 49
[49] n_clusters: 50
[50] n_clusters: 51
[51] n_clusters: 

In [27]:
# Elbow method to determine the optimal number of clusters
# --------------------------------------------------------------------------------------

# Two side-by-side panels: inertia on the left, silhouette score on the right
titles = ("Inertia vs Number of Clusters", "Silhouette Score vs Number of Clusters")
fig = sp.make_subplots(rows=1, cols=2, subplot_titles=titles)

cluster_counts = list(ks)
panels = [("Inertia", inertia), ("Silhouette Score", silhouette)]

# Each panel gets one line trace and shares the same x-axis label; the series
# name doubles as the y-axis label
for col, (name, values) in enumerate(panels, start=1):
    trace = go.Scatter(x=cluster_counts, y=values, mode="lines+markers", name=name)
    fig.add_trace(trace, row=1, col=col)
    fig.update_xaxes(title_text="Number of Clusters", row=1, col=col)
    fig.update_yaxes(title_text=name, row=1, col=col)

fig.update_layout(title="K-Means Clustering Evaluation: Elbow Method", showlegend=False)

fig.show()

In [25]:
# Build top 3 models according to silhouette score
# --------------------------------------------------------------------------------------

# Hand-picked cluster counts to build. Kept under its own name so the sweep range
# `ks` (used as the x-axis of the elbow figure) is not overwritten when cells are
# re-run out of order.
top_ks = [48, 57, 70]

# The feature matrix does not depend on k, so it is built once outside the loop
X_top = preprocessed[columns].values

for i, k in enumerate(top_ks):
    header = f"k: {k}"
    print(header)
    print("-" * len(header))
    print(f"Columns: {columns}")

    model = KMeans(n_clusters=k, random_state=42)
    model.fit(X_top)

    # Cluster assignment per row and the resulting silhouette coefficient
    labels = model.labels_
    score = metrics.silhouette_score(X_top, labels)

    # NOTE(review): the "knn" prefix is kept so existing output paths stay the
    # same, although the model fitted here is K-Means
    dst = f"../output/d-modeling/knn-{k}-{i}"
    title = (
        f"k: {k}, score: {score:.3f}<br>"
        f"columns: {columns}"
    )

    print(f"Silhouette coefficient: {score:.3f}\n")

    # Labels come from `preprocessed` but are drawn using `raw` coordinates;
    # assumes both frames are row-aligned - TODO confirm
    plot(raw, labels, dst, title)

k: 48
-----
Columns: ['latitude', 'longitude']
Silhouette coefficient: 0.572

k: 57
-----
Columns: ['latitude', 'longitude']
Silhouette coefficient: 0.575

k: 70
-----
Columns: ['latitude', 'longitude']
Silhouette coefficient: 0.579



In [None]:
# TODO: Build final model: regioes
# --------------------------------------------------------------------------------------

EPS = None
MIN_SAMPLES = None
COLUMNS = None
METRIC = "euclidean"

# Fail fast with a readable message while the TODO placeholders are still unset,
# instead of an obscure error from the column lookup or from DBSCAN
if EPS is None or MIN_SAMPLES is None or COLUMNS is None:
    raise ValueError("Set EPS, MIN_SAMPLES and COLUMNS before building the final model")

length = len(preprocessed)

header = f"metric: {METRIC}, eps: {EPS}, min_samples: {MIN_SAMPLES}"
print(header)
print("-" * len(header))

X_train = preprocessed[COLUMNS].values

model = DBSCAN(eps=EPS, min_samples=MIN_SAMPLES, metric=METRIC)
model.fit(X_train)

# Number of clusters in labels, ignoring noise if present
labels = model.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)

# The silhouette coefficient is only defined for 2..n_samples-1 distinct labels;
# report NaN otherwise so the CSV export below still runs. The labels are passed
# unfiltered, so noise points (-1) are scored like any other label.
n_labels = len(set(labels))
if 2 <= n_labels <= len(labels) - 1:
    score = metrics.silhouette_score(X_train, labels)
else:
    score = float("nan")

title = f"n_clusters: {n_clusters}, n_noise: {n_noise}, score: {score:.3f}"

# Report the columns this model was actually trained on (COLUMNS), not the
# `columns` variable left over from the K-Means cells
print(f"Columns: {COLUMNS}")
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")
print(f"Silhouette coefficient: {score:.3f}\n")

# Save dataset with cluster column to csv
# --------------------------------------------------------------------------------------

# Labels come from `preprocessed` but are attached to `raw`;
# assumes both frames are row-aligned - TODO confirm
raw["cluster"] = model.labels_
raw.to_csv(f"../data/d-modeling/{YEAR}-regioes.csv", index=False)