In [None]:
# EVALUATION
# --------------------------------------------------------------------------------------

# Evaluate results: Do the models meet the business success criteria? Which one(s)
# should we approve for the business?
# 
# Review process: Review the work accomplished. Was anything overlooked? Were all steps
# properly executed? Summarize findings and correct anything if needed.
# 
# Determine next steps: Based on the previous three tasks, determine whether to proceed
# to deployment, iterate further, or initiate new projects.

In [None]:
# CONCLUSIONS
# --------------------------------------------------------------------------------------

# - ...

In [38]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Year of the dataset analysed in this notebook (used to build the input file name)
YEAR = "2021"

# Create the evaluation-stage folders if they are missing. The paths are constant, so
# plain string literals are used instead of placeholder-less f-strings.
for directory in ("../output/e-evaluation", "../data/e-evaluation"):
    os.makedirs(directory, exist_ok=True)

# Show every column and keep wide frames on a single line when printing
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)

In [39]:
# Read data and convert to the appropriate types
# --------------------------------------------------------------------------------------

file = f"../data/d-modeling/{YEAR}-regioes.csv"

# Columns grouped by the dtype they are cast to after loading.
# "id" (Int64), "data_inversa" and "horario" (datetime64[ns]) are intentionally left
# out of the conversion, as they were disabled in the original mapping.
category_columns = [
    "dia_semana",
    "uf",
    "br",
    "municipio",
    "causa_acidente",
    "tipo_acidente",
    "classificacao_acidente",
    "fase_dia",
    "sentido_via",
    "condicao_metereologica",
    "tipo_pista",
    "tracado_via",
    "uso_solo",
    "regional",
    "delegacia",
    "uop",
]
integer_columns = [
    "pessoas",
    "mortos",
    "feridos_leves",
    "feridos_graves",
    "ilesos",
    "ignorados",
    "feridos",
    "veiculos",
    "ano",
    "dia",
    "mes",
    "hora",
    "minuto",
]
float_columns = ["km", "latitude", "longitude"]

# Column name -> target dtype, consumed by DataFrame.astype below
dtypes = {
    **dict.fromkeys(category_columns, "category"),
    **dict.fromkeys(integer_columns, "Int64"),
    **dict.fromkeys(float_columns, "float64"),
}

# Read with pandas' inferred types first, then cast the listed columns in one step
df = pd.read_csv(file).astype(dtypes)

print(df.head())

    dia_semana  uf     br     km              municipio                                     causa_acidente              tipo_acidente classificacao_acidente     fase_dia  sentido_via condicao_metereologica tipo_pista    tracado_via uso_solo  pessoas  mortos  feridos_leves  feridos_graves  ilesos  ignorados  feridos  veiculos   latitude  longitude regional delegacia             uop   ano  mes  dia  hora  minuto  cluster
0  sexta-feira  PR  277.0   51.3   SAO JOSE DOS PINHAIS                                 Pista Escorregadia  Saída de leito carroçável    Com Vítimas Feridas    Pleno dia  Decrescente         Garoa/Chuvisco      Dupla          Curva      Não        4       0              1               0       3          0        1         3 -25.595160 -48.907008  SPRF-PR  DEL01-PR  UOP05-DEL01-PR  2021    1    1    15      45       30
1  sexta-feira  SC  470.0   79.1                INDAIAL                             Transitar na contramão            Colisão frontal     Com Vítimas Fata

In [40]:
# Compute severity of each row
# --------------------------------------------------------------------------------------

# TODO: add this to preprocessing

# Define a function to determine the severity based on the given conditions
def get_severity(row):
    """Return the severity weight of a single accident record.

    The victim counts are checked first, from the worst outcome to the mildest:
    any death -> 13, any serious injury -> 7, any light injury -> 3,
    any unharmed person -> 1.

    When none of the counts is positive (or they are missing) the textual
    ``classificacao_acidente`` is used as a fallback:
    "Com Vítimas Fatais" -> 13, "Com Vítimas Feridas" -> 5, "Sem Vítimas" -> 1,
    anything else -> 0.

    Parameters
    ----------
    row : object exposing ``mortos``, ``feridos_graves``, ``feridos_leves``,
        ``ilesos`` and ``classificacao_acidente`` as attributes (for example a
        row passed by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        One of 0, 1, 3, 5, 7 or 13.
    """

    def _positive(value):
        # The count columns are nullable integers, so a missing count shows up as
        # pd.NA and `pd.NA > 0` cannot be used in an `if`. Treat a missing count
        # as "no information" so the classification fallback can still be used.
        return not pd.isna(value) and value > 0

    if _positive(row.mortos):
        return 13
    if _positive(row.feridos_graves):
        return 7
    if _positive(row.feridos_leves):
        return 3
    if _positive(row.ilesos):
        return 1

    # No usable victim counts: fall back to the recorded classification
    if row.classificacao_acidente == "Com Vítimas Fatais":
        return 13
    if row.classificacao_acidente == "Com Vítimas Feridas":
        return 5
    if row.classificacao_acidente == "Sem Vítimas":
        return 1
    return 0

# Score every accident row-wise and store the result in the "severidade" column,
# then show how many rows received each severity weight
df["severidade"] = df.apply(get_severity, axis="columns")
df["severidade"].value_counts()

3     10999
7      3832
1      3739
13     1061
Name: severidade, dtype: int64

In [61]:
# Compute cluster metrics
# --------------------------------------------------------------------------------------

# Separate columns by data type. The non-numeric columns are kept in the frame's own
# column order: collecting them in a set made the column layout of `metrics` change
# from one kernel session to the next.
numericals = df.select_dtypes(include="number").columns
categoricals = [col for col in df.columns if col not in numericals]


def _mode_or_na(values):
    """Return the most frequent value of a group, or NA if the group has no values."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else pd.NA


# Define aggregation functions based on column type:
# mean for numeric columns, most frequent value for the others
aggregation = {col: "mean" for col in numericals if col != "cluster"}
aggregation.update({col: _mode_or_na for col in categoricals})

# Apply aggregation functions
clusters = df.groupby("cluster")
metrics = clusters.agg(aggregation).reset_index()

# Attach the number of accidents per cluster, matched on the cluster label itself
# rather than on row position
metrics["size"] = metrics["cluster"].map(clusters.size())

# Drop columns that won't be investigated
to_drop = ["latitude", "longitude", "ano", "km", "br", "municipio", "sentido_via", "uso_solo", "regional", "delegacia", "uop", "pessoas", "mortos", "feridos_leves", "feridos_graves", "ilesos", "ignorados", "feridos", "veiculos", "minuto", "uf", "classificacao_acidente"]
metrics = metrics.drop(columns=to_drop)

# Sort metrics
# ======================================================================================
# NOTE: normalize by severidade / kilometers
# Road length covered by a cluster: for every road ("br") present in the cluster take
# the span between its lowest and highest km marker, then add the spans up.
# observed=True restricts the grouping to (cluster, br) pairs that actually occur;
# "br" is categorical, so otherwise every cluster x road combination is evaluated.
km_by_road = df.groupby(["cluster", "br"], observed=True)["km"]
kms = (
    (km_by_road.max() - km_by_road.min())
    .groupby(level="cluster")
    .sum()
    .reset_index(name="kilometers")
)

# Match on the cluster label; the assignment aligns on keys, so no sorting is needed
metrics["kilometers"] = metrics["cluster"].map(kms.set_index("cluster")["kilometers"])
# NOTE(review): a cluster whose accidents all share one km marker has 0 kilometers,
# which yields an infinite ratio and puts it at the top of the ranking - confirm that
# this is the intended treatment.
metrics = metrics.assign(ratio=lambda x: x.severidade / x.kilometers)
metrics = metrics.sort_values(by="ratio", ascending=False)

# NOTE: normalize by severidade
# metrics = metrics.sort_values(by="severidade", ascending=False)
# ======================================================================================

# Sort the columns: numericals first, categoricals later
numericals = metrics.select_dtypes(include="number").columns
categoricals = [col for col in metrics.columns if col not in numericals]
columns = list(numericals) + categoricals
metrics = metrics[columns]

# Labels of the highest-ranked clusters, reused by the plots further down
top3 = metrics.cluster.head(3).to_list()
top6 = metrics.cluster.head(6).to_list()

print(metrics.head(10))

    cluster       mes        dia       hora  severidade  size  kilometers     ratio   dia_semana condicao_metereologica tipo_pista   fase_dia tracado_via                                     causa_acidente              tipo_acidente
29       29  6.520408  14.908163  13.081633    4.367347    98       111.0  0.039345  sexta-feira              Céu Claro      Dupla  Pleno dia        Reta           Reação tardia ou ineficiente do condutor  Saída de leito carroçável
15       15  6.335593  14.715254  13.015254    3.976271   590       110.5  0.035984       sábado              Céu Claro      Dupla  Pleno dia        Reta           Reação tardia ou ineficiente do condutor           Colisão traseira
46       46  6.239787  15.596803  13.420959    4.172291   563       124.1  0.033620  sexta-feira              Céu Claro    Simples  Pleno dia        Reta                     Ausência de reação do condutor        Colisão transversal
47       47  6.635514  16.495327  12.757009    4.214953   107       137.

In [52]:
# Sort metrics
# --------------------------------------------------------------------------------------

# NOTE(review): this cell repeats the ranking already performed at the end of the
# "Compute cluster metrics" cell; consider keeping only one copy.

# NOTE: normalize by severidade / kilometers
# ======================================================================================
# Group by "cluster" and "br" and select the "km" column. observed=True restricts the
# grouping to (cluster, br) pairs that actually occur ("br" is categorical).
km_by_road = df.groupby(["cluster", "br"], observed=True)["km"]
kms = (
    # Within each (cluster, br) group, the stretch of road covered is the difference
    # between the highest and the lowest km marker
    (km_by_road.max() - km_by_road.min())
    # Add the stretches of all roads that belong to the same cluster
    .groupby(level="cluster")
    .sum()
    .reset_index(name="kilometers")
)

# Match on the cluster label; the assignment aligns on keys, so no sorting is needed
metrics["kilometers"] = metrics["cluster"].map(kms.set_index("cluster")["kilometers"])
metrics = metrics.assign(ratio=lambda x: x.severidade / x.kilometers)
metrics = metrics.sort_values(by="ratio", ascending=False)

# NOTE: normalize by severidade
# ======================================================================================
# metrics = metrics.sort_values(by="severidade", ascending=False)

# Sort the columns: numeric ones first, the rest afterwards. The split is computed
# from `metrics` itself so this cell does not rely on variables left behind by
# another cell.
numeric_columns = metrics.select_dtypes(include="number").columns
other_columns = [col for col in metrics.columns if col not in numeric_columns]
sorted_columns = list(numeric_columns) + other_columns
metrics = metrics[sorted_columns]

top3 = metrics.cluster.head(3).to_list()
top6 = metrics.cluster.head(6).to_list()

print(metrics.head(10))

    cluster       mes        dia       hora  severidade  size  kilometers     ratio   dia_semana condicao_metereologica tipo_pista   fase_dia tracado_via                                     causa_acidente              tipo_acidente
29       29  6.520408  14.908163  13.081633    4.367347    98       111.0  0.039345  sexta-feira              Céu Claro      Dupla  Pleno dia        Reta           Reação tardia ou ineficiente do condutor  Saída de leito carroçável
15       15  6.335593  14.715254  13.015254    3.976271   590       110.5  0.035984       sábado              Céu Claro      Dupla  Pleno dia        Reta           Reação tardia ou ineficiente do condutor           Colisão traseira
46       46  6.239787  15.596803  13.420959    4.172291   563       124.1  0.033620  sexta-feira              Céu Claro    Simples  Pleno dia        Reta                     Ausência de reação do condutor        Colisão transversal
47       47  6.635514  16.495327  12.757009    4.214953   107       137.

In [None]:
# OVERVIEW
# --------------------------------------------------------------------------------------

# - This table provides an overview of the high risk areas
# - We can see that the conditions in which accidents occur are similar
# - From now on, the top 3 clusters will be investigated in more detail

In [65]:
# Plotting clusters on a map
# --------------------------------------------------------------------------------------

import plotly.graph_objects as go

# Clusters to draw: the six highest-ranked ones
cluster_labels = top6
num_clusters = len(cluster_labels)

labels = df.cluster
# One hue per cluster, evenly spread around the colour wheel. Hue is circular (0 and
# 360 degrees are the same red), so the endpoint is excluded; otherwise the first and
# the last cluster would be drawn in the same colour.
color_scale = [
    f"hsl({hue},50%,50%)"
    for hue in np.linspace(0, 360, num_clusters, endpoint=False)
]
fig = go.Figure()

for cluster_label, color in zip(cluster_labels, color_scale):
    # Skip noise
    if cluster_label == -1:
        continue

    cluster_mask = labels == cluster_label
    cluster_data = df[cluster_mask]

    # One trace per cluster so that each cluster gets its own colour
    scatter = go.Scattermapbox(
        lat=cluster_data.latitude,
        lon=cluster_data.longitude,
        mode="markers",
        marker=dict(size=7.5, color=color),
        # text=f"({cluster_label}) {cluster_data.causa_acidente}",
        text=f"{cluster_label}",
        showlegend=False,
    )

    fig.add_trace(scatter)

# Fixed initial map centre and zoom; no margins around the map
fig.update_layout(
    mapbox_style="open-street-map",
    mapbox=dict(center=dict(lat=-28, lon=-52), zoom=4.5),
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)

fig.show()

In [68]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# One heatmap per top-6 cluster, laid out on a 2 x 3 grid:
# time of day ("fase_dia") against weather ("condicao_metereologica")
titles = [f"Cluster {label}" for label in top6]
fig = make_subplots(rows=2, cols=3, subplot_titles=titles, vertical_spacing=0.35)

for i, cluster_num in enumerate(top6):
    # Position on the grid, filled row by row (0-based here, 1-based for plotly)
    grid_row, grid_col = divmod(i, 3)

    cluster = df.loc[df.cluster == cluster_num]
    crosstab = pd.crosstab(cluster.fase_dia, cluster.condicao_metereologica)

    heatmap = go.Heatmap(
        x=crosstab.columns,
        y=crosstab.index,
        z=crosstab.values,
        text=crosstab.values,
        colorscale="Viridis",
    )
    fig.add_trace(heatmap, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    title="Crosstabulation Heatmaps",
    # xaxis=dict(title="condicao_metereologica"),
    # yaxis=dict(title="fase_dia"),
    showlegend=False,
)

fig.show()


In [69]:
# One bar chart per top-6 cluster, laid out on a 2 x 3 grid:
# the five most frequent accident types ("tipo_acidente") of the cluster
titles = [f"Cluster {label}" for label in top6]
fig = make_subplots(rows=2, cols=3, subplot_titles=titles, vertical_spacing=0.5)

# Same bar colour in every subplot
color = "rgb(31, 119, 180)"

for i, cluster_num in enumerate(top6):
    # Position on the grid, filled row by row (0-based here, 1-based for plotly)
    grid_row, grid_col = divmod(i, 3)

    cluster = df.loc[df.cluster == cluster_num]
    top_acidentes = cluster["tipo_acidente"].value_counts().head(5)

    bar_plot = go.Bar(
        x=top_acidentes.index,
        y=top_acidentes.values,
        text=top_acidentes.values,
        marker_color=color,
    )
    fig.add_trace(bar_plot, row=grid_row + 1, col=grid_col + 1)

fig.update_layout(
    # title="Top 5 Tipo de Acidente",
    # xaxis=dict(title="Tipo de Acidente"),
    # yaxis=dict(title="Número de Ocorrências"),
    showlegend=False,
)

fig.show()

In [55]:
# Later
# --------------------------------------------------------------------------------------

# https://plotly.com/python/colorscales/

import plotly.express as px
# Scratch example on plotly's bundled iris sample. It is stored under its own name:
# assigning it to `df` would replace the accident data that the other cells read.
iris = px.data.iris()
# Discrete three-band colour scale: each third of the range maps to a single colour
fig = px.parallel_coordinates(iris, color="species_id",
                             color_continuous_scale=[(0.00, "red"),   (0.33, "red"),
                                                     (0.33, "green"), (0.66, "green"),
                                                     (0.66, "blue"),  (1.00, "blue")])
fig.show()