In [31]:
# EVALUATION
# --------------------------------------------------------------------------------------

# Evaluate results: Do the models meet the business success criteria? Which one(s)
# should we approve for the business?
# 
# Review process: Review the work accomplished. Was anything overlooked? Were all steps
# properly executed? Summarize findings and correct anything if needed.
# 
# Determine next steps: Based on the previous three tasks, determine whether to proceed
# to deployment, iterate further, or initiate new projects.

In [32]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import utils

# Show every column of wide frames on a single, unwrapped row when printing.
for _option, _value in (
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
):
    pd.set_option(_option, _value)

# Make sure the folders this notebook writes to exist (no error if they already do).
for _folder in ("output", "data"):
    os.makedirs(f"../{_folder}/e-evaluation", exist_ok=True)

# Year of the dataset under evaluation; used to build every input/output path below.
YEAR = "2022"

In [33]:
# Load the clustered accidents produced by the modeling step and cast each column
# to its working dtype
# --------------------------------------------------------------------------------------

file = f"../data/d-modeling/{YEAR}-regioes.csv"

# Columns grouped by target dtype. "id", "data_inversa" and "horario" are
# intentionally left out of the cast (they were commented out of the mapping).
# Any column not listed here (e.g. "cluster") keeps the dtype inferred by read_csv.
dtypes = {
    column: dtype
    for dtype, columns in (
        (
            "category",
            (
                "dia_semana",
                "uf",
                "br",
                "municipio",
                "causa_acidente",
                "tipo_acidente",
                "classificacao_acidente",
                "fase_dia",
                "sentido_via",
                "condicao_metereologica",
                "tipo_pista",
                "tracado_via",
                "uso_solo",
                "regional",
                "delegacia",
                "uop",
            ),
        ),
        (
            "float64",
            ("km", "latitude", "longitude"),
        ),
        (
            # Nullable integer dtype, so missing counts do not force a float cast
            "Int64",
            (
                "pessoas",
                "mortos",
                "feridos_leves",
                "feridos_graves",
                "ilesos",
                "ignorados",
                "feridos",
                "veiculos",
                "ano",
                "dia",
                "mes",
                "hora",
                "minuto",
            ),
        ),
    )
    for column in columns
}

# Read first, cast afterwards: the conversion is applied to the parsed values
df = pd.read_csv(file).astype(dtypes)

print(df.head())

  dia_semana  uf     br     km              municipio                                     causa_acidente                   tipo_acidente classificacao_acidente   fase_dia  sentido_via condicao_metereologica tipo_pista    tracado_via uso_solo  pessoas  mortos  feridos_leves  feridos_graves  ilesos  ignorados  feridos  veiculos   latitude  longitude regional delegacia             uop   ano  mes  dia  hora  minuto  cluster
0     sábado  PR  116.0   33.0  CAMPINA GRANDE DO SUL                   Ingestão de álcool pelo condutor                      Tombamento     Com Vítimas Fatais  Pleno dia  Decrescente                Nublado      Dupla          Curva      Não        2       1              1               0       0          0        1         1 -25.114403 -48.846755  SPRF-PR  DEL01-PR  UOP02-DEL01-PR  2022    1    1     2      40       27
1     sábado  SC  163.0   80.1             GUARACIABA                     Ausência de reação do condutor                 Colisão frontal     Com Vítimas

In [34]:
# Collapse high-cardinality features and derive the severidade score
# --------------------------------------------------------------------------------------

# TODO: move this step into preprocessing

# assign() evaluates its keyword arguments in order, so severidade is computed on
# rows that already carry the regrouped causa_acidente.
df = df.assign(
    causa_acidente=lambda frame: frame["causa_acidente"].apply(utils.categorizar_causa),
    # tipo_acidente is deliberately left untouched for now:
    # tipo_acidente=lambda frame: frame["tipo_acidente"].apply(utils.categorizar_tipo),
    severidade=lambda frame: frame.apply(utils.categorizar_severidade, axis=1),
)

In [35]:
# OVERVIEW
# ======================================================================================

In [36]:
# Map with every cluster found by the modeling step
# --------------------------------------------------------------------------------------

all_cluster_ids = np.unique(df["cluster"])
dst = "../output/e-evaluation/" + YEAR + "-clusters-map"
# NOTE(review): the meaning of the last positional argument (None) is defined in
# utils.plot_map — check there before changing it.
utils.plot_map(df, all_cluster_ids, dst, None)

In [37]:
# Compute cluster metrics
# --------------------------------------------------------------------------------------
# Builds one row per cluster (summed severidade, size, most frequent category of each
# categorical feature), ranks clusters by severidade per kilometer of road covered,
# and exposes the best-ranked clusters as top3 / top6 / c1..c6 for the cells below.


def _most_frequent(values):
    """Return the first entry of ``values.mode()`` (the most frequent value)."""
    return values.mode().iloc[0]


# Separate columns by data type. Ordered lists are used instead of sets so the
# column order (and therefore the exported CSV) is the same on every kernel run.
numericals = df.select_dtypes(include="number").columns
categoricals = [col for col in df.columns if col not in numericals]

# Define aggregation functions based on column type: severidade is summed, every
# other numeric column is averaged, non-numeric columns keep their mode.
aggregation = {
    col: "sum" if col == "severidade" else "mean"
    for col in numericals
    if col != "cluster"
}
aggregation.update({col: _most_frequent for col in categoricals})

# Apply aggregation functions
clusters = df.groupby("cluster")
metrics = clusters.agg(aggregation).reset_index()

# Attach the number of accidents per cluster, matched on the cluster id rather
# than on row position.
metrics["size"] = metrics["cluster"].map(clusters.size())

# Drop columns that won't be investigated
to_drop = [
    "latitude",
    "longitude",
    "ano",
    "mes",
    "dia",
    "hora",
    "km",
    "br",
    "municipio",
    "sentido_via",
    "uso_solo",
    "regional",
    "delegacia",
    "uop",
    "pessoas",
    "mortos",
    "feridos_leves",
    "feridos_graves",
    "ilesos",
    "ignorados",
    "feridos",
    "veiculos",
    "minuto",
    "uf",
    "classificacao_acidente",
]
metrics = metrics.drop(columns=to_drop)

# Sort metrics
# ======================================================================================
# NOTE: normalize by severidade / kilometers
kms = (
    # Group by "cluster" and "br"; observed=True restricts the groups to the
    # (cluster, road) pairs that actually occur, since "br" is categorical
    df.groupby(["cluster", "br"], observed=True)["km"]
    # Within each group, the stretch of road covered is the distance between the
    # largest and the smallest "km" marker
    .apply(lambda x: abs(x.max() - x.min()))
    # Add up the stretches of all roads that cross the same cluster
    .groupby("cluster")
    .sum()
    .reset_index(name="kilometers")
)

# Match kilometers to metrics on the cluster id (not on row position)
metrics["kilometers"] = metrics["cluster"].map(kms.set_index("cluster")["kilometers"])
# NOTE(review): a cluster whose accidents all share one km marker has
# kilometers == 0, so its ratio is infinite and it sorts to the top — confirm
# that this is the intended ranking.
metrics = metrics.assign(ratio=lambda x: x.severidade / x.kilometers)
metrics = metrics.sort_values(by="ratio", ascending=False)

# NOTE: normalize by severidade
# metrics = metrics.sort_values(by="severidade", ascending=False)
# ======================================================================================

# Sort the columns: numericals first, categoricals later (both in a stable order)
numericals = metrics.select_dtypes(include="number").columns
categoricals = [col for col in metrics.columns if col not in numericals]
columns = list(numericals) + categoricals
metrics = metrics[columns]

# Ids of the best-ranked clusters, in ranking order
top6 = metrics.cluster.head(6).to_list()
top3 = top6[:3]

# Accidents of each of the six best-ranked clusters (c1 is the top one)
c1 = df[df.cluster == top6[0]]
c2 = df[df.cluster == top6[1]]
c3 = df[df.cluster == top6[2]]
c4 = df[df.cluster == top6[3]]
c5 = df[df.cluster == top6[4]]
c6 = df[df.cluster == top6[5]]

dst = f"../output/e-evaluation/{YEAR}-clusters-metrics.csv"
metrics.to_csv(dst, index=False)

print(metrics.head(10))

    cluster  severidade  size  kilometers      ratio     dia_semana tracado_via              tipo_acidente condicao_metereologica   fase_dia                               causa_acidente tipo_pista
4         4        5420  1368       188.6  28.738070         sábado        Reta           Colisão traseira              Céu Claro  Pleno dia  Comportamento do condutor: direção perigosa      Dupla
18       18        6782  1776       350.4  19.355023    sexta-feira        Reta           Colisão traseira              Céu Claro  Pleno dia  Comportamento do condutor: direção perigosa   Múltipla
25       25        2488   572       136.5  18.227106         sábado        Reta        Colisão transversal              Céu Claro  Pleno dia  Comportamento do condutor: direção perigosa    Simples
29       29        3555   917       197.7  17.981791        domingo        Reta        Colisão transversal              Céu Claro  Pleno dia  Comportamento do condutor: direção perigosa      Dupla
34       34    

In [38]:
# Map restricted to the three best-ranked clusters
# --------------------------------------------------------------------------------------

dst = "../output/e-evaluation/" + YEAR + "-clusters-map-top3"
utils.plot_map(df, top3, dst, None)

In [39]:
# EXPLORING TOP 3
# ======================================================================================

In [40]:
# Set dimensions for parallel plot and color palette
# --------------------------------------------------------------------------------------

def plot_parallel_categories(cluster, dimensions, colorscale):
    """Draw, save and show a parallel-categories plot for one cluster.

    Parameters
    ----------
    cluster : pandas.DataFrame
        Accidents of a single cluster. Must contain the columns listed in
        ``dimensions`` plus ``severidade`` and ``cluster``; the id in its first
        row names the output files. It is not modified.
    dimensions : list of str
        Columns drawn as the axes of the plot, in display order.
    colorscale : list
        Plotly continuous colour scale used for the binary severidade flag.

    Side effects
    ------------
    Writes ``../output/e-evaluation/{YEAR}-parallel-{cluster id}.html`` and
    ``.png`` (the PNG is rendered with the kaleido engine) and displays the
    figure. Reads the module-level ``YEAR``.
    """
    # Hand a private copy to the helper so the recoding below can never leak
    # back into the caller's frame.
    # NOTE(review): group_others presumably folds infrequent categories into a
    # single "others" level — see utils. TODO: group others or not?
    tmp = utils.group_others(cluster.copy())

    # Binary flag used for colouring: 1 when severidade is 12 or 7, else 0.
    tmp["severidade"] = tmp["severidade"].isin([12, 7]).astype("int64")

    fig = px.parallel_categories(
        tmp,
        dimensions,
        color="severidade",
        color_continuous_scale=colorscale,
    )

    # Golden-ratio aspect for the exported image
    ratio = (1 + 5 ** 0.5) / 2
    width = 1200
    height = int(width / ratio)

    dst = f"../output/e-evaluation/{YEAR}-parallel-{cluster['cluster'].iloc[0]}"
    fig.write_html(f"{dst}.html")
    fig.write_image(
        f"{dst}.png",
        format="png",
        engine="kaleido",
        width=width,
        height=height,
    )

    fig.show()

# Axes of the parallel-categories plots, in display order.
# Candidates currently left out: "dia_semana", "causa_acidente", "tipo_acidente".
dimensions = ["tipo_pista", "tracado_via", "condicao_metereologica", "fase_dia"]

# Two-stop colour scale for the binary severidade flag (0 -> first, 1 -> second)
low_colour, high_colour = "lightsteelblue", "mediumseagreen"
colorscale = [[0, low_colour], [1, high_colour]]

In [41]:
# Parallel categories for c1, the best-ranked cluster
# --------------------------------------------------------------------------------------

plot_parallel_categories(cluster=c1, dimensions=dimensions, colorscale=colorscale)

In [42]:
# Parallel categories for c2, the second best-ranked cluster
# --------------------------------------------------------------------------------------

plot_parallel_categories(cluster=c2, dimensions=dimensions, colorscale=colorscale)

In [43]:
# Parallel categories for c3, the third best-ranked cluster
# --------------------------------------------------------------------------------------

plot_parallel_categories(cluster=c3, dimensions=dimensions, colorscale=colorscale)

In [44]:
# Stacked bar plot: tipo acidente
# --------------------------------------------------------------------------------------
# One facet per top-3 cluster; each bar is an accident type, split by the binary
# severidade flag. Uses plotly.express (px) from the imports cell.

# Keep only the accidents of the top-3 clusters; copy so df itself is never edited
cluster = df[df.cluster.isin(top3)].copy()

# Order categories by their frequency in the top 1 cluster
category_orders = (
    cluster[cluster["cluster"] == top3[0]]["tipo_acidente"]
    .value_counts()
    .sort_values(ascending=False)
    .index.tolist()
)

# NOTE: rare categories are NOT grouped into "others" here (group_others), because
# the clusters do not share the same top categories and the facets would no longer
# be comparable.

# Categorize severidade: "1" when severidade is 12 or 7, "0" otherwise. The column
# is replaced as a whole instead of being written through .loc[:, ...], which would
# try to store strings inside the existing numeric column.
cluster["severidade"] = cluster["severidade"].isin([12, 7]).map({True: "1", False: "0"})

# Group by cluster, tipo_acidente, and severidade, and calculate count.
# observed=False keeps unused categories as zero-count rows, regardless of the
# pandas default.
cluster = (
    cluster.groupby(["cluster", "tipo_acidente", "severidade"], observed=False)
    .size()
    .reset_index(name="count")
)

# Order clusters based on top3 order
cluster_rank = {cluster_id: position for position, cluster_id in enumerate(top3)}
cluster = cluster.sort_values(by="cluster", key=lambda ids: ids.map(cluster_rank))

# Create a facetted bar plot using px.bar
fig = px.bar(
    cluster,
    x="count",
    y="tipo_acidente",
    color="severidade",
    color_discrete_map={"0": "lightsteelblue", "1": "mediumseagreen"},
    orientation="h",
    facet_col="cluster",
    height=600,
    category_orders={"tipo_acidente": category_orders},
    labels={"count": "Frequência", "tipo_acidente": "Tipo do acidente"},
    # title="",
)

# Facet titles come as "cluster=<id>"; show them as "Cluster <id>"
fig.for_each_annotation(lambda a: a.update(text=a.text.replace("=", " ").capitalize()))
fig.update_layout(margin=dict(l=0, r=0, t=60, b=0))

# Golden-ratio aspect for the exported image
ratio = (1 + 5 ** 0.5) / 2
width = 1200
height = int(width / ratio)

dst = f"../output/e-evaluation/{YEAR}-tipo-acidente"
fig.write_html(f"{dst}.html")
fig.write_image(
    f"{dst}.png",
    format="png",
    engine="kaleido",
    width=width,
    height=height,
)

fig.show()

In [45]:
# Stacked bar plot: causa acidente
# --------------------------------------------------------------------------------------
# One facet per top-3 cluster; each bar is an accident cause, split by the binary
# severidade flag. Uses plotly.express (px) from the imports cell.

# Keep only the accidents of the top-3 clusters; copy so df itself is never edited
cluster = df[df.cluster.isin(top3)].copy()

# Break the long "Comportamento do condutor: ..." labels over two lines (<br> is the
# line break understood by plotly) so the y axis stays readable
mapper = {
    "Comportamento do condutor: desrespeito às sinalizações": "Comportamento do condutor<br>desrespeito às sinalizações",
    "Comportamento do condutor: direção perigosa": "Comportamento do condutor<br>direção perigosa",
    "Comportamento do condutor: condições de saúde/consciência": "Comportamento do condutor<br>condições de saúde/consciência",
}
cluster["causa_acidente"] = cluster["causa_acidente"].replace(mapper, regex=True)

# Order categories by their frequency in the top 1 cluster
category_orders = (
    cluster[cluster["cluster"] == top3[0]]["causa_acidente"]
    .value_counts()
    .sort_values(ascending=False)
    .index.tolist()
)

# NOTE: rare categories are NOT grouped into "others" here (group_others), because
# the clusters do not share the same top categories and the facets would no longer
# be comparable.

# Categorize severidade: "1" when severidade is 12 or 7, "0" otherwise
cluster["severidade"] = cluster["severidade"].isin([12, 7]).map({True: "1", False: "0"})

# Group by cluster, causa_acidente, and severidade, and calculate count.
# observed=False keeps unused categories as zero-count rows, regardless of the
# pandas default.
cluster = (
    cluster.groupby(["cluster", "causa_acidente", "severidade"], observed=False)
    .size()
    .reset_index(name="count")
)

# Order clusters based on top3 order
cluster_rank = {cluster_id: position for position, cluster_id in enumerate(top3)}
cluster = cluster.sort_values(by="cluster", key=lambda ids: ids.map(cluster_rank))

# Create a facetted bar plot using px.bar
fig = px.bar(
    cluster,
    x="count",
    y="causa_acidente",
    color="severidade",
    color_discrete_map={"0": "lightsteelblue", "1": "mediumseagreen"},
    orientation="h",
    facet_col="cluster",
    height=600,
    category_orders={"causa_acidente": category_orders},
    labels={"count": "Frequência", "causa_acidente": "Causa do acidente"},
    # title="",
)

# Facet titles come as "cluster=<id>"; show them as "Cluster <id>"
fig.for_each_annotation(lambda a: a.update(text=a.text.replace("=", " ").capitalize()))
fig.update_layout(margin=dict(l=0, r=0, t=60, b=0))

# Golden-ratio aspect for the exported image
ratio = (1 + 5 ** 0.5) / 2
width = 1200
height = int(width / ratio)

dst = f"../output/e-evaluation/{YEAR}-causa-acidente"
fig.write_html(f"{dst}.html")
fig.write_image(
    f"{dst}.png",
    format="png",
    engine="kaleido",
    width=width,
    height=height,
)

fig.show()