In [None]:
# EVALUATION
# --------------------------------------------------------------------------------------

# Evaluate results: Do the models meet the business success criteria? Which one(s)
# should we approve for the business?
# 
# Review process: Review the work accomplished. Was anything overlooked? Were all steps
# properly executed? Summarize findings and correct anything if needed.
# 
# Determine next steps: Based on the previous three tasks, determine whether to proceed
# to deployment, iterate further, or initiate new projects.

In [None]:
# CONCLUSIONS
# --------------------------------------------------------------------------------------

# - ...

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# Year of the dataset analysed by this notebook
YEAR = "2021"

# Make sure the folders used by the evaluation stage exist
for directory in ("../output/e-evaluation", "../data/e-evaluation"):
    os.makedirs(directory, exist_ok=True)

# Print wide frames with every column, without wrapping the representation
for option, value in (
    ("display.max_columns", None),
    ("display.expand_frame_repr", False),
):
    pd.set_option(option, value)

In [2]:
# Load the modeling-stage output and cast every column to its analysis dtype
# --------------------------------------------------------------------------------------

file = "../data/d-modeling/" + YEAR + "-regioes.csv"

# Columns grouped by the dtype they are cast to
category_columns = [
    "dia_semana",
    "uf",
    "br",
    "municipio",
    "causa_acidente",
    "tipo_acidente",
    "classificacao_acidente",
    "fase_dia",
    "sentido_via",
    "condicao_metereologica",
    "tipo_pista",
    "tracado_via",
    "uso_solo",
    "regional",
    "delegacia",
    "uop",
]
integer_columns = [
    "pessoas",
    "mortos",
    "feridos_leves",
    "feridos_graves",
    "ilesos",
    "ignorados",
    "feridos",
    "veiculos",
    "ano",
    "dia",
    "mes",
    "hora",
    "minuto",
]
float_columns = ["km", "latitude", "longitude"]

# "id", "data_inversa" and "horario" are intentionally not cast here
dtypes = {
    **dict.fromkeys(category_columns, "category"),
    **dict.fromkeys(integer_columns, "Int64"),
    **dict.fromkeys(float_columns, "float64"),
}

# Read first, cast afterwards (the cast is applied to the parsed values)
df = pd.read_csv(file).astype(dtypes)

print(df.head())

    dia_semana  uf     br     km              municipio                                     causa_acidente              tipo_acidente classificacao_acidente     fase_dia  sentido_via condicao_metereologica tipo_pista    tracado_via uso_solo  pessoas  mortos  feridos_leves  feridos_graves  ilesos  ignorados  feridos  veiculos   latitude  longitude regional delegacia             uop   ano  mes  dia  hora  minuto  cluster
0  sexta-feira  PR  277.0   51.3   SAO JOSE DOS PINHAIS                                 Pista Escorregadia  Saída de leito carroçável    Com Vítimas Feridas    Pleno dia  Decrescente         Garoa/Chuvisco      Dupla          Curva      Não        4       0              1               0       3          0        1         3 -25.595160 -48.907008  SPRF-PR  DEL01-PR  UOP05-DEL01-PR  2021    1    1    15      45       30
1  sexta-feira  SC  470.0   79.1                INDAIAL                             Transitar na contramão            Colisão frontal     Com Vítimas Fata

In [3]:
# Compute severity of each row
# --------------------------------------------------------------------------------------

# TODO: add this to preprocessing

# Map one accident record to a numeric severity weight
def _is_positive(count):
    """Return True when `count` is present (not None/NaN/pd.NA) and greater than zero."""
    # `pd.NA > 0` evaluates to pd.NA, whose truth value raises TypeError, so
    # missing counters must be filtered out before the comparison.
    return bool(pd.notna(count) and count > 0)


def get_severity(row):
    """Return the severity weight of a single accident record.

    The casualty counters are checked from most to least severe and the first
    positive one decides the weight: deaths -> 13, serious injuries -> 7,
    light injuries -> 3, unharmed people -> 1. Missing counters are skipped.
    When no counter is positive, the textual `classificacao_acidente` is used
    instead ("Com Vítimas Fatais" -> 13, "Com Vítimas Feridas" -> 5,
    "Sem Vítimas" -> 1).

    Parameters
    ----------
    row : object
        Anything exposing the attributes `mortos`, `feridos_graves`,
        `feridos_leves`, `ilesos` and `classificacao_acidente` (for example a
        row passed by `DataFrame.apply(..., axis=1)`).

    Returns
    -------
    int
        The severity weight.

    Raises
    ------
    ValueError
        If neither the counters nor the classification identify a severity.
    """
    # Casualty counters, most severe first
    if _is_positive(row.mortos):
        return 13
    if _is_positive(row.feridos_graves):
        return 7
    if _is_positive(row.feridos_leves):
        return 3
    if _is_positive(row.ilesos):
        return 1

    # If none of the above fits, use classificacao_acidente.
    # NOTE(review): "Com Vítimas Feridas" maps to 5, presumably a midpoint between
    # light (3) and serious (7) injuries when the kind is unknown; confirm.
    by_classification = {
        "Com Vítimas Fatais": 13,
        "Com Vítimas Feridas": 5,
        "Sem Vítimas": 1,
    }
    label = row.classificacao_acidente
    # Compare label by label so a missing label (pd.NA compares to pd.NA) is skipped
    for known_label, weight in by_classification.items():
        if pd.notna(label) and label == known_label:
            return weight

    raise ValueError("Could not compute severity")

# Score every accident row by row and store the result in "severidade",
# then display how many accidents fall into each severity weight
df["severidade"] = df.apply(get_severity, axis="columns")
df["severidade"].value_counts()

3     10999
7      3832
1      3739
13     1061
Name: severidade, dtype: int64

In [None]:
# OVERVIEW
# ======================================================================================

# - This table provides an overview of the high-risk areas
# - We can see that the conditions in which accidents occur are similar
# - From now on, the top 3 clusters will be investigated in more detail

In [4]:
# Compute cluster metrics
# --------------------------------------------------------------------------------------


def _most_frequent(values):
    """Return the most frequent value of a group (the first mode pandas reports on ties)."""
    return values.mode().iloc[0]


# Separate columns by data type, preserving the frame's column order so that the
# resulting table has the same layout on every run
numericals = df.select_dtypes(include="number").columns
categoricals = [col for col in df.columns if col not in numericals]

# Numeric columns are averaged per cluster; the others are summarised by their mode
aggregation = {col: "mean" for col in numericals if col != "cluster"}
aggregation.update({col: _most_frequent for col in categoricals})

# Apply aggregation functions
clusters = df.groupby("cluster")
metrics = clusters.agg(aggregation).reset_index()

# Look the size up by cluster label rather than relying on index alignment
metrics["size"] = metrics["cluster"].map(clusters.size())

# Drop columns that won't be investigated
to_drop = [
    "latitude",
    "longitude",
    "ano",
    "km",
    "br",
    "municipio",
    "sentido_via",
    "uso_solo",
    "regional",
    "delegacia",
    "uop",
    "pessoas",
    "mortos",
    "feridos_leves",
    "feridos_graves",
    "ilesos",
    "ignorados",
    "feridos",
    "veiculos",
    "minuto",
    "uf",
    "classificacao_acidente",
]
metrics = metrics.drop(columns=to_drop)

# Sort metrics
# ======================================================================================
# NOTE: normalize by severidade / kilometers
kms = (
    # For every (cluster, road) pair that actually occurs, take the "km" markers
    df.groupby(["cluster", "br"], observed=True)["km"]
    # Stretch of road covered by the cluster on that road (max >= min, so no abs needed)
    .apply(lambda km: km.max() - km.min())
    # Add up the stretches of all roads crossed by each cluster
    .groupby(level="cluster")
    .sum()
    .reset_index(name="kilometers")
)

# Attach by cluster label: a plain column assignment aligns on the row index, which
# silently ignores any sorting of `kms` and only works while both indexes coincide
metrics["kilometers"] = metrics["cluster"].map(kms.set_index("cluster")["kilometers"])

# Mean severity per kilometer of road covered by the cluster.
# NOTE(review): a cluster whose accidents all share one km marker has 0 kilometers and
# therefore an infinite ratio, which ranks it first; confirm this is intended.
metrics = metrics.assign(ratio=lambda x: x.severidade / x.kilometers)
metrics = metrics.sort_values(by="ratio", ascending=False)

# NOTE: normalize by severidade
# metrics = metrics.sort_values(by="severidade", ascending=False)
# ======================================================================================

# Sort the columns: numericals first, categoricals later
numericals = metrics.select_dtypes(include="number").columns
categoricals = [col for col in metrics.columns if col not in numericals]
columns = list(numericals) + categoricals
metrics = metrics[columns]

# Highest-ranked cluster labels, used by the rest of the notebook
top6 = metrics.cluster.head(6).to_list()
top3 = top6[:3]

# Accidents of each of the six highest-ranked clusters
c1, c2, c3, c4, c5, c6 = (df[df.cluster == label] for label in top6)

print(metrics.head(10))

    cluster       mes        dia       hora  severidade  size  kilometers     ratio tracado_via   dia_semana                                     causa_acidente tipo_pista   fase_dia condicao_metereologica              tipo_acidente
29       29  6.520408  14.908163  13.081633    4.367347    98       111.0  0.039345        Reta  sexta-feira           Reação tardia ou ineficiente do condutor      Dupla  Pleno dia              Céu Claro  Saída de leito carroçável
15       15  6.335593  14.715254  13.015254    3.976271   590       110.5  0.035984        Reta       sábado           Reação tardia ou ineficiente do condutor      Dupla  Pleno dia              Céu Claro           Colisão traseira
46       46  6.239787  15.596803  13.420959    4.172291   563       124.1  0.033620        Reta  sexta-feira                     Ausência de reação do condutor    Simples  Pleno dia              Céu Claro        Colisão transversal
47       47  6.635514  16.495327  12.757009    4.214953   107       137.

In [5]:
# Plotting clusters on a map
# --------------------------------------------------------------------------------------

cluster_labels = top3
num_clusters = len(cluster_labels)

labels = df.cluster

# One evenly spaced hue per cluster. `endpoint=False` matters: hue 0 and hue 360 are
# the same colour, so including the endpoint paints the first and the last cluster
# identically.
color_scale = [
    f"hsl({hue},50%,50%)"
    for hue in np.linspace(0, 360, num_clusters, endpoint=False)
]
fig = go.Figure()

for cluster_label, color in zip(cluster_labels, color_scale):
    # Skip noise
    if cluster_label == -1:
        continue

    cluster_data = df[labels == cluster_label]

    # One trace per cluster; hovering a marker shows the cluster label
    scatter = go.Scattermapbox(
        lat=cluster_data.latitude,
        lon=cluster_data.longitude,
        mode="markers",
        marker=dict(size=7.5, color=color),
        text=f"{cluster_label}",
        showlegend=False,
    )

    fig.add_trace(scatter)

fig.update_layout(
    mapbox_style="open-street-map",
    mapbox=dict(center=dict(lat=-28, lon=-52), zoom=4.5),
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)

fig.show()

In [None]:
# EXPLORING TOP 3
# ======================================================================================

# - This table provides an overview of the high-risk areas
# - We can see that the conditions in which accidents occur are similar
# - From now on, the top 3 clusters will be investigated in more detail
# 
# Parallel coordinates
# 
# - Use features with low cardinality to color the plot
# - Avoid a spaghetti plot: maybe filter even further, i.e. split into more plots
# - Axes order: arrange the axes in a meaningful order; you can order them based on
# domain knowledge, natural progression, or hierarchical relationships

In [6]:
def group_others(dataset, n=5):
    """Return a copy of `dataset` keeping only the `n` most frequent labels per column.

    Every non-numeric column is processed independently: values that are not
    among the `n` most frequent labels of that column are replaced by the
    label "Outros". Numeric columns and the input frame are left untouched.
    """
    grouped = dataset.copy()

    numeric_names = set(grouped._get_numeric_data().columns)

    for name in set(grouped.columns) - numeric_names:
        # Labels that survive: the n largest counts of this column
        kept_labels = grouped[name].value_counts().nlargest(n).index.tolist()

        def relabel(value, kept_labels=kept_labels):
            if value in kept_labels:
                return value
            return "Outros"

        grouped[name] = grouped[name].apply(relabel)

    return grouped

In [16]:
# Features for evaluation
# --------------------------------------------------------------------------------------

# Candidate features:
#   accident description ... "causa_acidente", "tipo_acidente"
#   circumstances .......... "condicao_metereologica", "fase_dia", "tipo_pista",
#                            "tracado_via", "dia_semana"
#   time ................... "horario", "data_inversa" ("dia" + "mes" + "ano")

# Axes of the parallel-categories plots, in display order.
# "causa_acidente" and "tipo_acidente" are left out on purpose (currently disabled).
road_dimensions = ["tipo_pista", "tracado_via"]
circumstance_dimensions = ["dia_semana", "condicao_metereologica", "fase_dia"]
dimensions = road_dimensions + circumstance_dimensions

# Two-stop colour scale: position 0 -> first colour, position 1 -> second colour
colorscale = [
    [position, color]
    for position, color in enumerate(("lightsteelblue", "mediumseagreen"))
]

In [18]:
# Parallel categories: c1
# --------------------------------------------------------------------------------------

tmp = c1.copy()

# Flag severe accidents: 13 (deaths) and 7 (serious injuries) are the weights emitted
# by get_severity. The weight 12 is never produced, so testing for it silently left
# every fatal accident out of the severe group.
tmp["severidade"] = tmp["severidade"].isin([13, 7]).astype(int)

fig = px.parallel_categories(
    tmp,
    dimensions,
    color="severidade",
    color_continuous_scale=colorscale
)
fig.show()

In [19]:
# Parallel categories: c2
# --------------------------------------------------------------------------------------

# This cell is about the second-ranked cluster, so it must start from c2 (not c3)
tmp = c2.copy()

# Flag severe accidents: 13 (deaths) and 7 (serious injuries) are the weights emitted
# by get_severity. The weight 12 is never produced, so testing for it silently left
# every fatal accident out of the severe group.
tmp["severidade"] = tmp["severidade"].isin([13, 7]).astype(int)

fig = px.parallel_categories(
    tmp,
    dimensions,
    color="severidade",
    color_continuous_scale=colorscale
)
fig.show()

In [20]:
# Parallel categories: c3
# --------------------------------------------------------------------------------------

tmp = c3.copy()

# Flag severe accidents: 13 (deaths) and 7 (serious injuries) are the weights emitted
# by get_severity. The weight 12 is never produced, so testing for it silently left
# every fatal accident out of the severe group.
tmp["severidade"] = tmp["severidade"].isin([13, 7]).astype(int)

fig = px.parallel_categories(
    tmp,
    dimensions,
    color="severidade",
    color_continuous_scale=colorscale
)
fig.show()

In [21]:
# Accident types by severity, one facet per top-3 cluster
# (plotly.express is already imported as `px` in the first cell)

# Filter the DataFrame to include only rows with cluster values in top3
cluster = df[df.cluster.isin(top3)].copy()

# Order categories using top 1 cluster
category_orders = (
    cluster.loc[cluster["cluster"] == top3[0], "tipo_acidente"]
    .value_counts()
    .sort_values(ascending=False)
    .index.tolist()
)

# NOTE: rare categories can't be grouped with group_others here, otherwise this plot
# is wrong: not all clusters have the same top categories!

# Categorize severidade: "1" for severe accidents, "0" otherwise. 13 (deaths) and
# 7 (serious injuries) are the weights emitted by get_severity; 12 is never produced.
# The column is replaced as a whole because writing strings into the existing integer
# column through `.loc[:, ...]` is an incompatible-dtype assignment.
cluster["severidade"] = cluster["severidade"].isin([13, 7]).map({True: "1", False: "0"})

# Group by cluster, tipo_acidente, and severidade, and calculate count
cluster = cluster.groupby(["cluster", "tipo_acidente", "severidade"]).size().reset_index(name="count")

# Order clusters based on top3 order
cluster_rank = {label: position for position, label in enumerate(top3)}
cluster = cluster.sort_values(by="cluster", key=lambda labels: labels.map(cluster_rank))

# Create a facetted bar plot using px.bar
fig = px.bar(
    cluster,
    x="count",
    y="tipo_acidente",
    color="severidade",
    color_discrete_map={"0": "lightsteelblue", "1": "mediumseagreen"},
    orientation="h",
    facet_col="cluster",
    labels={"count": "Count", "tipo_acidente": "Accident Type"},
    title="Accident Types by Severity",
    height=600,
    category_orders={"tipo_acidente": category_orders}
)

# Customize layout
fig.update_layout(margin=dict(l=0, r=0, t=60, b=0))
fig.show()

In [22]:
# Accident causes by severity, one facet per top-3 cluster
# (plotly.express is already imported as `px` in the first cell)

# Filter the DataFrame to include only rows with cluster values in top3
cluster = df[df.cluster.isin(top3)].copy()

# Order categories using top 1 cluster
category_orders = (
    cluster.loc[cluster["cluster"] == top3[0], "causa_acidente"]
    .value_counts()
    .sort_values(ascending=False)
    .index.tolist()
)

# NOTE: rare categories can't be grouped with group_others here, otherwise this plot
# is wrong: not all clusters have the same top categories!

# Categorize severidade: "1" for severe accidents, "0" otherwise. 13 (deaths) and
# 7 (serious injuries) are the weights emitted by get_severity; 12 is never produced.
cluster["severidade"] = cluster["severidade"].isin([13, 7]).map({True: "1", False: "0"})

# Group by cluster, causa_acidente, and severidade, and calculate count
cluster = cluster.groupby(["cluster", "causa_acidente", "severidade"]).size().reset_index(name="count")

# Order clusters based on top3 order
cluster_rank = {label: position for position, label in enumerate(top3)}
cluster = cluster.sort_values(by="cluster", key=lambda labels: labels.map(cluster_rank))

# Create a facetted bar plot using px.bar; the axis label and title name the accident
# cause, which is what this figure shows
fig = px.bar(
    cluster,
    x="count",
    y="causa_acidente",
    color="severidade",
    color_discrete_map={"0": "lightsteelblue", "1": "mediumseagreen"},
    orientation="h",
    facet_col="cluster",
    labels={"count": "Count", "causa_acidente": "Accident Cause"},
    title="Accident Causes by Severity",
    height=600,
    category_orders={"causa_acidente": category_orders}
)

# Customize layout
fig.update_layout(margin=dict(l=0, r=0, t=60, b=0))
fig.show()

In [None]:
# DRAFTS
# ======================================================================================

In [None]:
# Draft: stacked bar of accident types by severity for the top-ranked cluster
# (plotly.graph_objects is already imported as `go` in the first cell)

cluster = df[df.cluster == top3[0]]
cluster = group_others(cluster)

# Flag severe accidents: 13 (deaths) and 7 (serious injuries) are the weights emitted
# by get_severity; 12 is never produced.
cluster["severidade"] = cluster["severidade"].isin([13, 7]).astype(int)
cluster = cluster.groupby(["tipo_acidente", "severidade"]).size().reset_index(name="count")

# Create a bar plot using go.Bar
fig = go.Figure()

# Fixed colour per severity flag. Pairing the colours with `unique()` made the mapping
# depend on which flag happens to come first in the data.
severity_colors = {0: "lightsteelblue", 1: "mediumseagreen"}

for severity, color in severity_colors.items():
    subset = cluster[cluster.severidade == severity]
    if subset.empty:
        # No accident with this flag in the cluster: nothing to draw
        continue
    subset = subset.sort_values(by="count", ascending=True)

    trace = go.Bar(
        x=subset["count"],
        y=subset.tipo_acidente,
        orientation="h",
        name=str(severity),
        marker_color=color
    )
    fig.add_trace(trace)

# Customize layout
fig.update_layout(
    title="Accident Types by Severity",
    xaxis_title="Count",
    yaxis_title="Accident Type",
    barmode="stack",
    # Adjust margins for longer tipo_acidente labels
    margin=dict(l=150, r=20, t=40, b=40)
)

fig.show()


In [None]:
from plotly.subplots import make_subplots

def horizontal_bar_labels(categories):
    """Build a figure with one horizontal bar per category, titled above each bar.

    `categories` is a sequence of mappings with the keys "name" and "value".
    Each entry gets its own subplot row: the name is used as the left-aligned
    subplot title and as the bar's y value, and the value (formatted with
    thousands separators and no decimals) is written on the bar. All axes and
    the legend are hidden. The figure is returned, not shown.
    """
    row_count = len(categories)

    figure = make_subplots(
        rows=row_count,
        cols=1,
        subplot_titles=[entry["name"] for entry in categories],
        shared_xaxes=True,
        print_grid=False,
        vertical_spacing=(0.45 / row_count),
    )
    figure.layout.update(width=550, plot_bgcolor="#fff")

    # One bar per category, each in its own row
    for row_number, entry in enumerate(categories, start=1):
        bar = {
            "type": "bar",
            "orientation": "h",
            "y": [entry["name"]],
            "x": [entry["value"]],
            "text": [format(entry["value"], ",.0f")],
            "hoverinfo": "text",
            "textposition": "auto",
            "marker": {"color": "#7030a0"},
        }
        figure.add_trace(bar, row_number, 1)

    figure.layout.update(showlegend=False)

    # Left-align the subplot titles
    for title in figure.layout.annotations:
        title.x = 0
        title.xanchor = "left"
        title.align = "left"
        title.font = {"size": 12}

    # Hide every x and y axis
    axis_names = [
        name for name in figure.layout if name.startswith(("yaxis", "xaxis"))
    ]
    for name in axis_names:
        figure.layout[name].visible = False

    # Margins and size: 45 px per category with a 350 px minimum; the same number is
    # used for height and width (this replaces the 550 px width set above)
    figure.layout.margin = {"l": 0, "r": 0, "t": 20, "b": 1}
    side = max([45 * row_count, 350])
    figure.layout.height = side
    figure.layout.width = side

    return figure

# Draft: labelled horizontal bars with the accident types of the top-ranked cluster
cluster = df[df.cluster == top3[0]]
cluster = group_others(cluster)

# Number of accidents per type (rare types were folded into "Outros" by group_others)
top_acidentes = cluster.tipo_acidente.value_counts()

# Feed the computed counts to the chart; the cell previously plotted hard-coded
# placeholder data unrelated to this dataset and never used `top_acidentes`
categories = [
    {"name": str(name), "value": int(count)}
    for name, count in top_acidentes.items()
    if count > 0
]
horizontal_bar_labels(categories)