In [None]:
# DATA UNDERSTANDING
# --------------------------------------------------------------------------------------

# Collect initial data: Acquire the necessary data and (if necessary) load it into your analysis tool.
# Describe data: Examine the data and document its surface properties like data format, number of records, or field identities.
# Explore data: Dig deeper into the data. Query it, visualize it, and identify relationships among the data.
# Verify data quality: How clean/dirty is the data? Document any quality issues.
# 
# + Visualize the distribution of accidents by various features (e.g., day of the week,
# weather condition, type of road, etc.) using bar plots, histograms, or pie charts.
# + Identify correlations between features and accident severity (e.g., total deaths,
# severe injuries) using correlation matrices or pair plots.

In [None]:
# CONCLUSIONS
# --------------------------------------------------------------------------------------

# - replace comma by dot: km, latitude and longitude
# - filter uf: PR, SC and RS
# - specify dtypes
# 
# - missing cells: 187 (< 0.1%)
# - mortos has 18698 (94.6%) zeros
# - classificacao_acidente has 1072 (5.4%) "Com Vítimas Fatais"
# - no interesting correlations

In [1]:
import os

import pandas as pd
from ydata_profiling import ProfileReport

# Year under analysis; it is interpolated into every input and output path below.
YEAR = "2022"

# Print wide frames in full: no line wrapping and no column truncation.
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.max_columns", None)

# Make sure the folders this notebook writes to exist.
os.makedirs("../data/b-data-understanding", exist_ok=True)
os.makedirs("../output/b-data-understanding", exist_ok=True)

In [None]:
# Profiling data grouped by occurrence (datatran{YEAR}.csv)
# ======================================================================================

In [2]:
# Read data: occurrence-level CSV for the configured year

file1 = f"../data/raw/{YEAR}-grouped-by-occurrence/datatran{YEAR}.csv"

# The file is read as semicolon-separated, with a decimal comma, in ISO-8859-1.
df1 = pd.read_csv(
    file1,
    sep=";",
    decimal=",",
    encoding="iso-8859-1",
)
print(df1.describe(include="all"))

                   id data_inversa dia_semana   horario     uf            br            km municipio                            causa_acidente     tipo_acidente classificacao_acidente   fase_dia sentido_via condicao_metereologica tipo_pista tracado_via uso_solo       pessoas        mortos  feridos_leves  feridos_graves        ilesos     ignorados       feridos      veiculos      latitude     longitude regional delegacia             uop
count    64547.000000        64547      64547     64547  64547  64350.000000  64350.000000     64547                                     64547             64547                  64547      64547       64547                  64547      64547       64547    64547  64547.000000  64547.000000   64547.000000    64547.000000  64547.000000  64547.000000  64547.000000  64547.000000  64547.000000  64547.000000    64544     63788           64499
unique            NaN          365          7      1387     27           NaN           NaN      1801                    

In [3]:
# Basic filtering and data type conversion

# Target dtype for every column, written as one group of columns per dtype.
dtypes1 = {
    # Plain integers (this cast fails if a column holds missing values)
    **dict.fromkeys(
        [
            "id",
            "pessoas",
            "mortos",
            "feridos_leves",
            "feridos_graves",
            "ilesos",
            "ignorados",
            "feridos",
            "veiculos",
        ],
        int,
    ),
    # Kept as plain objects; change to a more specific time-related type if
    # needed (e.g. "datetime64[ns]")
    **dict.fromkeys(["data_inversa", "horario"], "object"),
    # TODO: should "km" be category or float64?
    **dict.fromkeys(["km", "latitude", "longitude"], "float64"),
    **dict.fromkeys(
        [
            "dia_semana",
            "uf",
            "br",
            "municipio",
            "causa_acidente",
            "tipo_acidente",
            "classificacao_acidente",
            "fase_dia",
            "sentido_via",
            "condicao_metereologica",
            "tipo_pista",
            "tracado_via",
            "uso_solo",
            "regional",
            "delegacia",
            "uop",
        ],
        "category",
    ),
}

# Keep only rows whose "uf" is PR, SC or RS, then cast every column.
df1 = df1.loc[df1["uf"].isin(["PR", "SC", "RS"])].astype(dtypes1)

In [4]:
# Summary statistics for every column, plus a "nan" row holding the percentage
# of missing values in each column.
described1 = df1.describe(include="all")
described1.loc["nan"] = df1.isna().mean().mul(100)

print(described1)

                   id data_inversa dia_semana   horario     uf            br            km municipio                            causa_acidente              tipo_acidente classificacao_acidente   fase_dia sentido_via condicao_metereologica tipo_pista tracado_via uso_solo       pessoas        mortos  feridos_leves  feridos_graves        ilesos     ignorados       feridos      veiculos      latitude     longitude regional delegacia             uop
count    19895.000000        19895      19895     19895  19895  19847.000000  19847.000000     19895                                     19895                      19895                  19895      19895       19895                  19895      19895       19895    19895  19895.000000  19895.000000   19895.000000    19895.000000  19895.000000  19895.000000  19895.000000  19895.000000  19895.000000  19895.000000    19895     19895           19884
unique            NaN          365          7       978      3     28.000000           NaN       448  

In [5]:
# Preview the first five rows of the filtered frame.
print(df1.head(n=5))

        id data_inversa dia_semana   horario  uf     br     km              municipio                                     causa_acidente                   tipo_acidente classificacao_acidente   fase_dia  sentido_via condicao_metereologica tipo_pista    tracado_via uso_solo  pessoas  mortos  feridos_leves  feridos_graves  ilesos  ignorados  feridos  veiculos   latitude  longitude regional delegacia             uop
1   405158   2022-01-01     sábado  02:40:00  PR  116.0   33.0  CAMPINA GRANDE DO SUL                   Ingestão de álcool pelo condutor                      Tombamento     Com Vítimas Fatais  Pleno dia  Decrescente                Nublado      Dupla          Curva      Não        2       1              1               0       0          0        1         1 -25.114403 -48.846755  SPRF-PR  DEL01-PR  UOP02-DEL01-PR
7   405234   2022-01-01     sábado  08:05:00  SC  163.0   80.1             GUARACIABA                     Ausência de reação do condutor                 Colisão front

In [6]:
# Profile report: build it from the filtered occurrence data and write it as HTML

profile1 = ProfileReport(df1, title="Profiling Report")
dst = f"../output/b-data-understanding/{YEAR}-grouped-by-occurrence.html"
profile1.to_file(dst)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [7]:
# Plotting the data on a map

import plotly.graph_objects as go

# One marker per row of df1; the accident cause is attached as the marker text.
scatter = go.Scattermapbox(
    lat=df1["latitude"],
    lon=df1["longitude"],
    mode="markers",
    marker={"size": 5},
    text=df1["causa_acidente"],
)

fig = go.Figure(data=[scatter])

# OpenStreetMap tiles, fixed centre and zoom, no margins around the map.
fig.update_layout(
    mapbox_style="open-street-map",
    mapbox={"center": {"lat": -28, "lon": -52}, "zoom": 4.5},
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.show()

In [8]:
import plotly.express as px
import numpy as np
import scipy.stats

# Spatial heatmap weighted by local point density.
#
# A Gaussian kernel density estimate (KDE) is fitted on the (latitude, longitude)
# pairs and then evaluated at those same points, so the z value of each point
# reflects how many other points lie close to it.

# The KDE needs finite coordinates: rows missing latitude or longitude are left
# out of both the estimate and the plot. With complete coordinates this keeps
# every row, so the result is unchanged.
coords = df1[["latitude", "longitude"]].dropna()

# gaussian_kde expects one row per dimension, i.e. an array of shape (2, n_points).
data = np.vstack([coords.latitude, coords.longitude])
kde = scipy.stats.gaussian_kde(data)
values = kde(data)

fig = px.density_mapbox(
    lat=coords.latitude,
    lon=coords.longitude,
    z=values,
    radius=4,
)

# Same map settings as the scatter map: OpenStreetMap tiles, fixed centre and
# zoom, no margins.
fig.update_layout(
    mapbox_style="open-street-map",
    mapbox=dict(center=dict(lat=-28, lon=-52), zoom=4.5),
    margin={"l": 0, "r": 0, "t": 0, "b": 0},
)

fig.show()

In [9]:
# Save the filtered, typed occurrence data as CSV, without the index column.
dst = f"../data/b-data-understanding/{YEAR}-grouped-by-occurrence.csv"
df1.to_csv(path_or_buf=dst, index=False)

In [None]:
# Profiling data grouped by people (acidentes{YEAR}.csv)
# ======================================================================================

In [None]:
# Read data: CSV grouped by people for the configured year

file2 = f"../data/raw/{YEAR}-grouped-by-people/acidentes{YEAR}.csv"

# The file is read as semicolon-separated, with a decimal comma, in ISO-8859-1.
df2 = pd.read_csv(
    file2,
    sep=";",
    decimal=",",
    encoding="iso-8859-1",
)
df2.describe(include="all")

In [None]:
# Basic filtering and data type conversion

# df2 = df2.drop(columns=["pesid", "id_veiculo"])

# Target dtype for every column, written as one group of columns per dtype.
dtypes2 = {
    # Plain integers (this cast fails if a column holds missing values)
    **dict.fromkeys(
        [
            "id",
            "pesid",
            "id_veiculo",
            "ilesos",
            "feridos_leves",
            "feridos_graves",
            "mortos",
        ],
        int,
    ),
    # "Int64" is the nullable integer dtype; a plain int cast raises
    # IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
    **dict.fromkeys(["ano_fabricacao_veiculo", "idade"], "Int64"),
    # Kept as a plain object; change to a more specific time-related type if
    # needed (e.g. "datetime64[ns]")
    **dict.fromkeys(["data_inversa"], "object"),
    # TODO: should "km" be category or float64?
    **dict.fromkeys(["km", "latitude", "longitude"], "float64"),
    # "horario" is a category here; change it to a more specific time-related
    # type if needed
    **dict.fromkeys(
        [
            "dia_semana",
            "horario",
            "uf",
            "br",
            "municipio",
            "causa_acidente",
            "tipo_acidente",
            "classificacao_acidente",
            "fase_dia",
            "sentido_via",
            "condicao_metereologica",
            "tipo_pista",
            "tracado_via",
            "uso_solo",
            "tipo_veiculo",
            "marca",
            "tipo_envolvido",
            "estado_fisico",
            "sexo",
            "regional",
            "delegacia",
            "uop",
        ],
        "category",
    ),
}

# Keep only rows whose "uf" is PR, SC or RS, then cast every column.
df2 = df2.loc[df2["uf"].isin(["PR", "SC", "RS"])].astype(dtypes2)

In [None]:
# Summary statistics for every column, plus a "nan" row holding the percentage
# of missing values in each column.
described2 = df2.describe(include="all")
described2.loc["nan"] = df2.isna().mean().mul(100)
described2

In [None]:
# Preview the first five rows of the filtered frame.
df2.head(n=5)

In [None]:
# Save the filtered, typed people data as CSV, without the index column.
dst = f"../data/b-data-understanding/{YEAR}-grouped-by-people.csv"
df2.to_csv(path_or_buf=dst, index=False)

In [None]:
# Profile report: build it from the filtered people data and write it as HTML

profile2 = ProfileReport(df2, title="Profiling Report")
dst = f"../output/b-data-understanding/{YEAR}-grouped-by-people.html"
profile2.to_file(dst)

In [None]:
# Profiling data grouped by people, all causes and types (acidentes{YEAR}_todas_causas_tipos.csv)
# ======================================================================================

In [None]:
# Read data: CSV grouped by people (all causes and types) for the configured year

file3 = f"../data/raw/{YEAR}-grouped-by-people-all/acidentes{YEAR}_todas_causas_tipos.csv"

# The file is read as semicolon-separated, with a decimal comma, in ISO-8859-1.
df3 = pd.read_csv(
    file3,
    sep=";",
    decimal=",",
    encoding="iso-8859-1",
)
df3.describe(include="all")

In [None]:
# Basic filtering and data type conversion

# Target dtype for every column, written as one group of columns per dtype.
dtypes3 = {
    # Plain integers (this cast fails if a column holds missing values)
    **dict.fromkeys(
        [
            "id",
            "ordem_tipo_acidente",
            "id_veiculo",
            "ilesos",
            "feridos_leves",
            "feridos_graves",
            "mortos",
        ],
        int,
    ),
    # "Int64" is the nullable integer dtype; a plain int cast raises
    # IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
    **dict.fromkeys(["pesid", "ano_fabricacao_veiculo", "idade"], "Int64"),
    # TODO: should "km" be category or float64?
    **dict.fromkeys(["km", "latitude", "longitude"], "float64"),
    # "data_inversa" and "horario" are categories here; change them to a more
    # specific time-related type if needed (e.g. "datetime64[ns]")
    **dict.fromkeys(
        [
            "data_inversa",
            "dia_semana",
            "horario",
            "uf",
            "br",
            "municipio",
            "causa_principal",
            "causa_acidente",
            "tipo_acidente",
            "classificacao_acidente",
            "fase_dia",
            "sentido_via",
            "condicao_metereologica",
            "tipo_pista",
            "tracado_via",
            "uso_solo",
            "tipo_veiculo",
            "marca",
            "tipo_envolvido",
            "estado_fisico",
            "sexo",
            "regional",
            "delegacia",
            "uop",
        ],
        "category",
    ),
}

# Keep only rows whose "uf" is PR, SC or RS, then cast every column.
df3 = df3.loc[df3["uf"].isin(["PR", "SC", "RS"])].astype(dtypes3)

In [None]:
# Summary statistics for every column, plus a "nan" row holding the percentage
# of missing values in each column.
described3 = df3.describe(include="all")
described3.loc["nan"] = df3.isna().mean().mul(100)
described3

In [None]:
# Preview the first five rows of the filtered frame.
df3.head(n=5)

In [None]:
# Profile report: build it from the filtered all-causes people data and write it as HTML

profile3 = ProfileReport(df3, title="Profiling Report")
dst = f"../output/b-data-understanding/{YEAR}-grouped-by-people-all.html"
profile3.to_file(dst)

In [None]:
# Save the filtered, typed all-causes people data as CSV, without the index column.
dst = f"../data/b-data-understanding/{YEAR}-grouped-by-people-all.csv"
df3.to_csv(path_or_buf=dst, index=False)