In [None]:
# DATA UNDERSTANDING
# --------------------------------------------------------------------------------------

# Collect initial data: Acquire the necessary data and (if necessary) load it into your analysis tool.
# Describe data: Examine the data and document its surface properties like data format, number of records, or field identities.
# Explore data: Dig deeper into the data. Query it, visualize it, and identify relationships among the data.
# Verify data quality: How clean/dirty is the data? Document any quality issues.
# 
# + Visualize the distribution of accidents by various features (e.g., day of the week,
# weather condition, type of road, etc.) using bar plots, histograms, or pie charts.
# + Identify correlations between features and accident severity (e.g., total deaths,
# severe injuries) using correlation matrices or pair plots.

In [None]:
# CONCLUSIONS
# --------------------------------------------------------------------------------------

# missing cells	187 (< 0.1%)
# mortos has 18698 (94.6%) zeros
# classificacao_acidente has 1072 (5.4%) "Com Vitimas Fatais"
# no interesting correlations
# zoom
# [x] replace comma by dot: km, latitude and longitude
# [x] filter uf: PR, SC and RS
# [x] specify dtypes

In [46]:
import os

import pandas as pd
from ydata_profiling import ProfileReport

# Show every column when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Year of the dataset under analysis; interpolated into the data paths used by later cells.
YEAR = "2021"

# Make sure the destination folders exist before anything is written to them.
# (Plain strings: nothing is interpolated here, so no f-prefix is needed.)
os.makedirs("../output/b-data-understanding", exist_ok=True)
os.makedirs("../data/b-data-understanding", exist_ok=True)

In [None]:
# Profiling data grouped by occurrence (datatran2021.csv)
# ======================================================================================

In [7]:
# Read data

# Raw occurrence-level file. Both the folder and the file name carry the year, so the
# whole path follows YEAR (for YEAR = "2021" this is still .../datatran2021.csv).
file1 = f"../data/raw/{YEAR}-grouped-by-occurrence/datatran{YEAR}.csv"

# The file is ";"-separated, uses "," as the decimal mark and is ISO-8859-1 encoded.
df1 = pd.read_csv(file1, sep=";", decimal=",", encoding="iso-8859-1")

# Summary of every column (numeric and non-numeric) of the unfiltered data.
df1.describe(include="all")

Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop
count,64539.0,64539,64539,64539,64539,64372.0,64372.0,64539,64539,64539,64539,64539,64539,64539,64539,64539,64539,64539.0,64539.0,64539.0,64539.0,64539.0,64539.0,64539.0,64539.0,64539.0,64539.0,64539,64521,64407
unique,,365,7,1302,27,,,1791,71,17,3,4,3,10,3,10,2,,,,,,,,,,,27,150,369
top,,2021-08-08,sábado,19:00:00,MG,,,BRASILIA,Reação tardia ou ineficiente do condutor,Colisão traseira,Com Vítimas Feridas,Pleno dia,Crescente,Céu Claro,Simples,Reta,Não,,,,,,,,,,,SPRF-MG,DEL01-PR,UOP01-DEL01-SC
freq,,281,10869,941,8327,,,981,6907,12312,48186,34785,33898,38983,31805,39054,35812,,,,,,,,,,,8305,3032,1035
mean,369745.487194,,,,,210.660163,258.676195,,,,,,,,,,,2.338508,0.083608,0.839508,0.27371,0.981639,0.160043,1.113218,1.653791,-18.940569,-46.668963,,,
std,21954.249547,,,,,130.673217,224.921727,,,,,,,,,,,1.86681,0.340734,1.029122,0.612703,1.440071,0.473669,1.15155,0.721653,7.781718,6.186292,,,
min,331693.0,,,,,10.0,0.0,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-33.689819,-72.11938,,,
25%,351095.5,,,,,101.0,78.0,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-25.364362,-50.335156,,,
50%,369790.0,,,,,158.0,192.65,,,,,,,,,,,2.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,-20.466319,-47.8671,,,
75%,388100.5,,,,,324.0,406.825,,,,,,,,,,,3.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,-12.933839,-42.623609,,,


In [8]:
# Basic filtering and data type conversion

# Target dtype for every column of the occurrence-level frame.
dtypes1 = {
    "id": int,
    # Change it to a more specific time-related type if needed (e.g. "datetime64[ns]")
    "data_inversa": "object",
    "dia_semana": "category",
    # Kept as "category" so that it matches the dtype used for this column in the
    # dtypes2 and dtypes3 schemas.
    # Change it to a more specific time-related type if needed
    "horario": "category",
    "uf": "category",
    "br": "category",
    # TODO: category or float64?
    "km": "float64",
    "municipio": "category",
    "causa_acidente": "category",
    "tipo_acidente": "category",
    "classificacao_acidente": "category",
    "fase_dia": "category",
    "sentido_via": "category",
    "condicao_metereologica": "category",
    "tipo_pista": "category",
    "tracado_via": "category",
    "uso_solo": "category",
    # Per-occurrence counters
    "pessoas": int,
    "mortos": int,
    "feridos_leves": int,
    "feridos_graves": int,
    "ilesos": int,
    "ignorados": int,
    "feridos": int,
    "veiculos": int,
    "latitude": "float64",
    "longitude": "float64",
    "regional": "category",
    "delegacia": "category",
    "uop": "category"
}

# Keep only the rows whose state (uf) is PR, SC or RS, then cast. Casting after the
# filter means the categorical columns only hold categories present in the kept rows.
df1 = df1[df1.uf.isin(["PR", "SC", "RS"])]
df1 = df1.astype(dtypes1)

In [91]:
# Summary statistics of the filtered frame, extended with a "nan" row that holds the
# percentage of missing values per column.
described1 = df1.describe(include="all")
described1.loc["nan"] = df1.isna().mean().mul(100)
described1

Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop
count,19770.0,19770,19770,19770,19770,19727.0,19727.0,19770,19770,19770,19770,19770,19770,19770,19770,19770,19770,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770,19770,19669
unique,,365,7,903,3,29.0,,445,69,17,3,4,3,10,3,10,2,,,,,,,,,,,3,29,93
top,,2021-08-08,sábado,19:00:00,SC,101.0,,CURITIBA,Velocidade Incompatível,Colisão traseira,Com Vítimas Feridas,Pleno dia,Crescente,Céu Claro,Simples,Reta,Não,,,,,,,,,,,SPRF-SC,DEL01-PR,UOP01-DEL01-SC
freq,,101,3403,254,7890,4251.0,,900,2329,3888,14922,10610,10305,10885,9455,11398,10363,,,,,,,,,,,7918,3032,1035
mean,369542.644461,,,,,,221.504648,,,,,,,,,,,2.291047,0.062873,0.837683,0.246232,1.00607,0.138189,1.083915,1.686141,-26.936908,-50.60901,,,
std,21759.173611,,,,,,179.519283,,,,,,,,,,,1.639392,0.310085,0.944891,0.565921,1.251622,0.399331,1.047696,0.71906,1.969246,1.856358,,,
min,331693.0,,,,,,0.0,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-33.689819,-57.502806,,,
25%,350965.75,,,,,,92.0,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-28.19626,-51.914951,,,
50%,369583.0,,,,,,173.0,,,,,,,,,,,2.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,-26.880509,-50.158686,,,
75%,388376.75,,,,,,313.0,,,,,,,,,,,3.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,-25.502576,-48.956708,,,


In [11]:
# Peek at the first five filtered rows.
df1.head(n=5)

Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop
6,331855,2021-01-01,sexta-feira,15:45:00,PR,277.0,51.3,SAO JOSE DOS PINHAIS,Pista Escorregadia,Saída de leito carroçável,Com Vítimas Feridas,Pleno dia,Decrescente,Garoa/Chuvisco,Dupla,Curva,Não,4,0,1,0,3,0,1,3,-25.59516,-48.907008,SPRF-PR,DEL01-PR,UOP05-DEL01-PR
8,331864,2021-01-01,sexta-feira,17:10:00,SC,470.0,79.1,INDAIAL,Transitar na contramão,Colisão frontal,Com Vítimas Fatais,Pleno dia,Crescente,Nublado,Simples,Reta,Não,9,2,5,1,0,1,6,3,-26.951565,-49.306534,SPRF-SC,DEL04-SC,UOP01-DEL04-SC
10,331910,2021-01-01,sexta-feira,19:50:00,SC,470.0,130.0,LONTRAS,Acessar a via sem observar a presença dos outr...,Colisão transversal,Com Vítimas Feridas,Plena Noite,Decrescente,Nublado,Simples,Não Informado,Sim,4,0,0,2,2,0,2,2,-27.160376,-49.55658,SPRF-SC,DEL04-SC,UOP02-DEL04-SC
11,331920,2021-01-01,sexta-feira,20:40:00,PR,116.0,59.0,CAMPINA GRANDE DO SUL,Entrada inopinada do pedestre,Atropelamento de Pedestre,Com Vítimas Fatais,Plena Noite,Crescente,Chuva,Dupla,Não Informado,Sim,2,1,0,0,1,0,0,1,-25.303357,-48.943789,SPRF-PR,DEL01-PR,UOP02-DEL01-PR
18,331979,2021-01-02,sábado,09:25:00,SC,101.0,112.0,ITAJAI,Trafegar com motocicleta (ou similar) entre as...,Colisão traseira,Com Vítimas Feridas,Pleno dia,Decrescente,Sol,Dupla,Não Informado,Não,4,0,1,1,2,0,2,3,-26.84589,-48.721313,SPRF-SC,DEL04-SC,UOP04-DEL04-SC


In [14]:
# Profile report

# Build the profiling report for the occurrence-level frame and export it as HTML.
profile1 = ProfileReport(df=df1, title="Profiling Report")
profile1.to_file(output_file="../output/b-data-understanding/profile1.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [95]:
# Plotting the data on a map

import plotly.graph_objects as go

# One marker per accident; hovering a marker shows the recorded cause.
scatter = go.Scattermapbox(
    lat=df1["latitude"],
    lon=df1["longitude"],
    mode="markers",
    marker={"size": 5},
    text=df1["causa_acidente"],
)

fig = go.Figure()
fig.add_trace(scatter)

# OpenStreetMap tiles, initial view centred on lat -28 / lon -52, no figure margins.
fig.update_layout(
    mapbox={
        "style": "open-street-map",
        "center": {"lat": -28, "lon": -52},
        "zoom": 4.5,
    },
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.show()

In [44]:
import plotly.express as px
import numpy as np
import scipy.stats

# Spatial heatmap weighted by local point density.
# A Gaussian Kernel Density Estimate (KDE) is fitted on the accident coordinates and
# then evaluated at those same coordinates, which gives every accident a smooth
# estimate of how many other accidents lie in its vicinity. That estimate is used as
# the z value of the density map.

# Rows: latitude, longitude. Columns: one per accident.
data = np.vstack((df1["latitude"], df1["longitude"]))
kde = scipy.stats.gaussian_kde(data)
values = kde.evaluate(data)

fig = px.density_mapbox(lat=df1["latitude"], lon=df1["longitude"], z=values, radius=4)

# Same map style and initial view as the scatter map.
fig.update_layout(
    mapbox={
        "style": "open-street-map",
        "center": {"lat": -28, "lon": -52},
        "zoom": 4.5,
    },
    margin=dict(l=0, r=0, t=0, b=0),
)

fig.show()

In [47]:
# Persist the filtered, typed occurrence-level frame (row index is not written).
df1.to_csv(
    path_or_buf=f"../data/b-data-understanding/{YEAR}-grouped-by-occurrence.csv",
    index=False,
)

In [None]:
# Profiling data grouped by people (acidentes2021.csv)
# ======================================================================================

In [61]:
# Read data

# Raw person-level file. Both the folder and the file name carry the year, so the
# whole path follows YEAR (for YEAR = "2021" this is still .../acidentes2021.csv).
file2 = f"../data/raw/{YEAR}-grouped-by-people/acidentes{YEAR}.csv"

# The file is ";"-separated, uses "," as the decimal mark and is ISO-8859-1 encoded.
df2 = pd.read_csv(file2, sep=";", decimal=",", encoding="iso-8859-1")

# Summary of every column (numeric and non-numeric) of the unfiltered data.
df2.describe(include="all")

Unnamed: 0,id,pesid,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,id_veiculo,tipo_veiculo,marca,ano_fabricacao_veiculo,tipo_envolvido,estado_fisico,idade,sexo,ilesos,feridos_leves,feridos_graves,mortos,latitude,longitude,regional,delegacia,uop
count,150925.0,150924.0,150925,150925,150925,150925,150500.0,150500.0,150925,150925,150925,150925,150925,150925,150925,150925,150925,150925,150925.0,150925,142377,140397.0,150925,150925,135653.0,150925,150925.0,150925.0,150925.0,150925.0,150925.0,150925.0,150925,150868,150589
unique,,,365,7,1302,27,,,1791,71,17,3,4,3,10,3,10,2,,25,5638,,6,5,,4,,,,,,,27,150,369
top,,,2021-08-08,domingo,19:00:00,MG,,,BRASILIA,Reação tardia ou ineficiente do condutor,Colisão traseira,Com Vítimas Feridas,Pleno dia,Crescente,Céu Claro,Simples,Reta,Não,,Automóvel,VW/GOL 1.0/GOL 1.0/GOL 1.0/GOL 1.0/GOL 1.0/GOL...,,Condutor,Ileso,,Masculino,,,,,,,SPRF-MG,DEL01-PR,UOP01-DEL01-SC
freq,,,733,25638,2174,19058,,,2432,15873,34662,114415,82409,79683,90943,77294,92564,84359,,64113,1728,,106707,63354,,107396,,,,,,,19017,6824,2202
mean,370257.305423,826272.7,,,,,211.213681,261.387833,,,,,,,,,,,668090.466848,,,2010.756092,,,39.606761,,0.419771,0.358993,0.117045,0.035753,-18.775947,-46.675791,,,
std,22338.491324,50357.1,,,,,130.729808,226.563315,,,,,,,,,,,42070.510589,,,7.739965,,,46.331935,,0.493523,0.479707,0.321475,0.185674,7.826366,6.214617,,,
min,331693.0,738204.0,,,,,10.0,0.0,,,,,,,,,,,595095.0,,,1900.0,,,0.0,,0.0,0.0,0.0,0.0,-33.689819,-72.11938,,,
25%,351360.0,783361.8,,,,,101.0,79.0,,,,,,,,,,,632034.0,,,2007.0,,,27.0,,0.0,0.0,0.0,0.0,-25.271278,-50.36576,,,
50%,370459.0,826302.5,,,,,158.0,195.3,,,,,,,,,,,668743.0,,,2012.0,,,37.0,,0.0,0.0,0.0,0.0,-20.353744,-47.775077,,,
75%,388722.0,868534.2,,,,,324.0,410.0,,,,,,,,,,,703526.0,,,2017.0,,,48.0,,1.0,1.0,0.0,0.0,-12.725957,-42.600677,,,


In [76]:
# Basic filtering and data type conversion

# df2 = df2.drop(columns=["pesid", "id_veiculo"])

# Target dtype for every column of the person-level frame, grouped by dtype.
dtypes2 = {
    # Identifiers and 0/1 outcome flags stored as plain integers.
    **dict.fromkeys(
        [
            "id",
            "pesid",
            "id_veiculo",
            "ilesos",
            "feridos_leves",
            "feridos_graves",
            "mortos",
        ],
        int,
    ),
    # These columns contain missing values; a plain int cast raises
    # "IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer",
    # hence the nullable integer dtype "Int64".
    **dict.fromkeys(["ano_fabricacao_veiculo", "idade"], "Int64"),
    # TODO (km): category or float64?
    **dict.fromkeys(["km", "latitude", "longitude"], "float64"),
    # Change it to a more specific time-related type if needed (e.g. "datetime64[ns]")
    "data_inversa": "object",
    # Everything else is categorical ("horario" could become a time-related type).
    **dict.fromkeys(
        [
            "dia_semana",
            "horario",
            "uf",
            "br",
            "municipio",
            "causa_acidente",
            "tipo_acidente",
            "classificacao_acidente",
            "fase_dia",
            "sentido_via",
            "condicao_metereologica",
            "tipo_pista",
            "tracado_via",
            "uso_solo",
            "tipo_veiculo",
            "marca",
            "tipo_envolvido",
            "estado_fisico",
            "sexo",
            "regional",
            "delegacia",
            "uop",
        ],
        "category",
    ),
}

# Filter first (states PR, SC and RS only), then cast the remaining rows.
df2 = df2.loc[df2["uf"].isin(["PR", "SC", "RS"])].astype(dtypes2)

In [88]:
# Summary statistics of the filtered frame, extended with a "nan" row that holds the
# percentage of missing values per column.
described2 = df2.describe(include="all")
described2.loc["nan"] = df2.isna().mean().mul(100)
described2

Unnamed: 0,id,pesid,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,id_veiculo,tipo_veiculo,marca,ano_fabricacao_veiculo,tipo_envolvido,estado_fisico,idade,sexo,ilesos,feridos_leves,feridos_graves,mortos,latitude,longitude,regional,delegacia,uop
count,45294.0,45294.0,45294,45294,45294,45294,45189.0,45189.0,45294,45294,45294,45294,45294,45294,45294,45294,45294,45294,45294.0,45294,43363,42985.0,45294,45294,41434.0,45294,45294.0,45294.0,45294.0,45294.0,45294.0,45294.0,45294,45294,45050
unique,,,365,7,903,3,29.0,,445,69,17,3,4,3,10,3,10,2,,23,4051,,5,5,,4,,,,,,,3,29,93
top,,,2021-08-08,sábado,19:00:00,SC,101.0,,CURITIBA,Velocidade Incompatível,Colisão traseira,Com Vítimas Feridas,Pleno dia,Crescente,Céu Claro,Simples,Reta,Não,,Automóvel,VW/GOL 1.0/GOL 1.0/GOL 1.0/GOL 1.0/GOL 1.0/GOL...,,Condutor,Ileso,,Masculino,,,,,,,SPRF-SC,DEL01-PR,UOP01-DEL01-SC
freq,,,254,7949,629,17700,9279.0,,1967,4762,10661,35251,25015,23909,25188,22331,26154,23837,,21667,580,,33324,19890,,31936,,,,,,,17761,6824,2202
mean,369779.087738,825204.775467,,,,,,223.602494,,,,,,,,,,,667259.903144,,,2009.707945,,,38.957692,,0.439131,0.365633,0.107476,0.027443,-26.9353,-50.643771,,,
std,21912.938871,49683.469822,,,,,,180.635497,,,,,,,,,,,41577.640559,,,8.242937,,,45.466436,,0.496287,0.481613,0.30972,0.163372,1.972519,1.853267,,,
min,331693.0,738216.0,,,,,,0.0,,,,,,,,,,,595103.0,,,1900.0,,,0.0,,0.0,0.0,0.0,0.0,-33.689819,-57.502806,,,
25%,351063.0,782620.25,,,,,,92.4,,,,,,,,,,,631509.25,,,2006.0,,,26.0,,0.0,0.0,0.0,0.0,-28.237665,-51.9375,,,
50%,369945.0,824888.5,,,,,,174.4,,,,,,,,,,,667671.0,,,2011.0,,,36.0,,0.0,0.0,0.0,0.0,-26.879056,-50.228897,,,
75%,388620.0,868126.75,,,,,,321.0,,,,,,,,,,,703295.0,,,2016.0,,,48.0,,1.0,1.0,0.0,0.0,-25.498782,-48.982243,,,


In [78]:
# Peek at the first five filtered rows.
df2.head(n=5)

Unnamed: 0,id,pesid,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,id_veiculo,tipo_veiculo,marca,ano_fabricacao_veiculo,tipo_envolvido,estado_fisico,idade,sexo,ilesos,feridos_leves,feridos_graves,mortos,latitude,longitude,regional,delegacia,uop
7,331855,738445,2021-01-01,sexta-feira,15:45:00,PR,277.0,51.3,SAO JOSE DOS PINHAIS,Pista Escorregadia,Saída de leito carroçável,Com Vítimas Feridas,Pleno dia,Decrescente,Garoa/Chuvisco,Dupla,Curva,Não,595297,Caminhão,VW/8.160 DRC 4X2/8.160 DRC 4X2/8.160 DRC 4X2/8...,2016,Condutor,Ileso,39.0,Masculino,1,0,0,0,-25.59516,-48.907008,SPRF-PR,DEL01-PR,UOP05-DEL01-PR
9,331864,738673,2021-01-01,sexta-feira,17:10:00,SC,470.0,79.1,INDAIAL,Transitar na contramão,Colisão frontal,Com Vítimas Fatais,Pleno dia,Crescente,Nublado,Simples,Reta,Não,595462,Caminhão,FORD/CARGO 2422 T/CARGO 2422 T/CARGO 2422 T/CA...,2005,Passageiro,Lesões Leves,27.0,Feminino,0,1,0,0,-26.951565,-49.306534,SPRF-SC,DEL04-SC,UOP01-DEL04-SC
10,331864,738672,2021-01-01,sexta-feira,17:10:00,SC,470.0,79.1,INDAIAL,Transitar na contramão,Colisão frontal,Com Vítimas Fatais,Pleno dia,Crescente,Nublado,Simples,Reta,Não,595462,Caminhão,FORD/CARGO 2422 T/CARGO 2422 T/CARGO 2422 T/CA...,2005,Condutor,Lesões Leves,24.0,Masculino,0,1,0,0,-26.951565,-49.306534,SPRF-SC,DEL04-SC,UOP01-DEL04-SC
11,331864,741989,2021-01-01,sexta-feira,17:10:00,SC,470.0,79.1,INDAIAL,Transitar na contramão,Colisão frontal,Com Vítimas Fatais,Pleno dia,Crescente,Nublado,Simples,Reta,Não,595462,Caminhão,FORD/CARGO 2422 T/CARGO 2422 T/CARGO 2422 T/CA...,2005,Testemunha,Não Informado,,Não Informado,0,0,0,0,-26.951565,-49.306534,SPRF-SC,DEL04-SC,UOP01-DEL04-SC
13,331910,738556,2021-01-01,sexta-feira,19:50:00,SC,470.0,130.0,LONTRAS,Acessar a via sem observar a presença dos outr...,Colisão transversal,Com Vítimas Feridas,Plena Noite,Decrescente,Nublado,Simples,Não Informado,Sim,595375,Caminhão-trator,SCANIA/P 360 A4X2/P 360 A4X2/P 360 A4X2/P 360 ...,2018,Condutor,Ileso,37.0,Masculino,1,0,0,0,-27.160376,-49.55658,SPRF-SC,DEL04-SC,UOP02-DEL04-SC


In [79]:
# Persist the filtered, typed person-level frame (row index is not written).
df2.to_csv(
    path_or_buf=f"../data/b-data-understanding/{YEAR}-grouped-by-people.csv",
    index=False,
)

In [80]:
# Profile report

# Build the profiling report for the person-level frame and export it as HTML.
profile2 = ProfileReport(df=df2, title="Profiling Report")
profile2.to_file(output_file="../output/b-data-understanding/profile2.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Profiling data grouped by people all (acidentes2021_todas_causas_tipos.csv)
# ======================================================================================

In [82]:
# Read data

# Raw person-level file with all causes and accident types. Both the folder and the
# file name carry the year, so the whole path follows YEAR (for YEAR = "2021" this is
# still .../acidentes2021_todas_causas_tipos.csv).
file3 = f"../data/raw/{YEAR}-grouped-by-people-all/acidentes{YEAR}_todas_causas_tipos.csv"

# The file is ";"-separated, uses "," as the decimal mark and is ISO-8859-1 encoded.
df3 = pd.read_csv(file3, sep=";", decimal=",", encoding="iso-8859-1")

# Summary of every column (numeric and non-numeric) of the unfiltered data.
df3.describe(include="all")

Unnamed: 0,id,pesid,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_principal,causa_acidente,ordem_tipo_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,id_veiculo,tipo_veiculo,marca,ano_fabricacao_veiculo,tipo_envolvido,estado_fisico,idade,sexo,ilesos,feridos_leves,feridos_graves,mortos,latitude,longitude,regional,delegacia,uop
count,449018.0,409147.0,449018,449018,449018,449018,447870.0,447870.0,449018,449018,449018,449018.0,449018,449018,449018,449018,449018,449018,449018,449018,449018.0,449018,428080,422909.0,449018,449018,359543.0,449018,449018.0,449018.0,449018.0,449018.0,449018.0,449018.0,449018,448872,447950
unique,,,365,7,1302,27,,,1791,2,72,,17,3,4,3,10,3,10,2,,25,6555,,6,5,,4,,,,,,,27,150,369
top,,,2021-07-07,domingo,17:30:00,MG,,,BRASILIA,Sim,Velocidade Incompatível,,Colisão traseira,Com Vítimas Feridas,Pleno dia,Crescente,Céu Claro,Simples,Reta,Não,,Automóvel,SR/RANDON SR CA/RANDON SR CA/RANDON SR CA/RAND...,,Condutor,Ileso,,Masculino,,,,,,,SPRF-MG,DEL01-PR,UOP01-DEL01-SP
freq,,,3323,79924,6702,60684,,,6502,279255,55869,,69234,321699,245374,235500,263156,244889,258215,271947,,165554,6775,,268165,158094,,280628,,,,,,,60476,16233,6612
mean,370948.730173,827864.4,,,,,214.081439,279.217411,,,,1.714412,,,,,,,,,669342.901612,,,2010.889033,,,39.078288,,0.352088,0.308181,0.119307,0.046593,-17.856424,-47.14101,,,
std,22768.496844,51011.82,,,,,129.088957,237.3073,,,,1.124472,,,,,,,,,42481.100951,,,7.657329,,,42.743596,,0.477622,0.461742,0.32415,0.210765,7.895926,6.406743,,,
min,331693.0,738204.0,,,,,10.0,0.0,,,,1.0,,,,,,,,,595095.0,,,1900.0,,,0.0,,0.0,0.0,0.0,0.0,-33.689819,-72.11938,,,
25%,352086.0,784965.0,,,,,101.0,83.5,,,,1.0,,,,,,,,,633504.0,,,2008.0,,,27.0,,0.0,0.0,0.0,0.0,-23.618093,-51.115413,,,
50%,370970.0,827781.0,,,,,163.0,217.0,,,,1.0,,,,,,,,,669808.5,,,2012.0,,,37.0,,0.0,0.0,0.0,0.0,-19.77654,-47.81714,,,
75%,389188.0,869845.0,,,,,343.0,433.1,,,,2.0,,,,,,,,,704382.0,,,2017.0,,,48.0,,1.0,1.0,0.0,0.0,-11.435201,-43.023224,,,


In [84]:
# Basic filtering and data type conversion

# Target dtype for every column of the "all causes and types" person-level frame.
dtypes3 = {
    "id": int,
    # IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
    # "Int64": Nullable integer data type
    "pesid": "Int64",
    # Kept as "object" so that it matches the dtype used for this column in the
    # dtypes1 and dtypes2 schemas.
    # Change it to a more specific time-related type if needed (e.g. "datetime64[ns]")
    "data_inversa": "object",
    "dia_semana": "category",
    # Change it to a more specific time-related type if needed
    "horario": "category",
    "uf": "category",
    "br": "category",
    # TODO: category or float64?
    "km": "float64",
    "municipio": "category",
    "causa_principal": "category",
    "causa_acidente": "category",
    "ordem_tipo_acidente": int,
    "tipo_acidente": "category",
    "classificacao_acidente": "category",
    "fase_dia": "category",
    "sentido_via": "category",
    "condicao_metereologica": "category",
    "tipo_pista": "category",
    "tracado_via": "category",
    "uso_solo": "category",
    "id_veiculo": int,
    "tipo_veiculo": "category",
    "marca": "category",
    # IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
    # "Int64": Nullable integer data type
    "ano_fabricacao_veiculo": "Int64",
    "tipo_envolvido": "category",
    "estado_fisico": "category",
    # IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer
    # "Int64": Nullable integer data type
    "idade": "Int64",
    "sexo": "category",
    "ilesos": int,
    "feridos_leves": int,
    "feridos_graves": int,
    "mortos": int,
    "latitude": "float64",
    "longitude": "float64",
    "regional": "category",
    "delegacia": "category",
    "uop": "category"
}

# Keep only the rows whose state (uf) is PR, SC or RS, then cast. Casting after the
# filter means the categorical columns only hold categories present in the kept rows.
df3 = df3[df3.uf.isin(["PR", "SC", "RS"])]
df3 = df3.astype(dtypes3)

In [87]:
# Summary statistics of the filtered frame, extended with a "nan" row that holds the
# percentage of missing values per column.
described3 = df3.describe(include="all")
described3.loc["nan"] = df3.isna().mean().mul(100)
described3

Unnamed: 0,id,pesid,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_principal,causa_acidente,ordem_tipo_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,id_veiculo,tipo_veiculo,marca,ano_fabricacao_veiculo,tipo_envolvido,estado_fisico,idade,sexo,ilesos,feridos_leves,feridos_graves,mortos,latitude,longitude,regional,delegacia,uop
count,109799.0,101652.0,109799,109799,109799,109799,109571.0,109571.0,109799,109799,109799,109799.0,109799,109799,109799,109799,109799,109799,109799,109799,109799.0,109799,105818,105004.0,109799,109799,90572.0,109799,109799.0,109799.0,109799.0,109799.0,109799.0,109799.0,109799,109799,108997
unique,,,365,7,903,3,29.0,,445,2,71,,17,3,4,3,10,3,10,2,,23,4570,,6,5,,4,,,,,,,3,29,93
top,,,2021-06-02,sábado,17:30:00,PR,101.0,,CURITIBA,Sim,Velocidade Incompatível,,Colisão traseira,Com Vítimas Feridas,Pleno dia,Crescente,Céu Claro,Simples,Reta,Não,,Automóvel,VW/GOL 1.0/GOL 1.0/GOL 1.0/GOL 1.0/GOL 1.0/GOL...,,Condutor,Ileso,,Masculino,,,,,,,SPRF-PR,DEL01-PR,UOP01-DEL01-SC
freq,,,1109,19637,1972,42176,18466.0,,4235,77979,15157,,18239,82290,59838,58013,59422,56383,58658,61621,,47494,1226,,69728,41426,,69068,,,,,,,42079,16233,4263
mean,370187.840363,826168.967831,,,,,,230.553164,,,,1.593739,,,,,,,,,668074.641791,,,2009.665013,,,38.569017,,0.377289,0.322061,0.107606,0.03796,-26.968303,-50.804915,,,
std,21824.596364,49565.051969,,,,,,181.026074,,,,0.954048,,,,,,,,,41388.098931,,,8.241989,,,45.819972,,0.48471,0.467268,0.309883,0.191101,2.032691,1.850636,,,
min,331693.0,738216.0,,,,,,0.0,,,,1.0,,,,,,,,,595103.0,,,1900.0,,,0.0,,0.0,0.0,0.0,0.0,-33.689819,-57.502806,,,
25%,351768.0,784240.75,,,,,,100.0,,,,1.0,,,,,,,,,632905.0,,,2006.0,,,26.0,,0.0,0.0,0.0,0.0,-28.455675,-52.046432,,,
50%,370439.0,826280.0,,,,,,184.6,,,,1.0,,,,,,,,,668645.0,,,2011.0,,,36.0,,0.0,0.0,0.0,0.0,-26.881372,-50.651211,,,
75%,388768.0,868731.0,,,,,,330.8,,,,2.0,,,,,,,,,703668.5,,,2016.0,,,47.0,,1.0,1.0,0.0,0.0,-25.478651,-49.100332,,,


In [89]:
# Peek at the first five filtered rows.
df3.head(n=5)

Unnamed: 0,id,pesid,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_principal,causa_acidente,ordem_tipo_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,id_veiculo,tipo_veiculo,marca,ano_fabricacao_veiculo,tipo_envolvido,estado_fisico,idade,sexo,ilesos,feridos_leves,feridos_graves,mortos,latitude,longitude,regional,delegacia,uop
0,331693,739775,2021-01-01,sexta-feira,00:02:00,SC,101.0,314.9,LAGUNA,Sim,Ingestão de álcool pelo condutor,3,Capotamento,Com Vítimas Feridas,Plena Noite,Crescente,Nublado,Dupla,Reta,Não,595104,Automóvel,FIAT/UNO VIVACE 1.0/UNO VIVACE 1.0/UNO VIVACE ...,2010,Condutor,Ileso,35,Masculino,1,0,0,0,-28.430294,-48.849251,SPRF-SC,DEL02-SC,UOP01-DEL02-SC
1,331693,739774,2021-01-01,sexta-feira,00:02:00,SC,101.0,314.9,LAGUNA,Sim,Ingestão de álcool pelo condutor,3,Capotamento,Com Vítimas Feridas,Plena Noite,Crescente,Nublado,Dupla,Reta,Não,595103,Automóvel,I/RENAULT CLIO CAM 10H3P/RENAULT CLIO CAM 10H3...,2010,Passageiro,Lesões Leves,22,Feminino,0,1,0,0,-28.430294,-48.849251,SPRF-SC,DEL02-SC,UOP01-DEL02-SC
2,331693,738219,2021-01-01,sexta-feira,00:02:00,SC,101.0,314.9,LAGUNA,Sim,Ingestão de álcool pelo condutor,3,Capotamento,Com Vítimas Feridas,Plena Noite,Crescente,Nublado,Dupla,Reta,Não,595103,Automóvel,I/RENAULT CLIO CAM 10H3P/RENAULT CLIO CAM 10H3...,2010,Condutor,Ileso,27,Masculino,1,0,0,0,-28.430294,-48.849251,SPRF-SC,DEL02-SC,UOP01-DEL02-SC
3,331693,739775,2021-01-01,sexta-feira,00:02:00,SC,101.0,314.9,LAGUNA,Sim,Ingestão de álcool pelo condutor,2,Colisão com objeto,Com Vítimas Feridas,Plena Noite,Crescente,Nublado,Dupla,Reta,Não,595104,Automóvel,FIAT/UNO VIVACE 1.0/UNO VIVACE 1.0/UNO VIVACE ...,2010,Condutor,Ileso,35,Masculino,1,0,0,0,-28.430294,-48.849251,SPRF-SC,DEL02-SC,UOP01-DEL02-SC
4,331693,739774,2021-01-01,sexta-feira,00:02:00,SC,101.0,314.9,LAGUNA,Sim,Ingestão de álcool pelo condutor,2,Colisão com objeto,Com Vítimas Feridas,Plena Noite,Crescente,Nublado,Dupla,Reta,Não,595103,Automóvel,I/RENAULT CLIO CAM 10H3P/RENAULT CLIO CAM 10H3...,2010,Passageiro,Lesões Leves,22,Feminino,0,1,0,0,-28.430294,-48.849251,SPRF-SC,DEL02-SC,UOP01-DEL02-SC


In [90]:
# Profile report

# Build the profiling report for the "all causes and types" frame and export it as HTML.
profile3 = ProfileReport(df=df3, title="Profiling Report")
profile3.to_file(output_file="../output/b-data-understanding/profile3.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [93]:
# Persist the filtered, typed "all causes and types" frame (row index is not written).
df3.to_csv(
    path_or_buf=f"../data/b-data-understanding/{YEAR}-grouped-by-people-all.csv",
    index=False,
)