In [1]:
# DATA PREPARATION
# --------------------------------------------------------------------------------------

# Select data: Determine which data sets will be used and document reasons for 
# inclusion/exclusion.
# 
# Clean data: Often this is the lengthiest task. Without it, you’ll likely fall victim
# to garbage-in, garbage-out. A common practice during this task is to correct, impute,
# or remove erroneous values.
# 
# Construct data: Derive new attributes that will be helpful. For example, derive 
# someone’s body mass index from height and weight fields.
# 
# Integrate data: Create new data sets by combining data from multiple sources.
# 
# Format data: Re-format data as necessary. For example, you might convert string values
# that store numbers to numeric values so that you can perform mathematical operations.

In [2]:
import os

import pandas as pd
from ydata_profiling import ProfileReport

# Notebook-wide configuration
# --------------------------------------------------------------------------------------

# Never truncate columns when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Year of the dataset being prepared; interpolated into input/output file names.
YEAR = "2021"

# Make sure the folders this notebook writes to exist (no-op when already present).
# Plain strings: these paths have no placeholders, so no f-string prefix is needed.
for directory in ("../output/c-data-preparation", "../data/c-data-preparation"):
    os.makedirs(directory, exist_ok=True)

In [3]:
# Load the per-occurrence dataset written by the data-understanding step
# --------------------------------------------------------------------------------------

file = f"../data/b-data-understanding/{YEAR}-grouped-by-occurrence.csv"
df = pd.read_csv(file)

# Percentage of missing values per column, appended to the summary as a "nan" row.
missing_pct = df.isna().mean().mul(100)

described = df.describe(include="all")
described.loc["nan"] = missing_pct

# Last expression of the cell: rendered as the cell output.
described

Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop
count,19770.0,19770,19770,19770,19770,19727.0,19727.0,19770,19770,19770,19770,19770,19770,19770,19770,19770,19770,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770,19770,19669
unique,,365,7,903,3,,,445,69,17,3,4,3,10,3,10,2,,,,,,,,,,,3,29,93
top,,2021-08-08,sábado,19:00:00,SC,,,CURITIBA,Velocidade Incompatível,Colisão traseira,Com Vítimas Feridas,Pleno dia,Crescente,Céu Claro,Simples,Reta,Não,,,,,,,,,,,SPRF-SC,DEL01-PR,UOP01-DEL01-SC
freq,,101,3403,254,7890,,,900,2329,3888,14922,10610,10305,10885,9455,11398,10363,,,,,,,,,,,7918,3032,1035
mean,369542.644461,,,,,256.400568,221.504648,,,,,,,,,,,2.291047,0.062873,0.837683,0.246232,1.00607,0.138189,1.083915,1.686141,-26.936908,-50.60901,,,
std,21759.173611,,,,,133.527176,179.519283,,,,,,,,,,,1.639392,0.310085,0.944891,0.565921,1.251622,0.399331,1.047696,0.71906,1.969246,1.856358,,,
min,331693.0,,,,,101.0,0.0,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-33.689819,-57.502806,,,
25%,350965.75,,,,,116.0,92.0,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-28.19626,-51.914951,,,
50%,369583.0,,,,,277.0,173.0,,,,,,,,,,,2.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,-26.880509,-50.158686,,,
75%,388376.75,,,,,376.0,313.0,,,,,,,,,,,3.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,-25.502576,-48.956708,,,


In [4]:
# Select features
# --------------------------------------------------------------------------------------

# Only some features will be used for clustering. However, other features are also
# interesting for further analysis, when the clusters are known. Therefore, we will keep
# all of them in the dataset.

In [5]:
# Gather additional data
# --------------------------------------------------------------------------------------

In [6]:
# Clean data
# --------------------------------------------------------------------------------------

# Step 1 - remove every row that has at least one missing value, then renumber the
# index so it is contiguous again.
#
# TODO: impute missing values instead of dropping the rows.
#   Only 3 columns have missing values: br, km, uop.
#   They can be imputed by considering the lat, long, and city features.
rows_before = len(df)
df = df.dropna().reset_index(drop=True)
print(f"Number of rows dropped: {rows_before - len(df)}")

# Step 2 - remove columns that carry no information for the analysis.
df = df.drop(columns=["id"])

# Step 3 - normalise text columns.
#
# TODO: probably unnecessary, the text looks consistent (e.g. no typos). If needed:
#   - lower-case strings
#   - remove punctuation
#   - remove special characters
#   - remove accents
#   - remove extra spaces

Number of rows dropped: 139


In [7]:
# Feature engineering
# --------------------------------------------------------------------------------------

# Build one timestamp per occurrence from the date ("data_inversa") and time-of-day
# ("horario") text columns. Parsing the concatenated strings in a single vectorised
# call avoids a row-wise `df.apply(..., axis=1)` over Python date/time objects; the
# explicit format keeps parsing strict, so malformed values still raise.
timestamp = pd.to_datetime(
    df["data_inversa"] + " " + df["horario"], format="%Y-%m-%d %H:%M:%S"
)

# Split the timestamp into numeric calendar/clock components and drop the original
# text columns, which are fully represented by the new features.
df = df.assign(
    ano=timestamp.dt.year,
    mes=timestamp.dt.month,
    dia=timestamp.dt.day,
    # The day of the week is already available as the "dia_semana" column.
    hora=timestamp.dt.hour,
    minuto=timestamp.dt.minute,
).drop(columns=["data_inversa", "horario"])

In [8]:
# Data type conversion
# --------------------------------------------------------------------------------------

# Columns grouped by the dtype they are cast to. The id, data_inversa and horario
# columns are intentionally not listed here.
categorical_columns = [
    "dia_semana",
    "uf",
    "br",
    "municipio",
    "causa_acidente",
    "tipo_acidente",
    "classificacao_acidente",
    "fase_dia",
    "sentido_via",
    "condicao_metereologica",
    "tipo_pista",
    "tracado_via",
    "uso_solo",
    "regional",
    "delegacia",
    "uop",
]
integer_columns = [
    "pessoas",
    "mortos",
    "feridos_leves",
    "feridos_graves",
    "ilesos",
    "ignorados",
    "feridos",
    "veiculos",
    "ano",
    "dia",
    "mes",
    "hora",
    "minuto",
]
float_columns = ["km", "latitude", "longitude"]

# Column name -> target dtype, as expected by DataFrame.astype.
dtypes = {
    **dict.fromkeys(categorical_columns, "category"),
    **dict.fromkeys(integer_columns, "Int64"),
    **dict.fromkeys(float_columns, "float64"),
}

df = df.astype(dtypes)

# Save non-preprocessed data (typed, but neither encoded nor scaled)
raw_path = f"../data/c-data-preparation/{YEAR}-raw.csv"
df.to_csv(raw_path, index=False)

In [9]:
# Encoding and normalization
# --------------------------------------------------------------------------------------

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Check that every column has one of the dtypes this cell knows how to handle.
# The error message is derived from `allowed_dtypes` so the two cannot drift apart,
# and it names the offending columns to make a failure actionable.
allowed_dtypes = ["category", "Int64", "int64", "float64"]
invalid_columns = {
    col: dtype.name
    for col, dtype in df.dtypes.items()
    if dtype.name not in allowed_dtypes
}
msg = (
    f"Invalid data type(s): {invalid_columns}. "
    f"Allowed data types: {', '.join(allowed_dtypes)}"
)
assert not invalid_columns, msg

# NOTE: no need to save label encoder or scaler, since after clustering we add a new
# "cluster" column to the non-preprocessed data

# Encode categorical columns
# --------------------------------------------------------------------------------------

# Each categorical column is replaced by integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as a target (y) encoder; the codes
# impose an order on nominal categories - confirm this is acceptable for clustering.
for column in df.select_dtypes(include=["category"]):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])

# Normalize all columns
# --------------------------------------------------------------------------------------

# NOTE: MinMaxScaler is what is actually applied, to every column (latitude and
# longitude included). An earlier version of this note recommended StandardScaler for
# lat/long; both scalers rescale each column independently, so neither keeps the
# relative geographic scale between the two coordinates.
# TODO(review): decide which scaler the clustering step should use; StandardScaler
# (imported in this cell) is a drop-in replacement on the next line.

scaler = MinMaxScaler()
# Rebuild the frame from the scaled array, keeping both column names and row index.
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

df.head()

Unnamed: 0,dia_semana,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop,ano,mes,dia,hora,minuto
0,0.666667,0.0,0.214286,0.070178,0.842825,0.735294,0.9375,0.5,1.0,1.0,0.222222,0.0,0.0,0.0,0.056604,0.0,0.029412,0.0,0.065217,0.0,0.027778,0.181818,0.741768,0.9558,0.0,0.0,0.956044,0.0,0.0,0.0,0.652174,0.762712
1,0.666667,1.0,0.785714,0.108208,0.412301,0.941176,0.25,0.0,1.0,0.0,0.777778,1.0,0.555556,0.0,0.150943,0.105263,0.147059,0.066667,0.0,0.166667,0.166667,0.181818,0.617471,0.911375,1.0,0.392857,0.120879,0.0,0.0,0.0,0.73913,0.169492
2,0.666667,1.0,0.785714,0.177839,0.492027,0.0,0.5,0.5,0.666667,1.0,0.777778,1.0,0.333333,1.0,0.056604,0.0,0.0,0.133333,0.043478,0.0,0.055556,0.090909,0.598337,0.883571,1.0,0.392857,0.43956,0.0,0.0,0.0,0.826087,0.847458
3,0.666667,0.0,0.035714,0.080711,0.15262,0.397059,0.0625,0.0,0.666667,0.0,0.0,0.0,0.333333,1.0,0.018868,0.052632,0.0,0.0,0.021739,0.0,0.0,0.0,0.768508,0.95171,0.0,0.0,0.318681,0.0,0.0,0.0,0.869565,0.677966
4,0.833333,1.0,0.0,0.153215,0.432802,0.911765,0.5625,0.5,1.0,1.0,0.888889,0.0,0.333333,0.0,0.056604,0.0,0.029412,0.066667,0.043478,0.0,0.055556,0.181818,0.627155,0.976448,1.0,0.392857,0.879121,0.0,0.0,0.033333,0.391304,0.423729


In [10]:
# Summary statistics of the current `df`, plus a "nan" row holding the percentage of
# missing values per column.
summary = df.describe(include="all")
summary.loc["nan"] = 100 * df.isna().mean()

described = summary
described

Unnamed: 0,dia_semana,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop,ano,mes,dia,hora,minuto
count,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0
mean,0.498684,0.516912,0.295115,0.302523,0.47415,0.491795,0.565184,0.568208,0.799891,0.477153,0.306946,0.527762,0.421318,0.476338,0.02434,0.003303,0.024677,0.016338,0.021848,0.023025,0.030114,0.062374,0.617931,0.7669,0.518415,0.350304,0.41754,0.0,0.514664,0.48691,0.567246,0.426334
std,0.333023,0.438261,0.287587,0.245757,0.277555,0.336695,0.286009,0.237652,0.257806,0.49949,0.321414,0.473004,0.2242,0.499453,0.030972,0.016334,0.027795,0.037647,0.027222,0.066587,0.029097,0.065363,0.180475,0.206614,0.438099,0.284449,0.301128,0.0,0.314223,0.293317,0.271017,0.295311
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.166667,0.0,0.035714,0.125718,0.241458,0.191176,0.375,0.5,0.666667,0.0,0.111111,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.502259,0.620926,0.0,0.071429,0.131868,0.0,0.272727,0.233333,0.347826,0.169492
50%,0.5,0.5,0.214286,0.235294,0.464692,0.573529,0.5625,0.5,1.0,0.0,0.111111,0.5,0.555556,0.0,0.018868,0.0,0.029412,0.0,0.021739,0.0,0.027778,0.090909,0.623759,0.819451,0.5,0.321429,0.373626,0.0,0.545455,0.466667,0.608696,0.457627
75%,0.833333,1.0,0.535714,0.427565,0.708428,0.808824,0.9375,0.5,1.0,1.0,0.777778,1.0,0.555556,1.0,0.037736,0.0,0.029412,0.0,0.021739,0.0,0.027778,0.090909,0.749629,0.950791,1.0,0.607143,0.692308,0.0,0.818182,0.733333,0.782609,0.677966
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Persist the current `df` as the preprocessed dataset (row index is not written).
preprocessed_path = f"../data/c-data-preparation/{YEAR}-preprocessed.csv"
df.to_csv(path_or_buf=preprocessed_path, index=False)

In [12]:
# Profile report
#
# Build a ydata-profiling report for the current `df` and export it as an HTML file
# in the output folder.

profile_path = f"../output/c-data-preparation/{YEAR}-profile.html"

profile = ProfileReport(df, title="Profiling Report")
profile.to_file(profile_path)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]