In [13]:
# DATA PREPARATION
# --------------------------------------------------------------------------------------

# Select data: Determine which data sets will be used and document reasons for 
# inclusion/exclusion.
# 
# Clean data: Often this is the lengthiest task. Without it, you’ll likely fall victim
# to garbage-in, garbage-out. A common practice during this task is to correct, impute,
# or remove erroneous values.
# 
# Construct data: Derive new attributes that will be helpful. For example, derive 
# someone’s body mass index from height and weight fields.
# 
# Integrate data: Create new data sets by combining data from multiple sources.
# 
# Format data: Re-format data as necessary. For example, you might convert string values
# that store numbers to numeric values so that you can perform mathematical operations.

In [14]:
import os

import pandas as pd
from ydata_profiling import ProfileReport

# Show every column when a wide DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Year of the dataset to prepare.
YEAR = "2021"

# Make sure the folders this notebook writes to exist (no error if they already do).
for directory in ("../output/c-data-preparation", "../data/c-data-preparation"):
    os.makedirs(directory, exist_ok=True)

In [15]:
# Read data: load the occurrence-level CSV for YEAR from the b-data-understanding folder

file = f"../data/b-data-understanding/{YEAR}-grouped-by-occurrence.csv"
df = pd.read_csv(file)

# Percentage of missing values per column, appended to the summary as a "nan" row
missing_pct = df.isna().mean() * 100

described = df.describe(include="all")
described.loc["nan"] = missing_pct
described

Unnamed: 0,id,data_inversa,dia_semana,horario,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop
count,19770.0,19770,19770,19770,19770,19727.0,19727.0,19770,19770,19770,19770,19770,19770,19770,19770,19770,19770,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770.0,19770,19770,19669
unique,,365,7,903,3,,,445,69,17,3,4,3,10,3,10,2,,,,,,,,,,,3,29,93
top,,2021-08-08,sábado,19:00:00,SC,,,CURITIBA,Velocidade Incompatível,Colisão traseira,Com Vítimas Feridas,Pleno dia,Crescente,Céu Claro,Simples,Reta,Não,,,,,,,,,,,SPRF-SC,DEL01-PR,UOP01-DEL01-SC
freq,,101,3403,254,7890,,,900,2329,3888,14922,10610,10305,10885,9455,11398,10363,,,,,,,,,,,7918,3032,1035
mean,369542.644461,,,,,256.400568,221.504648,,,,,,,,,,,2.291047,0.062873,0.837683,0.246232,1.00607,0.138189,1.083915,1.686141,-26.936908,-50.60901,,,
std,21759.173611,,,,,133.527176,179.519283,,,,,,,,,,,1.639392,0.310085,0.944891,0.565921,1.251622,0.399331,1.047696,0.71906,1.969246,1.856358,,,
min,331693.0,,,,,101.0,0.0,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-33.689819,-57.502806,,,
25%,350965.75,,,,,116.0,92.0,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-28.19626,-51.914951,,,
50%,369583.0,,,,,277.0,173.0,,,,,,,,,,,2.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,-26.880509,-50.158686,,,
75%,388376.75,,,,,376.0,313.0,,,,,,,,,,,3.0,0.0,1.0,0.0,1.0,0.0,1.0,2.0,-25.502576,-48.956708,,,


In [16]:
# Select features
# --------------------------------------------------------------------------------------

# Only some features will be used for clustering. However, other features are also
# interesting for further analysis, when the clusters are known. Therefore, we will keep
# all of them in the dataset.

In [17]:
# Gather additional data
# --------------------------------------------------------------------------------------

In [18]:
# Clean data
# --------------------------------------------------------------------------------------

# 1. Drop nans: discard every row with at least one missing value, then renumber the
#    index from 0

rows_before = len(df)
df = df.dropna().reset_index(drop=True)
rows_after = len(df)

# TODO: impute missing values instead of dropping the rows
#
# Only 3 columns have missing values: br, km, uop
# They could be imputed by considering the lat, long, and city features

print(f"Number of rows dropped: {rows_before - rows_after}")

# 2. Drop useless columns

df = df.drop(columns="id")

# 3. Format text data

# TODO: probably not necessary, the text looks consistent (e.g. no typos)
#
# str: lower case
# str: remove punctuation
# str: remove special characters
# str: remove accents
# str: remove extra spaces

Number of rows dropped: 139


In [19]:
# Feature engineering
# --------------------------------------------------------------------------------------

# Build one timestamp per row from the date ("data_inversa", YYYY-MM-DD) and the time
# ("horario", HH:MM:SS) columns. The two texts are concatenated and parsed in a single
# vectorized call instead of combining date/time objects row by row with `df.apply`.
timestamp = pd.to_datetime(
    df["data_inversa"].astype(str) + " " + df["horario"].astype(str),
    format="%Y-%m-%d %H:%M:%S",
)

# Derive the calendar/clock components and drop the raw text columns. The day of the
# week is not derived because the dataset already has a "dia_semana" column.
df = df.assign(
    ano=timestamp.dt.year,
    mes=timestamp.dt.month,
    dia=timestamp.dt.day,
    hora=timestamp.dt.hour,
    minuto=timestamp.dt.minute,
).drop(columns=["data_inversa", "horario"])

In [20]:
# Data type conversion
# --------------------------------------------------------------------------------------

# Columns grouped by their target dtype. "id", "data_inversa" and "horario" are not
# listed: they were dropped from the frame in the cleaning and feature-engineering cells.
categorical_columns = [
    "dia_semana",
    "uf",
    "br",
    "municipio",
    "causa_acidente",
    "tipo_acidente",
    "classificacao_acidente",
    "fase_dia",
    "sentido_via",
    "condicao_metereologica",
    "tipo_pista",
    "tracado_via",
    "uso_solo",
    "regional",
    "delegacia",
    "uop",
]
integer_columns = [
    "pessoas",
    "mortos",
    "feridos_leves",
    "feridos_graves",
    "ilesos",
    "ignorados",
    "feridos",
    "veiculos",
    "ano",
    "dia",
    "mes",
    "hora",
    "minuto",
]
float_columns = ["km", "latitude", "longitude"]

# Column name -> dtype mapping passed to `DataFrame.astype`
dtypes = {
    **dict.fromkeys(categorical_columns, "category"),
    **dict.fromkeys(integer_columns, "Int64"),
    **dict.fromkeys(float_columns, "float64"),
}

df = df.astype(dtypes)

In [21]:
# Save the cleaned, typed dataset (before encoding and scaling) without the index column.
df.to_csv("../data/c-data-preparation/dataset.csv", index=False)

In [22]:
# Encoding and normalization
# --------------------------------------------------------------------------------------

import pickle

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# NOTE: this cell overwrites `df` and both pickle files. Re-running it on its own (without
# re-running the previous cells first) would find no categorical columns, save an empty
# encoder dict, and fit the scaler on values that are already scaled.

# Check if all columns have allowed data types; report the offending columns otherwise
allowed_dtypes = ["category", "Int64", "int64", "float64"]
invalid_dtypes = {
    col: df[col].dtype.name
    for col in df.columns
    if df[col].dtype.name not in allowed_dtypes
}
msg = (
    f"Invalid data type(s): {invalid_dtypes}. "
    f"Allowed data types: {', '.join(allowed_dtypes)}"
)
assert not invalid_dtypes, msg

# Encode categorical columns
# --------------------------------------------------------------------------------------

# One fitted encoder per categorical column, keyed by column name, so that the integer
# codes can be mapped back to the original labels later.
label_encoders = {}

for column in df.select_dtypes(include=["category"]):
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

with open("../output/c-data-preparation/label-encoders.pkl", "wb") as fp:
    pickle.dump(label_encoders, fp)

# Load the label encoders from the pickle file
# with open("../output/c-data-preparation/label-encoders.pkl", "rb") as fp:
#     label_encoders = pickle.load(fp)

# for column in df.select_dtypes(include=["category"]):
#     df[column] = label_encoders[column].inverse_transform(df[column].astype(int))

# Normalize all columns
# --------------------------------------------------------------------------------------

# NOTE: It's better to use StandardScaler instead of MinMaxScaler for lat and long.
# The reason is that latitude and longitude represent angular distances on the Earth's
# surface, and Min-Max scaling would distort these angular distances.
# NOTE(review): both scalers apply an independent linear rescaling to each column, so
# this rationale should be confirmed.

# scaler = MinMaxScaler()
scaler = StandardScaler()

# fit_transform returns a plain array, so the column names are restored explicitly
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# NOTE(review): the file is named "min-max-scaler.pkl" although it stores a
# StandardScaler. The path is kept unchanged here in case other notebooks load it.
with open("../output/c-data-preparation/min-max-scaler.pkl", "wb") as fp:
    pickle.dump(scaler, fp)

# Load the scaler from the pickle file
# with open("../output/c-data-preparation/min-max-scaler.pkl", "rb") as fp:
#     scaler = pickle.load(fp)

df.head()

Unnamed: 0,dia_semana,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop,ano,mes,dia,hora,minuto
0,0.504431,-1.179492,-0.281066,-0.94545,1.328328,0.723221,1.301798,-0.287018,0.776219,1.046786,-0.263606,-1.115796,-1.879257,-0.953745,1.041751,-0.202223,0.170337,-0.433989,1.593202,-0.345792,-0.080292,1.827446,0.686189,0.914287,-1.183359,-1.231548,1.788331,0.0,-1.637935,-1.660056,0.313374,1.139094
1,0.504431,1.102312,1.705964,-0.790699,-0.222842,1.334716,-1.102036,-2.390991,0.776219,-0.955305,1.464914,0.998406,0.598758,-0.953745,4.087807,6.242317,4.403097,1.336876,-0.802581,2.157246,4.693209,1.827446,-0.002548,0.699267,1.09929,0.149601,-0.985191,0.0,-1.637935,-1.660056,0.634235,-0.869757
2,0.504431,1.102312,1.705964,-0.50736,0.064412,-1.460689,-0.227915,-0.287018,-0.516777,1.046786,1.464914,0.998406,-0.392448,1.048498,1.041751,-0.202223,-0.887853,3.107741,0.794608,-0.345792,0.874408,0.436581,-0.108575,0.564696,1.09929,0.149601,0.073127,0.0,-1.637935,-1.660056,0.955095,1.426073
3,0.504431,-1.179492,-0.902013,-0.902587,-1.158468,-0.281378,-1.757627,-2.390991,-0.516777,-0.955305,-0.955013,-1.115796,-0.392448,1.048498,-0.176672,3.020047,-0.887853,-0.433989,-0.003987,-0.345792,-1.034992,-0.954285,0.834357,0.894492,-1.183359,-1.231548,-0.328304,0.0,-1.637935,-1.660056,1.115525,0.852116
4,1.00491,1.102312,-1.026202,-0.607558,-0.148977,1.24736,-0.009384,-0.287018,0.776219,1.046786,1.810617,-1.115796,-0.392448,-0.953745,1.041751,-0.202223,0.170337,1.336876,0.794608,-0.345792,0.874408,1.827446,0.05111,1.014225,1.09929,0.149601,1.532875,0.0,-1.637935,-1.54641,-0.649206,-0.008821


In [23]:
# Summary of the encoded and scaled frame; the "nan" row holds the percentage of missing
# values per column
missing_pct = df.isna().mean() * 100

described = df.describe(include="all")
described.loc["nan"] = missing_pct
described

Unnamed: 0,dia_semana,uf,br,km,municipio,causa_acidente,tipo_acidente,classificacao_acidente,fase_dia,sentido_via,condicao_metereologica,tipo_pista,tracado_via,uso_solo,pessoas,mortos,feridos_leves,feridos_graves,ilesos,ignorados,feridos,veiculos,latitude,longitude,regional,delegacia,uop,ano,mes,dia,hora,minuto
count,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0,19631.0
mean,7.962885000000001e-17,6.623673e-17,-4.7415360000000005e-17,-5.791189e-17,9.193513000000001e-17,0.0,4.832024e-17,-9.917412e-17,-1.4477970000000002e-17,2.099306e-17,-7.311377e-17,4.198612e-17,-6.515088000000001e-17,5.791189e-17,7.238987e-17,7.238987e-17,5.2120700000000006e-17,-5.0672910000000005e-17,-4.343392e-17,-1.447797e-18,3.329934e-17,5.2120700000000006e-17,1.690303e-15,2.823205e-16,-2.9679850000000005e-17,-1.737357e-17,3.329934e-17,0.0,4.632951e-17,5.2120700000000006e-17,-3.329934e-17,-4.198612e-17
std,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,1.000025,0.0,1.000025,1.000025,1.000025,1.000025
min,-1.497485,-1.179492,-1.026202,-1.231015,-1.708354,-1.460689,-1.976158,-2.390991,-3.102769,-0.9553048,-0.9550132,-1.115796,-1.879257,-0.9537454,-0.7858828,-0.2022228,-0.8878534,-0.4339886,-0.802581,-0.3457918,-1.034992,-0.9542851,-3.424008,-3.711855,-1.183359,-1.231548,-1.386621,0.0,-1.637935,-1.660056,-2.093078,-1.443715
25%,-0.9970063,-1.179492,-0.9020131,-0.7194466,-0.8383855,-0.892873,-0.6649754,-0.2870175,-0.5167769,-0.9553048,-0.6093093,-1.115796,-0.392448,-0.9537454,-0.7858828,-0.2022228,-0.8878534,-0.4339886,-0.802581,-0.3457918,-0.08029189,-0.9542851,-0.6409515,-0.7065267,-1.183359,-0.9804298,-0.9486969,0.0,-0.769971,-0.8645367,-0.8096366,-0.8697574
50%,0.003951619,-0.03858993,-0.2810662,-0.2735639,-0.03407527,0.242761,-0.009384194,-0.2870175,0.7762193,-0.9553048,-0.6093093,-0.05869493,0.5987578,-0.9537454,-0.1766716,-0.2022228,0.1703368,-0.4339886,-0.003986666,-0.3457918,-0.08029189,0.4365807,0.03229355,0.254351,-0.04203442,-0.1015166,-0.1458354,0.0,0.09799282,-0.06901751,0.1529443,0.1059706
75%,1.00491,1.102312,0.8366383,0.5088184,0.8441001,0.941613,1.301798,-0.2870175,0.7762193,1.046786,1.464914,0.9984061,0.5987578,1.048498,0.4325397,-0.2022228,0.1703368,-0.4339886,-0.003986666,-0.3457918,-0.08029189,0.4365807,0.7297472,0.8900444,1.09929,0.9029556,0.912482,0.0,0.9659566,0.8401472,0.794665,0.8521156
max,1.505389,1.102312,2.4511,2.838152,1.894628,1.509429,1.520329,1.816956,0.7762193,1.046786,2.156321,0.9984061,2.581169,1.048498,31.50231,61.02091,35.09061,26.12898,35.93276,14.67244,33.33422,14.34524,2.117075,1.12822,1.09929,2.284105,1.934306,0.0,1.544599,1.749312,1.596816,1.942635
,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# Save the encoded and scaled dataset without the index column.
df.to_csv("../data/c-data-preparation/prepared.csv", index=False)

In [25]:
# Profile report

# profile = ProfileReport(df, title="Profiling Report")
# profile.to_file("../output/c-data-preparation/profile.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: 