In [16]:
##### This notebook predicts prioritization clusters by applying the saved MinMax scaler and K-Prototypes model to new
    ##### 3-year raw data. For illustrative purposes, the same data used to estimate the model can be used instead

import pandas as pd
import joblib
from data_creation import data_for_clustering

##### The methodology works on a 3-year window of data ending on a cut-off date. Whoever runs the notebook can edit the
##### (year, month, day) values below; the data-creation helper is then asked for the 3-year data up to that date
cutoff_year, cutoff_month, cutoff_day = 2022, 12, 31
cluster_df = data_for_clustering(cutoff_year, cutoff_month, cutoff_day)
##### For illustration only: if the raw data generated by the model_creation notebook was saved, it can be loaded
##### directly instead of being rebuilt
#cluster_df = pd.read_csv("raw_data_predict.csv")

In [17]:
#### Load the MinMax scaler that was fitted and saved when the model was created
##### NOTE: joblib.load unpickles the file, so only load scaler files that come from a trusted source
scaler = joblib.load("scaler.mod")

##### Select the features used for the prediction (a copy, so that cluster_df itself is left untouched)
data_cluster = cluster_df[["HORARIO", "accidentes", "muertes", "heridos", "vulnerables"]].copy()

##### Scale the continuous features with the SAVED scaler parameters. transform() is used on purpose instead of
##### fit_transform(): re-fitting on the new data would overwrite the min/max values loaded from "scaler.mod", and the
##### scaled features would no longer be on the scale the saved K-Prototypes model expects
continuous_cols = ["accidentes", "muertes", "heridos"]
data_cluster[continuous_cols] = scaler.transform(data_cluster[continuous_cols])

##### Manual equivalent of the transform() call above, kept for reference (assumes the scaler was created with the
##### default feature_range of (0, 1) -- confirm against the model_creation notebook):
# maxv = scaler.data_max_.tolist()
# minv = scaler.data_min_.tolist()
# for i in range(0, len(continuous_cols)):
#     data_cluster[continuous_cols[i]] = (data_cluster[continuous_cols[i]] - minv[i]) / (maxv[i] - minv[i])

In [18]:
#### Load the saved K-Prototypes model
##### NOTE: joblib.load unpickles the file, so only load model files that come from a trusted source
clusterer = joblib.load("kprototypes.mod")

##### Run the prediction. categorical = [0, 4] are the column positions in data_cluster of the two features that are
##### not scaled ("HORARIO" and "vulnerables")
clusters = clusterer.predict(data_cluster, categorical = [0, 4])

##### From the analyses run when creating the model, we know what the labels in "clusters" represent. However, if the
##### K-Prototypes model is re-estimated, "dictp" must be revised
dictp = {0: "1 Priorizado", 1: "2 Complementario", 2: "3 NA"}

##### Attach the labels using cluster_df's own index, so every label stays on the row it was predicted for even when
##### the index is not 0..n-1 (a plain axis=1 concat aligns on index and would produce misaligned / NaN rows)
labels = pd.Series(clusters, index = cluster_df.index, name = "Prioridad")

##### Fail early with a clear message if the model returns a label that dictp does not describe; otherwise the
##### unmapped integers would be mixed with the text labels and break the sort below
unknown_labels = sorted(set(labels.unique().tolist()) - set(dictp))
if unknown_labels:
    raise ValueError(f"Cluster labels {unknown_labels} are missing from dictp; revise dictp for the current model")

##### assign() returns a new frame and overwrites "Prioridad" if it already exists, so re-running this cell does not
##### pile up duplicate columns
cluster_df = cluster_df.assign(Prioridad = labels.map(dictp))

##### Rank the rows: priority group first (ascending), then the severity columns in descending order
datap = cluster_df.sort_values(
    by = ["Prioridad", "vulnerables", "muertes", "muertes_vulnerables", "heridos_vulnerables"],
    ascending = [True, False, False, False, False])
datap.to_csv("prioritized_corridors.csv", index = False)

##### cluster_df, data_cluster and datap are intentionally kept in memory: the next cell displays datap

In [22]:
##### Preview the first five rows of the sorted result
datap.head(n = 5)

Unnamed: 0,MVINOMBRE,HORARIO,accidentes,muertes,heridos,muertes_vulnerables,heridos_vulnerables,vulnerables,Prioridad
58,AVENIDA BOYACA,Nocturno 22-2,284,24,249,22,145,2,1 Priorizado
74,AVENIDA CARACAS,DiurnoTarde 12-18,507,15,427,15,265,2,1 Priorizado
77,AVENIDA CARACAS,NocturnoTarde 18-22,297,15,287,12,206,2,1 Priorizado
113,AVENIDA CIUDAD DE CALI,DiurnoMan 5-8,258,14,187,13,138,2,1 Priorizado
56,AVENIDA BOYACA,DiurnoTarde 12-18,1068,13,452,11,300,2,1 Priorizado
