### Model Training
El objetivo de este notebook es realizar el entrenamiento y el guardado del modelo.

#### Carga de paquetes

In [1]:
import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

#### Configuración

In [2]:
# Single named seed for the notebook. Seeding NumPy's global RNG makes any step
# that draws from it repeatable across "Restart & Run All".
SEED = 79
np.random.seed(SEED)

#### Helpers

#### Cargamos los datos

In [3]:
# Load the feature dataset used for training
df = pd.read_csv("../data/processed/dataset_training_v0.csv")
# Last expression: display (rows, columns) as a quick sanity check
df.shape

(23073, 21)

In [9]:
# Show the first row whose `aprobo` label equals 0, rendered as a dict
df[df["aprobo"].eq(0)].iloc[0].to_dict()

{'particion': 8,
 'split': 'train',
 'al_promedio_general': 4.75,
 'al_tasa_aprobacion': 0.5,
 'al_promedio_parcial': 6.5,
 'al_promedio_score_tp': 84.8,
 'al_promedio_tiempo_sub_tp': 12.5,
 'al_tasa_recuperadas': 0.0,
 'al_n_materias_periodo': 4,
 'cn_promedio_general': 6.15,
 'cn_tasa_aprobacion': 0.85,
 'cn_promedio_parcial': 6.690476190476191,
 'cn_promedio_score_tp': 82.13597122302158,
 'cn_promedio_tiempo_sub_tp': 11.244604316546765,
 'cn_tasa_recuperadas': 0.15,
 'promedio_parcial': 0.0,
 'promedio_recuperatorio': 0.0,
 'promedio_integrador': 0.0,
 'promedio_tareas_tp': 0.0,
 'promedio_tiempo_sub_tp': 0.0,
 'aprobo': 0}

In [5]:
# Show the first row of the dataset as a dict (column name -> value)
df.iloc[0].to_dict()

{'particion': 24,
 'split': 'train',
 'al_promedio_general': 6.833333333333333,
 'al_tasa_aprobacion': 1.0,
 'al_promedio_parcial': 6.333333333333333,
 'al_promedio_score_tp': 74.45081300813008,
 'al_promedio_tiempo_sub_tp': 21.634146341463413,
 'al_tasa_recuperadas': 0.1666666666666666,
 'al_n_materias_periodo': 6,
 'cn_promedio_general': 8.0,
 'cn_tasa_aprobacion': 1.0,
 'cn_promedio_parcial': 8.5,
 'cn_promedio_score_tp': 79.43854166666667,
 'cn_promedio_tiempo_sub_tp': 26.0,
 'cn_tasa_recuperadas': 0.0,
 'promedio_parcial': 0.0,
 'promedio_recuperatorio': 0.0,
 'promedio_integrador': 0.0,
 'promedio_tareas_tp': 81.76666666666667,
 'promedio_tiempo_sub_tp': 33.0,
 'aprobo': 1}

#### División del conjunto de entrenamiento

In [6]:
# Build the features (X) and target (y) from the rows tagged split == "train".
# X keeps every column except the target `aprobo` and the `split` marker.
x_train = df.loc[df["split"].eq("train")].drop(columns=["aprobo", "split"])
y_train = df.loc[df["split"].eq("train"), "aprobo"]

In [None]:
# Gather the dataset splits in one dict keyed by split name;
# each value is an (X, y) tuple. Only "train" is defined here.
datasets = dict(train=(x_train, y_train))

#### Entrenamiento del modelo

##### Función de entrenamiento

In [None]:
def train_model(model: type, datasets: dict, run_params: dict = None):
    """
    Fit a ``StandardScaler`` + classifier pipeline on the training split,
    optionally rebalancing the classes first.

    Args:
        model (type): Estimator *class* (not an instance). It is instantiated
            with its default arguments, e.g. ``XGBClassifier``.
        datasets (dict): Must contain a ``"train"`` entry holding the tuple
            ``(x_train, y_train)``.
        run_params (dict): Optional sampling configuration (default: None,
            meaning no resampling). Recognised keys:

            * ``"over_sampler"``: ``"random_over_sampler"`` enables
              ``RandomOverSampler``; any other value (or a missing key) skips it.
            * ``"over_sampling_strategy"``: forwarded as ``sampling_strategy``
              (falls back to the library default ``"auto"`` when absent).
            * ``"under_sampler"``: ``"random_under_sampler"`` enables
              ``RandomUnderSampler``; any other value (or a missing key) skips it.
            * ``"under_sampling_strategy"``: forwarded as ``sampling_strategy``
              (falls back to ``"auto"`` when absent).

            Other keys are ignored by this function.

    Returns:
        Pipeline: The fitted pipeline with steps ``"scaler"`` and ``"model"``.

    Raises:
        KeyError: If ``datasets`` has no ``"train"`` entry.
        Exception: Any error raised while resampling or fitting is reported and
            re-raised, so a failed run can never be mistaken for a trained model.
    """
    # Treat a missing configuration as "no resampling" instead of failing on
    # None["over_sampler"].
    run_params = run_params or {}
    x_train, y_train = datasets["train"]

    # Announce the run
    model_name = model.__name__
    print("=" * 78)
    print(f"Starting experiment for model: {model_name}.")
    print("=" * 78)

    try:
        # Work on copies so the caller's training data is left untouched
        x_train_resample, y_train_resample = x_train.copy(), y_train.copy()
        print("Training the model...")

        # Oversample first, then undersample, each only when requested
        if run_params.get("over_sampler") == "random_over_sampler":
            over_sampler = RandomOverSampler(
                sampling_strategy=run_params.get("over_sampling_strategy", "auto")
            )
            x_train_resample, y_train_resample = over_sampler.fit_resample(
                x_train_resample, y_train_resample
            )

        if run_params.get("under_sampler") == "random_under_sampler":
            under_sampler = RandomUnderSampler(
                sampling_strategy=run_params.get("under_sampling_strategy", "auto")
            )
            x_train_resample, y_train_resample = under_sampler.fit_resample(
                x_train_resample, y_train_resample
            )

        # Scaling lives inside the pipeline so the saved artifact applies the
        # same transformation at prediction time.
        steps = [("scaler", StandardScaler()), ("model", model())]
        pipeline = Pipeline(steps=steps)

        # Train the model
        pipeline.fit(x_train_resample, y_train_resample)

    except Exception as e:
        # Report the failure and propagate it: silently returning None would let
        # downstream cells persist a non-model.
        print(f"Training failed for model {model_name}: {e}")
        raise

    # Only reached when fitting succeeded
    print("Training completed.")
    return pipeline

##### Configuración de los parámetros de balanceo de datos

In [None]:
# To disable a sampling step, replace its sampler string with None.
# NOTE(review): "threshold" is not read by train_model in this notebook;
# presumably it is consumed elsewhere (e.g. at prediction time) — confirm.
params = dict(
    over_sampler="random_over_sampler",
    over_sampling_strategy=0.1,
    under_sampler="random_under_sampler",
    under_sampling_strategy=0.5,
    threshold=0.9998,
)

##### Entrenamos el modelo

In [None]:
# Train the pipeline: XGBClassifier is passed as a class (train_model instantiates it),
# together with the training split and the sampling configuration in `params`.
model = train_model(XGBClassifier, datasets, params)

#### Guardamos el modelo

In [None]:
# Never persist a missing model: if training did not produce a pipeline,
# `model` is None and dumping it would silently write an unusable artifact.
if model is None:
    raise RuntimeError("No trained model to save: `model` is None.")

# Serialize the fitted pipeline (scaler + classifier) to disk
joblib.dump(model, "../models/model.pkl")