In [27]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [28]:
# Read the processed feature table from disk (the CSV must already exist at this relative path)
dataset_path = '../data/processed/features_for_model.csv'
dataset = pd.read_csv(dataset_path)


In [29]:
# Output variables the models must predict; every other column (except the
# 'Unnamed: 0' column, which is dropped as well) is used as an input feature.
target_cols = ['C3S', 'C2S', 'C3A', 'C4AF', 'FCAO']

Y = dataset[target_cols]
X = dataset.drop(columns=target_cols + ['Unnamed: 0'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

# Configuramos y ajustamos el StandardScaler

In [30]:
# Fit the scaler on the training features only, so the test split does not
# influence the learned mean/variance. `fit` returns the estimator itself.
std_scaler = StandardScaler().fit(X_train)
std_scaler

Guardamos el Scaler configurado (con datos de train) como artefacto del modelo

In [31]:
# Persist the scaler fitted on the training split as a model artifact.
scaler_path = Path('../artifacts/std_scaler.pkl')

# Create the target folder if needed: on a fresh checkout it may not exist yet,
# and opening the file for writing would then fail with FileNotFoundError.
scaler_path.parent.mkdir(parents=True, exist_ok=True)

with scaler_path.open('wb') as f:
    pickle.dump(std_scaler, f)

# Configuramos modelo

Primero debemos escalar los datos de X_train y X_test

In [32]:
# Apply the already-fitted scaler to both splits (train first, then test)
X_train_std, X_test_std = map(std_scaler.transform, (X_train, X_test))

Configuramos los hiperparámetros para cada modelo

In [33]:
# Hyperparameter grid: each key names an estimator and maps to the list of
# keyword-argument sets that will be tried for it, in order.
models = dict(
    LinearRegression=[
        dict(fit_intercept=True),
        dict(fit_intercept=False),
    ],
    RandomForestRegressor=[
        dict(n_estimators=50, max_depth=10),
        dict(n_estimators=100, max_depth=20),
        dict(n_estimators=200, max_depth=None),
    ],
    GradientBoostingRegressor=[
        dict(n_estimators=50, learning_rate=0.1),
        dict(n_estimators=100, learning_rate=0.05),
        dict(n_estimators=200, learning_rate=0.01),
    ],
    SVR=[
        dict(kernel='linear', C=1.0),
        dict(kernel='rbf', C=10.0),
        dict(kernel='poly', degree=2, C=1.0),
    ],
    KNeighborsRegressor=[
        dict(n_neighbors=5, weights='uniform'),
        dict(n_neighbors=10, weights='distance'),
        dict(n_neighbors=15, weights='uniform'),
    ],
)

In [34]:
# Train and evaluate every (estimator, hyperparameter set) pair in `models`
# on the scaled train/test split, collecting one result row per configuration.

# Maps each key of `models` to its estimator class plus the fixed keyword
# arguments added on top of the grid parameters (a seed for the two ensemble
# learners so their results are reproducible).
estimator_factories = {
    'LinearRegression': (LinearRegression, {}),
    'RandomForestRegressor': (RandomForestRegressor, {'random_state': 42}),
    'GradientBoostingRegressor': (GradientBoostingRegressor, {'random_state': 42}),
    'SVR': (SVR, {}),
    'KNeighborsRegressor': (KNeighborsRegressor, {}),
}

results = []
for model_name, param_list in models.items():
    # Fail loudly on an unknown key instead of silently re-evaluating the
    # estimator left over from the previous iteration.
    if model_name not in estimator_factories:
        raise ValueError(f'Unsupported model name in `models`: {model_name!r}')
    estimator_cls, fixed_kwargs = estimator_factories[model_name]

    for i, params in enumerate(param_list, 1):
        # One independent regressor is fitted per output column
        model = MultiOutputRegressor(estimator_cls(**params, **fixed_kwargs))

        # Fit on the scaled training data
        model.fit(X_train_std, Y_train)

        # Predict on the scaled hold-out data
        Y_pred = model.predict(X_test_std)

        # One MSE value per output column, in the column order of Y_test
        mse_scores = mean_squared_error(Y_test, Y_pred, multioutput='raw_values')

        # Label each score with its own column name so the metric columns can
        # never drift out of sync with the order of the targets.
        row = {'Model': model_name, 'Configuration': f'Config {i}'}
        row.update(
            {f'MSE_{target}': score for target, score in zip(Y_test.columns, mse_scores)}
        )
        results.append(row)

In [35]:
# Turn the collected per-configuration records into a table and render it
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Model,Configuration,MSE_C3S,MSE_C2S,MSE_C3A,MSE_C4AF,MSE_FCAO
0,LinearRegression,Config 1,0.785017,0.035172,2.4906469999999997e-30,2.5497529999999998e-30,0.047367
1,LinearRegression,Config 2,4551.137981,14.128992,83.64672,53.60678,2.891335
2,RandomForestRegressor,Config 1,0.88299,0.074174,6.183331e-05,8.448076e-08,0.04313
3,RandomForestRegressor,Config 2,0.844301,0.069177,5.951365e-05,9.071345e-08,0.04154
4,RandomForestRegressor,Config 3,0.840577,0.068619,5.863894e-05,8.519251e-08,0.041593
5,GradientBoostingRegressor,Config 1,0.9214,0.098372,0.0002342579,2.170598e-06,0.046306
6,GradientBoostingRegressor,Config 2,0.933207,0.100009,0.0002391748,1.39982e-06,0.046098
7,GradientBoostingRegressor,Config 3,1.219883,0.445998,0.003228789,0.0003307596,0.054144
8,SVR,Config 1,0.795093,0.042666,0.00116385,0.001489629,0.047584
9,SVR,Config 2,0.640926,0.052819,0.002715599,0.001481676,0.038706


# Seleccionamos el mejor modelo

El mejor modelo, según el MSE promedio de todas las variables de salida, es el SVR Config 2 (kernel RBF, C = 10), con un MSE promedio de 0.1473.

In [36]:
# Rebuild the selected configuration (RBF-kernel SVR with C=10, one regressor
# per output column) and fit it on the scaled training data.
modelo_SVR = MultiOutputRegressor(estimator=SVR(C=10.0, kernel='rbf'))
modelo_SVR.fit(X_train_std, Y_train)

Realizamos las predicciones del modelo

In [37]:
# Predict the hold-out targets with the retrained SVR model
Y_pred = modelo_SVR.predict(X_test_std)

# Per-output mean squared error ('raw_values' keeps one score per target column)
mse = mean_squared_error(
    y_true=Y_test,
    y_pred=Y_pred,
    multioutput='raw_values',
)

Evaluamos nuevamente su MSE

In [38]:
# Display the per-target MSE array of the retrained SVR model
# (one value per output column of Y_test, in column order).
mse

array([0.64092624, 0.05281935, 0.0027156 , 0.00148168, 0.03870597])

Guardamos modelo para producción

In [39]:
# Serialize the trained multi-output SVR model to disk.
model_path = Path('../models/SVR_v1.pkl')

# Create the target folder if needed: on a fresh checkout it may not exist yet,
# and opening the file for writing would then fail with FileNotFoundError.
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f:
    pickle.dump(modelo_SVR, f)