In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle
import mlflow
import mlflow.sklearn


from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error


In [2]:
# Read the processed training features and the test dataset from disk.
train_features_csv = '../data/processed/df_features_train.csv'
test_csv = '../data/processed/dataset_test.csv'

df_features_train = pd.read_csv(train_features_csv)
dataset_test = pd.read_csv(test_csv)

In [3]:
# Split both datasets into X (input features) and Y (output variables).
# The five target columns are listed once and reused for every split.
target_columns = ['C3S', 'C2S', 'C3A', 'C4AF', 'FCAO']

x_features_train = df_features_train.drop(columns=target_columns)
y_train = df_features_train[target_columns]

x_test = dataset_test.drop(columns=target_columns)
y_test = dataset_test[target_columns]

### Leemos pipeline pre-configurado

In [4]:
# Deserialize the pre-configured pipeline stored in the artifacts folder.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts produced by this project.
with open('../artifacts/pipeline.pkl', mode='rb') as pipeline_file:
    pipeline = pickle.load(pipeline_file)

#pipeline de ingeniería de características

### Utilizamos el pipeline para transformar los datos de test

In [5]:
# Apply the loaded pipeline's transform to the test inputs.
x_features_test_array = pipeline.transform(x_test)
# Wrap the transformed output in a DataFrame labelled with the input column names.
# NOTE(review): assumes the pipeline keeps the number and order of columns — TODO confirm.
x_features_test = pd.DataFrame(x_features_test_array, columns=x_test.columns)
# Show the first rows as the cell output.
x_features_test.head()


Unnamed: 0,id_eq,SiO2,Al2O3,Fe2O3,CaO,MgO,SO3,K2O,Na2O,A/S,"""CaO"" [CurrentProduct.Dry basis]","""SiO2"" [CurrentProduct.Dry basis]","""Al2O3"" [CurrentProduct.Dry basis]","""Fe2O3"" [CurrentProduct.Dry basis]","""MgO"" [CurrentProduct.Dry basis]","""CaO"" [Rolling.Analysis1.Dry basis]","""SiO2"" [Rolling.Analysis1.Dry basis]","""Al2O3"" [Rolling.Analysis1.Dry basis]","""Fe2O3"" [Rolling.Analysis1.Dry basis]","""MgO"" [Rolling.Analysis1.Dry basis]"
0,-1.320663,1.024635,0.284848,-0.894663,-0.195042,-1.482623,1.873509,-2.425788,1.039542,-3.049749,1.795941,-0.702785,-1.584735,-0.300853,-1.735382,0.2888,1.653992,-0.037375,-0.069821,-0.342002
1,-1.320663,0.819017,-0.616614,1.238402,0.305528,-0.686919,-1.588097,-0.891051,-1.57484,0.917658,0.335988,0.750972,-0.52445,0.631381,-0.369419,-0.141514,-1.118303,-0.723483,-0.234774,0.184715
2,1.292064,1.360776,-0.67495,0.908002,1.16297,-0.666616,-1.290185,-1.226531,-0.318839,0.759434,-0.223134,1.373181,-0.052539,0.287335,-0.254278,-0.360557,0.129289,-0.164496,-0.113036,-0.948515
3,-0.0143,-0.621801,-0.050855,-0.410002,-0.463683,-0.619971,-0.21479,-0.900158,-0.480808,-0.555954,0.914565,-1.150162,-0.862417,-0.577119,-0.610437,-0.720456,0.141471,-0.571977,0.453896,-0.970987
4,-1.320663,-1.281721,0.266835,0.031127,0.15404,0.733229,0.654677,0.444711,-0.099463,-0.627158,0.593229,-0.105401,-0.792588,-0.406541,-0.136636,-1.004273,-1.201848,-2.660185,-1.276613,0.407818


### Configuramos hiperparámetros de los diferentes modelos a evaluar

In [6]:
# Hyperparameter configurations to evaluate, keyed by estimator class name.
# Each value is a list of keyword-argument sets; the position in the list
# (starting at 1) is the "Config N" number used to label each run.
models = dict(
    LinearRegression=[
        dict(fit_intercept=True),
        dict(fit_intercept=False),
    ],
    RandomForestRegressor=[
        dict(n_estimators=50, max_depth=10),
        dict(n_estimators=100, max_depth=20),
        dict(n_estimators=200, max_depth=None),
    ],
    GradientBoostingRegressor=[
        dict(n_estimators=50, learning_rate=0.1),
        dict(n_estimators=100, learning_rate=0.05),
        dict(n_estimators=200, learning_rate=0.01),
    ],
    SVR=[
        dict(kernel='linear', C=1.0),
        dict(kernel='rbf', C=10.0),
        dict(kernel='poly', degree=2, C=1.0),
    ],
    KNeighborsRegressor=[
        dict(n_neighbors=5, weights='uniform'),
        dict(n_neighbors=10, weights='distance'),
        dict(n_neighbors=15, weights='uniform'),
    ],
)

In [39]:
# Tracking-server configuration: where runs are sent and which experiment
# they are grouped under.
tracking_uri = "http://127.0.0.1:8080"
experiment_name = "Model_Selection_Cement"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

2024/12/21 17:45:16 INFO mlflow.tracking.fluent: Experiment with name 'Model_Selection_Cement' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/867666794650537236', creation_time=1734824716955, experiment_id='867666794650537236', last_update_time=1734824716955, lifecycle_stage='active', name='Model_Selection_Cement', tags={}>

### Entrenamos y evaluamos modelos

In [41]:
# Train and evaluate every model/configuration pair, one MLflow run per pair.
# NOTE(review): the scores below are computed on the test split
# (x_features_test / y_test). If they are also used to choose the winning
# model, the winner's score is optimistic — consider a validation split or
# cross-validation for the selection step.

# Estimator class for each key of `models`. A lookup table fails loudly on an
# unknown key instead of silently reusing the model from a previous iteration.
estimator_classes = {
    'LinearRegression': LinearRegression,
    'RandomForestRegressor': RandomForestRegressor,
    'GradientBoostingRegressor': GradientBoostingRegressor,
    'SVR': SVR,
    'KNeighborsRegressor': KNeighborsRegressor,
}
# Only the two ensemble estimators receive a fixed random_state.
seeded_models = ('RandomForestRegressor', 'GradientBoostingRegressor')

# Metric names follow the target column order, which is the order of the
# per-output scores returned with multioutput='raw_values'.
metric_names = [f"MSE_{target}" for target in y_test.columns]

results = []

for model_name, param_list in models.items():
    if model_name not in estimator_classes:
        raise ValueError(f"Unsupported model name in `models`: {model_name!r}")
    estimator_cls = estimator_classes[model_name]
    seed_kwargs = {'random_state': 42} if model_name in seeded_models else {}

    for i, params in enumerate(param_list, 1):
        # The context manager ends the run on exit, so no explicit
        # mlflow.end_run() call is needed afterwards.
        with mlflow.start_run(run_name=f"{model_name}_Config_{i}"):
            # One independent regressor is fitted per target column.
            model = MultiOutputRegressor(estimator_cls(**params, **seed_kwargs))
            model.fit(x_features_train, y_train)

            Y_pred = model.predict(x_features_test)
            mse_scores = mean_squared_error(y_test, Y_pred, multioutput='raw_values')
            avg_mse = np.mean(mse_scores)

            # Per-target MSE first, then the unweighted average across targets.
            run_metrics = dict(zip(metric_names, mse_scores))
            run_metrics["MSE_Average"] = avg_mse

            # Log hyperparameters and metrics to MLflow
            mlflow.log_params(params)
            for metric_name, metric_value in run_metrics.items():
                mlflow.log_metric(metric_name, metric_value)

            # Log the fitted model as a run artifact
            mlflow.sklearn.log_model(model, artifact_path="model")

            results.append({
                'Model': model_name,
                'Configuration': f'Config {i}',
                **run_metrics,
            })



🏃 View run LinearRegression_Config_1 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/6307ce0112b74f738cec3d8b523df22e
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run LinearRegression_Config_2 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/e9a03113dc3f48609b2a7658fd9ba3c4
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run RandomForestRegressor_Config_1 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/ae2805567363486d946db3ecf66879ae
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run RandomForestRegressor_Config_2 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/8bb283f5ca8f4a4593574ad6b6999ad9
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run RandomForestRegressor_Config_3 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/0b002b4f10164d9e98a092665e6790e3
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run GradientBoostingRegressor_Config_1 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/696caed1740343989cd7718840e8ae1e
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run GradientBoostingRegressor_Config_2 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/8041784967b5407588037da520ce5284
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run GradientBoostingRegressor_Config_3 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/cfdfcd74b1a84e188fc3f7bea31eb307
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run SVR_Config_1 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/a47fd1c6f6b141a0a044f1e498dedea2
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run SVR_Config_2 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/1d8728bc88fe44d39ec2e5a2f35348dc
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run SVR_Config_3 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/46399badf7604453890e852c78497025
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run KNeighborsRegressor_Config_1 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/93fb6803b8214058a0d6c4383c9439c3
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run KNeighborsRegressor_Config_2 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/9948eb6a947c4bc884e85d7108948731
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236




🏃 View run KNeighborsRegressor_Config_3 at: http://127.0.0.1:8080/#/experiments/867666794650537236/runs/120e04e664a447a3a852d780629089de
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867666794650537236


In [8]:
# Collect the per-run result records into a DataFrame and display it
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Configuration,MSE_C3S,MSE_C2S,MSE_C3A,MSE_C4AF,MSE_FCAO
0,LinearRegression,Config 1,0.817115,0.035245,2.237837e-30,2.586036e-30,0.049304
1,LinearRegression,Config 2,4551.220869,14.129197,83.64672,53.60678,2.89296
2,RandomForestRegressor,Config 1,0.883739,0.073179,6.227218e-05,8.436066e-08,0.04363
3,RandomForestRegressor,Config 2,0.845493,0.06884,6.016443e-05,8.987681e-08,0.041849
4,RandomForestRegressor,Config 3,0.845409,0.068945,5.893081e-05,8.562689e-08,0.0418
5,GradientBoostingRegressor,Config 1,0.944363,0.098372,0.0002342579,2.170598e-06,0.04653
6,GradientBoostingRegressor,Config 2,0.941309,0.099944,0.0002391748,1.39982e-06,0.047081
7,GradientBoostingRegressor,Config 3,1.220193,0.445998,0.003228789,0.0003307596,0.055716
8,SVR,Config 1,0.826568,0.042689,0.001211702,0.001417207,0.049465
9,SVR,Config 2,0.631546,0.055103,0.002748868,0.001467534,0.039954


### Obtenemos mejor modelo y lo agregamos al pipeline

In [9]:
# Compute the average MSE across the five targets and store it as a column.
mse_columns = ['MSE_C3S', 'MSE_C2S', 'MSE_C3A', 'MSE_C4AF', 'MSE_FCAO']
results_df['MSE_Average'] = results_df[mse_columns].mean(axis=1)

# Pick the row with the lowest average MSE.
best_row_label = results_df['MSE_Average'].idxmin()
best_result = results_df.loc[best_row_label]

# Recover the winning model name and its configuration. The label has the
# form "Config N" with N starting at 1, so N - 1 indexes into the model's list.
best_model_name = best_result['Model']
config_number = int(best_result['Configuration'].split(' ')[-1])
best_config_index = config_number - 1
best_config = models[best_model_name][best_config_index]

# Report the winner.
print(f"Mejor modelo: {best_model_name}")
print(f"Mejor configuración: {best_config}")
print(f"MSE promedio: {best_result['MSE_Average']}")

Mejor modelo: SVR
Mejor configuración: {'kernel': 'rbf', 'C': 10.0}
MSE promedio: 0.14616387278875542


In [10]:
# Instantiate the best model with its winning configuration.
# A lookup table replaces the if/elif chain so that an unexpected name raises
# immediately instead of leaving `best_model` undefined (or stale from an
# earlier execution of this cell).
best_estimator_builders = {
    'LinearRegression': lambda config: LinearRegression(**config),
    'RandomForestRegressor': lambda config: RandomForestRegressor(**config, random_state=42),
    'GradientBoostingRegressor': lambda config: GradientBoostingRegressor(**config, random_state=42),
    'SVR': lambda config: SVR(**config),
    'KNeighborsRegressor': lambda config: KNeighborsRegressor(**config),
}

if best_model_name not in best_estimator_builders:
    raise ValueError(f"Unsupported model name: {best_model_name!r}")

# Wrap the estimator so one regressor is fitted per target column.
best_model = MultiOutputRegressor(best_estimator_builders[best_model_name](best_config))

In [16]:
# Display the selected (not yet fitted) estimator as the cell output.
best_model

In [17]:
# Attach the selected regressor as the final pipeline step.
# Any existing step with the same name is removed first, so re-running this
# cell replaces the model instead of appending a duplicate step (duplicate
# step names make the pipeline fail validation when it is fitted).
model_step = ('modelo_regresion', best_model)
pipeline.steps[:] = [step for step in pipeline.steps if step[0] != model_step[0]]
pipeline.steps.append(model_step)

pipeline

### Cargamos nuevamente toda la data train para entrenar el modelo

In [26]:
# Reload the raw training data, discard the 'Unnamed: 0' column, and split it
# into input features and the five target columns.
raw_target_columns = ['C3S', 'C2S', 'C3A', 'C4AF', 'FCAO']

dataset_train = pd.read_csv('../data/raw/dataset_train.csv').drop(columns=['Unnamed: 0'])
dataset_train_features = dataset_train.drop(columns=raw_target_columns)
dataset_train_target = dataset_train[raw_target_columns]

In [30]:
# Fit the whole pipeline (including the appended regression step) on the raw training data.
pipeline.fit(dataset_train_features,dataset_train_target)

In [42]:
# Save the pipeline to the artifacts folder.
# `pickle` is already imported in the notebook's first cell, so it is not
# re-imported here.
with open('../artifacts/pipeline_trained.pkl', mode='wb') as trained_pipeline_file:
    pickle.dump(pipeline, trained_pipeline_file)