# Taller 3 MLOps - Data Science Salaries 2023
## Fase 3: Modelación con Optuna + MLflow

### Universidad EIA
Entrenamiento de 3 modelos, optimización de hiperparámetros con Optuna y seguimiento de experimentos con MLflow

## 1. IMPORTAR LIBRERÍAS

In [1]:
import os
import warnings

import joblib
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import numpy as np
import optuna
import pandas as pd
from optuna.pruners import MedianPruner
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides warnings emitted by the
# libraries imported above during training — confirm that is intended.
warnings.filterwarnings('ignore')

# Confirmation message printed once the setup cell has run.
print("✓ Librerías importadas correctamente")

✓ Librerías importadas correctamente


## 2. CARGAR DATOS PROCESADOS

In [2]:
# Feature matrices for the train / validation / test splits, read from the
# processed-data folder.
X_train, X_val, X_test = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_val.csv'),
    pd.read_csv('../data/processed/X_test.csv'),
)

# Matching targets; each CSV is flattened into a 1-D array.
y_train, y_val, y_test = (
    pd.read_csv('../data/processed/y_train.csv').to_numpy().flatten(),
    pd.read_csv('../data/processed/y_val.csv').to_numpy().flatten(),
    pd.read_csv('../data/processed/y_test.csv').to_numpy().flatten(),
)

# Print the shapes as a quick sanity check of what was loaded.
print(
    f"X_train: {X_train.shape}",
    f"y_train: {y_train.shape}",
    f"\nX_val: {X_val.shape}",
    f"X_test: {X_test.shape}",
    sep="\n",
)

X_train: (2253, 157)
y_train: (2253,)

X_val: (751, 157)
X_test: (751, 157)


## 3. CONFIGURAR MLFLOW

In [3]:
# Select the MLflow experiment that the runs started below are logged under.
mlflow.set_experiment("data_science_salaries")

# Tell the user how to open the tracking dashboard.
print(
    "✓ MLFlow configurado",
    "\nPara ver el dashboard, ejecuta en terminal: mlflow ui",
    "Luego abre: http://localhost:5000",
    sep="\n",
)

2025/11/27 02:16:37 INFO mlflow.tracking.fluent: Experiment with name 'data_science_salaries' does not exist. Creating a new experiment.


✓ MLFlow configurado

Para ver el dashboard, ejecuta en terminal: mlflow ui
Luego abre: http://localhost:5000


## 4. MODELO 1: LINEAR REGRESSION (BASELINE)

In [4]:
print("\n" + "="*70)
print("MODELO 1: LINEAR REGRESSION")
print("="*70)

# Baseline: plain LinearRegression fitted on the processed features and
# tracked as its own MLflow run.
# NOTE(review): the output recorded with this cell shows RMSE around 1e15 and
# R² around -1e20, so the fit looks numerically degenerate — worth
# investigating before using it as a reference point.
with mlflow.start_run(run_name="linear_regression"):
    # Build and fit the model
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)

    # Predictions on the validation and test splits
    y_val_pred = lr_model.predict(X_val)
    y_test_pred = lr_model.predict(X_test)

    # Validation metrics
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    # Test metrics
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_mae = mean_absolute_error(y_test, y_test_pred)
    test_r2 = r2_score(y_test, y_test_pred)

    # Log the model tag and all six metrics in a single call
    mlflow.log_param("model", "LinearRegression")
    mlflow.log_metrics({
        "val_rmse": val_rmse,
        "val_mae": val_mae,
        "val_r2": val_r2,
        "test_rmse": test_rmse,
        "test_mae": test_mae,
        "test_r2": test_r2,
    })

    # Persist the model both as an MLflow artifact and as a local pickle.
    mlflow.sklearn.log_model(lr_model, "linear_regression_model")
    # Create the output folder first so joblib.dump cannot fail on a missing
    # directory after the training work has already been done.
    os.makedirs('../models', exist_ok=True)
    joblib.dump(lr_model, '../models/linear_regression.pkl')

    print("\nValidation:")
    print(f"  RMSE: ${val_rmse:,.2f}")
    print(f"  MAE:  ${val_mae:,.2f}")
    print(f"  R²:   {val_r2:.4f}")

    print("\nTest:")
    print(f"  RMSE: ${test_rmse:,.2f}")
    print(f"  MAE:  ${test_mae:,.2f}")
    print(f"  R²:   {test_r2:.4f}")
    print("\n✓ Linear Regression completado y guardado")


MODELO 1: LINEAR REGRESSION

Validation:
  RMSE: $6,195,204,635,002,591.00
  MAE:  $535,001,595,065,956.50
  R²:   -114548441068187648000.0000

Test:
  RMSE: $6,546,359,475,646,164.00
  MAE:  $643,134,598,509,542.25
  R²:   -262507810408744419328.0000

✓ Linear Regression completado y guardado


## 5. MODELO 2: RANDOM FOREST CON OPTUNA

In [5]:
# Section banner for the Random Forest experiment.
print(
    "\n" + "=" * 70,
    "MODELO 2: RANDOM FOREST CON OPTUNA",
    "=" * 70,
    sep="\n",
)

# Optuna objective for the Random Forest search
def objective_rf(trial):
    """Fit a RandomForestRegressor with trial-suggested settings and score it.

    The forest is trained on ``X_train`` / ``y_train`` and evaluated on
    ``X_val`` / ``y_val`` (all read from the enclosing scope).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation RMSE, which the study minimizes.
    """
    # Sample the hyperparameters (same order and ranges on every trial).
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
        'max_depth': trial.suggest_int('max_depth', 10, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
    }

    # Fixed seed so a given parameter set always yields the same forest.
    forest = RandomForestRegressor(**sampled, random_state=42, n_jobs=-1)
    forest.fit(X_train, y_train)

    # Score on the validation split.
    val_mse = mean_squared_error(y_val, forest.predict(X_val))
    return np.sqrt(val_mse)

# Create the Optuna study; the TPE sampler is seeded so the search is repeatable.
print("\nBuscando hiperparámetros óptimos (50 trials)...")
sampler_rf = optuna.samplers.TPESampler(seed=42)
# NOTE(review): objective_rf never calls trial.report() / trial.should_prune(),
# so this pruner presumably never triggers — confirm before relying on it.
pruner = MedianPruner()

study_rf = optuna.create_study(
    direction='minimize',  # objective_rf returns the validation RMSE
    sampler=sampler_rf,
    pruner=pruner
)

# Run the search
study_rf.optimize(objective_rf, n_trials=50, show_progress_bar=True)

# Best hyperparameters found by the study
best_params_rf = study_rf.best_params
print("\nMejores parámetros encontrados:")
print(best_params_rf)

# Refit a final model with the best hyperparameters
rf_model_final = RandomForestRegressor(
    **best_params_rf,
    random_state=42,
    n_jobs=-1
)

rf_model_final.fit(X_train, y_train)

# Predictions on the validation and test splits
y_val_pred_rf = rf_model_final.predict(X_val)
y_test_pred_rf = rf_model_final.predict(X_test)

# Validation metrics
val_rmse_rf = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
val_mae_rf = mean_absolute_error(y_val, y_val_pred_rf)
val_r2_rf = r2_score(y_val, y_val_pred_rf)

# Test metrics
test_rmse_rf = np.sqrt(mean_squared_error(y_test, y_test_pred_rf))
test_mae_rf = mean_absolute_error(y_test, y_test_pred_rf)
test_r2_rf = r2_score(y_test, y_test_pred_rf)

# Record the final model in MLflow: tag, tuned hyperparameters, metrics, artifact.
with mlflow.start_run(run_name="random_forest"):
    mlflow.log_param("model", "RandomForest")
    mlflow.log_params(best_params_rf)
    mlflow.log_metrics({
        "val_rmse": val_rmse_rf,
        "val_mae": val_mae_rf,
        "val_r2": val_r2_rf,
        "test_rmse": test_rmse_rf,
        "test_mae": test_mae_rf,
        "test_r2": test_r2_rf,
    })
    mlflow.sklearn.log_model(rf_model_final, "random_forest_model")
    # Create the output folder first so joblib.dump cannot fail on a missing
    # directory after the whole search has already run.
    os.makedirs('../models', exist_ok=True)
    joblib.dump(rf_model_final, '../models/random_forest.pkl')

print("\nValidation:")
print(f"  RMSE: ${val_rmse_rf:,.2f}")
print(f"  MAE:  ${val_mae_rf:,.2f}")
print(f"  R²:   {val_r2_rf:.4f}")

print("\nTest:")
print(f"  RMSE: ${test_rmse_rf:,.2f}")
print(f"  MAE:  ${test_mae_rf:,.2f}")
print(f"  R²:   {test_r2_rf:.4f}")
print("\n✓ Random Forest completado y guardado")

[I 2025-11-27 02:16:41,177] A new study created in memory with name: no-name-1204d2a0-6121-4a58-8048-0f3b4e0edd0e



MODELO 2: RANDOM FOREST CON OPTUNA

Buscando hiperparámetros óptimos (50 trials)...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-27 02:16:41,766] Trial 0 finished with value: 557746.0755877099 and parameters: {'n_estimators': 150, 'max_depth': 29, 'min_samples_split': 8, 'min_samples_leaf': 3}. Best is trial 0 with value: 557746.0755877099.
[I 2025-11-27 02:16:42,324] Trial 1 finished with value: 541318.5129312613 and parameters: {'n_estimators': 100, 'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 1 with value: 541318.5129312613.
[I 2025-11-27 02:16:43,340] Trial 2 finished with value: 538071.9817192073 and parameters: {'n_estimators': 250, 'max_depth': 24, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 2 with value: 538071.9817192073.
[I 2025-11-27 02:16:44,417] Trial 3 finished with value: 459762.4248162053 and parameters: {'n_estimators': 300, 'max_depth': 14, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 3 with value: 459762.4248162053.
[I 2025-11-27 02:16:45,028] Trial 4 finished with value: 611671.3484668345 and parameters: {'n_estim

## 6. MODELO 3: XGBOOST CON OPTUNA

In [None]:
# Section banner for the XGBoost experiment.
print(
    "\n" + "=" * 70,
    "MODELO 3: XGBOOST CON OPTUNA",
    "=" * 70,
    sep="\n",
)

# Optuna objective for the XGBoost search
def objective_xgb(trial):
    """Fit an XGBRegressor with trial-suggested settings and score it.

    The model is trained on ``X_train`` / ``y_train`` and evaluated on
    ``X_val`` / ``y_val`` (all read from the enclosing scope).

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The validation RMSE, which the study minimizes.
    """
    # Sample the hyperparameters; the learning rate is drawn on a log scale.
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 300, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
    }

    # Fixed seed and silenced logging for every candidate model.
    booster = XGBRegressor(**sampled, random_state=42, n_jobs=-1, verbosity=0)
    booster.fit(X_train, y_train)

    # Score on the validation split.
    val_mse = mean_squared_error(y_val, booster.predict(X_val))
    return np.sqrt(val_mse)

# Create the Optuna study; the TPE sampler is seeded so the search is repeatable.
print("\nBuscando hiperparámetros óptimos (50 trials)...")
sampler_xgb = optuna.samplers.TPESampler(seed=42)
# NOTE(review): objective_xgb never calls trial.report() / trial.should_prune(),
# so this pruner presumably never triggers — confirm before relying on it.
pruner = MedianPruner()

study_xgb = optuna.create_study(
    direction='minimize',  # objective_xgb returns the validation RMSE
    sampler=sampler_xgb,
    pruner=pruner
)

# Run the search
study_xgb.optimize(objective_xgb, n_trials=50, show_progress_bar=True)

# Best hyperparameters found by the study
best_params_xgb = study_xgb.best_params
print("\nMejores parámetros encontrados:")
print(best_params_xgb)

# Refit a final model with the best hyperparameters
xgb_model_final = XGBRegressor(
    **best_params_xgb,
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

xgb_model_final.fit(X_train, y_train)

# Predictions on the validation and test splits
y_val_pred_xgb = xgb_model_final.predict(X_val)
y_test_pred_xgb = xgb_model_final.predict(X_test)

# Validation metrics
val_rmse_xgb = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
val_mae_xgb = mean_absolute_error(y_val, y_val_pred_xgb)
val_r2_xgb = r2_score(y_val, y_val_pred_xgb)

# Test metrics
test_rmse_xgb = np.sqrt(mean_squared_error(y_test, y_test_pred_xgb))
test_mae_xgb = mean_absolute_error(y_test, y_test_pred_xgb)
test_r2_xgb = r2_score(y_test, y_test_pred_xgb)

# Record the final model in MLflow: tag, tuned hyperparameters, metrics, artifact.
with mlflow.start_run(run_name="xgboost"):
    mlflow.log_param("model", "XGBoost")
    mlflow.log_params(best_params_xgb)
    mlflow.log_metrics({
        "val_rmse": val_rmse_xgb,
        "val_mae": val_mae_xgb,
        "val_r2": val_r2_xgb,
        "test_rmse": test_rmse_xgb,
        "test_mae": test_mae_xgb,
        "test_r2": test_r2_xgb,
    })
    mlflow.xgboost.log_model(xgb_model_final, "xgboost_model")
    # Create the output folder first so joblib.dump cannot fail on a missing
    # directory after the whole search has already run.
    os.makedirs('../models', exist_ok=True)
    joblib.dump(xgb_model_final, '../models/xgboost.pkl')

print("\nValidation:")
print(f"  RMSE: ${val_rmse_xgb:,.2f}")
print(f"  MAE:  ${val_mae_xgb:,.2f}")
print(f"  R²:   {val_r2_xgb:.4f}")

print("\nTest:")
print(f"  RMSE: ${test_rmse_xgb:,.2f}")
print(f"  MAE:  ${test_mae_xgb:,.2f}")
print(f"  R²:   {test_r2_xgb:.4f}")
print("\n✓ XGBoost completado y guardado")

[I 2025-11-27 02:17:32,799] A new study created in memory with name: no-name-6a324be8-6b69-4d9b-80d0-c54e7f67b76b



MODELO 3: XGBOOST CON OPTUNA

Buscando hiperparámetros óptimos (50 trials)...


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-11-27 02:18:04,919] Trial 0 finished with value: 365276.8017567039 and parameters: {'n_estimators': 150, 'max_depth': 10, 'learning_rate': 0.1205712628744377, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182}. Best is trial 0 with value: 365276.8017567039.
[I 2025-11-27 02:18:20,728] Trial 1 finished with value: 375245.08609656006 and parameters: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.19030368381735815, 'subsample': 0.8005575058716043, 'colsample_bytree': 0.8540362888980227}. Best is trial 0 with value: 365276.8017567039.


## 7. COMPARACIÓN DE MODELOS

In [None]:
# Side-by-side table of the three models: one row per model, one column per
# metric and split.
comparison_df = pd.DataFrame(
    [
        ('Linear Regression', val_rmse, test_rmse, val_mae, test_mae, val_r2, test_r2),
        ('Random Forest', val_rmse_rf, test_rmse_rf, val_mae_rf, test_mae_rf, val_r2_rf, test_r2_rf),
        ('XGBoost', val_rmse_xgb, test_rmse_xgb, val_mae_xgb, test_mae_xgb, val_r2_xgb, test_r2_xgb),
    ],
    columns=['Modelo', 'Val RMSE', 'Test RMSE', 'Val MAE', 'Test MAE', 'Val R²', 'Test R²'],
)

print(
    "\n" + "=" * 70,
    "COMPARACIÓN DE MODELOS",
    "=" * 70,
    comparison_df.to_string(index=False),
    sep="\n",
)

# Persist the table as CSV (without the index column).
comparison_df.to_csv('../data/model_comparison.csv', index=False)
print("\n✓ Comparación guardada en ../data/model_comparison.csv")

## 8. RESUMEN

In [None]:
print("\n" + "="*70)
print("RESUMEN - FASE 3: MODELACIÓN")
print("="*70)

# Choose the winner on the validation split. Selecting on 'Test RMSE' would
# use the test set for model selection and make the reported test score
# optimistically biased; the test RMSE is only reported for the chosen model.
best_model_idx = comparison_df['Val RMSE'].idxmin()
best_model_name = comparison_df.loc[best_model_idx, 'Modelo']
best_val_rmse = comparison_df.loc[best_model_idx, 'Val RMSE']
best_rmse = comparison_df.loc[best_model_idx, 'Test RMSE']

print(f"""
✓ MODELOS ENTRENADOS: 3
  1. Linear Regression (baseline)
  2. Random Forest (ensemble con 50 trials de Optuna)
  3. XGBoost (ensemble con 50 trials de Optuna)

✓ MEJOR MODELO: {best_model_name}
  Val RMSE (criterio de selección): ${best_val_rmse:,.2f}
  Test RMSE: ${best_rmse:,.2f}

✓ TRACKING: MLFlow
  Ejecuta: mlflow ui
  Dashboard: http://localhost:5000

✓ MODELOS GUARDADOS:
  linear_regression.pkl
  random_forest.pkl
  xgboost.pkl

✓ PRÓXIMO PASO:
  Ejecutar Notebook 04: Evaluación
""")

print("="*70)