In [32]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import json
import pickle
from pickle import dump
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error


In [26]:
# Load the pre-split train/test features and targets from the processed-data folder.
# NOTE(review): read_excel returns DataFrames, so the targets are DataFrames as well
# (presumably single-column -- confirm against the step that wrote these files).
X_train, y_train = (
    pd.read_excel("../data/processed/X_train.xlsx"),
    pd.read_excel("../data/processed/y_train.xlsx"),
)
X_test, y_test = (
    pd.read_excel("../data/processed/X_test.xlsx"),
    pd.read_excel("../data/processed/y_test.xlsx"),
)

In [27]:
# XGBoost regressor with fixed hyperparameters. The values below are the same
# ones printed as best_params_ by the grid-search cell further down.
xgb_model = XGBRegressor(
    # ensemble size / shape
    n_estimators=300,
    max_depth=10,
    learning_rate=0.05,
    # row and column sampling per tree
    subsample=1.0,
    colsample_bytree=0.6,
    # L1 / L2 regularisation
    reg_alpha=1,
    reg_lambda=1,
    # pinned seed
    random_state=42,
)

# Kept as the last expression so the notebook still displays the fitted estimator.
xgb_model.fit(X_train, y_train)

In [28]:
# Predict on both splits with the fitted model: train first, then test.
y_pred_train, y_pred_test = (
    xgb_model.predict(features) for features in (X_train, X_test)
)

In [35]:
# Training-split metrics, evaluated in the order RMSE -> R² -> MAE.
rmse_train, r2_train, mae_train = (
    metric(y_train, y_pred_train)
    for metric in (root_mean_squared_error, r2_score, mean_absolute_error)
)

# One line per metric; RMSE and R² are rounded to two decimals, MAE is printed raw.
print(
    f"RMSE: {rmse_train:.2f}",
    f"R² Score: {r2_train:.2f}",
    f"MAE (train): {mae_train}",
    sep="\n",
)

RMSE: 28479.97
R² Score: 0.54
MAE (train): 21237.072265625


In [37]:
# Test-split metrics, evaluated in the order RMSE -> R² -> MAE.
rmse_test, r2_test, mae_test = (
    metric(y_test, y_pred_test)
    for metric in (root_mean_squared_error, r2_score, mean_absolute_error)
)

# One line per metric; RMSE and R² are rounded to two decimals, MAE is printed raw.
print(
    f"RMSE: {rmse_test:.2f}",
    f"R² Score: {r2_test:.2f}",
    f"MAE (test): {mae_test}",
    sep="\n",
)

RMSE: 32796.66
R² Score: 0.40
MAE (test): 24512.94140625


In [36]:
# Hyperparameter search space for the XGBoost regressor.
# 3 * 4 * 4 * 3 * 3 * 3 * 3 = 3888 combinations in total.
param_grid = dict(
    n_estimators=[100, 300, 500],           # number of trees
    max_depth=[3, 5, 7, 10],                # maximum tree depth
    learning_rate=[0.01, 0.05, 0.1, 0.2],   # learning rate
    subsample=[0.6, 0.8, 1.0],              # fraction of rows used per tree
    colsample_bytree=[0.6, 0.8, 1.0],       # fraction of columns used per tree
    reg_alpha=[0, 0.1, 1],                  # L1 regularisation
    reg_lambda=[1, 5, 10],                  # L2 regularisation
)

In [16]:
# Exhaustive search over param_grid using 5-fold cross-validation.
# Candidates are ranked by negated RMSE, so a higher score means a lower error.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring="neg_root_mean_squared_error",
    n_jobs=-1,   # -1 = use all available processors for the CV fits
    cv=5,
    verbose=2,   # per-fit progress messages
)

# Kept as the last expression so the notebook still displays the fitted search object.
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 3888 candidates, totalling 19440 fits


In [17]:
# Estimator exposed by the search as its best candidate.
best_model = grid_search.best_estimator_

# Header line followed by the winning parameter combination.
print("Mejores hiperparámetros:", grid_search.best_params_, sep="\n")

# best_score_ is a negated RMSE (see the scoring argument), so flip the sign for display.
print(f"Mejor RMSE (CV): {-grid_search.best_score_:.2f}")


Mejores hiperparámetros:
{'colsample_bytree': 0.6, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 300, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 1.0}
Mejor RMSE (CV): 34317.03


In [18]:
# Re-score both splits with the grid-search winner (train first, then test).
y_pred_train, y_pred_test = map(best_model.predict, (X_train, X_test))

In [None]:
# Training-split metrics for the current predictions: RMSE, R² and MAE.
rmse_train, r2_train, mae_train = (
    metric(y_train, y_pred_train)
    for metric in (root_mean_squared_error, r2_score, mean_absolute_error)
)

# RMSE and R² are rounded to two decimals; the training MAE is printed unrounded.
print(
    f"RMSE: {rmse_train:.2f}",
    f"R² Score: {r2_train:.2f}",
    f"MAE (train): {mae_train}",
    sep="\n",
)



RMSE: 28479.97
R² Score: 0.54
MAE (train): 21237.072265625


In [34]:
# Test-split metrics for the current predictions: RMSE, R² and MAE.
rmse_test, r2_test, mae_test = (
    metric(y_test, y_pred_test)
    for metric in (root_mean_squared_error, r2_score, mean_absolute_error)
)

# RMSE and R² are rounded to two decimals; the test MAE is printed unrounded.
print(
    f"RMSE: {rmse_test:.2f}",
    f"R² Score: {r2_test:.2f}",
    f"MAE (test): {mae_test}",
    sep="\n",
)

RMSE: 32796.66
R² Score: 0.40
MAE (test): 24512.94140625


## Con feature selection

In [21]:
# Load the processed train/test CSVs used for the feature-selection run.
train_data, test_data = (
    pd.read_csv("../data/processed/clean_train_k_9.csv"),
    pd.read_csv("../data/processed/clean_test_k_9.csv"),
)

# Split each frame into features and the "salary_avg" target column.
# These assignments rebind X_train / y_train / X_test / y_test, replacing the
# Excel-based splits loaded at the top of the notebook.
X_train = train_data.drop(columns=["salary_avg"])
y_train = train_data["salary_avg"]
X_test = test_data.drop(columns=["salary_avg"])
y_test = test_data["salary_avg"]

In [22]:
# Regressor for the feature-selected data: only the seed is set, every other
# hyperparameter is left at the library default.
xgb_model_k_9 = XGBRegressor(
    random_state=42,
)

# Kept as the last expression so the notebook still displays the fitted estimator.
xgb_model_k_9.fit(X_train, y_train)

In [23]:
# Predictions of the feature-selection model on its own train and test splits.
y_pred_train, y_pred_test = (
    xgb_model_k_9.predict(X_train),
    xgb_model_k_9.predict(X_test),
)

In [24]:
# Training-split RMSE and R² for the current predictions.
rmse_train, r2_train = (
    metric(y_train, y_pred_train)
    for metric in (root_mean_squared_error, r2_score)
)

# Both values are rounded to two decimals, one per line.
print(f"RMSE: {rmse_train:.2f}", f"R² Score: {r2_train:.2f}", sep="\n")

RMSE: 38417.65
R² Score: 0.19


In [25]:
# Test-split RMSE and R² for the current predictions.
rmse_test, r2_test = (
    metric(y_test, y_pred_test)
    for metric in (root_mean_squared_error, r2_score)
)

# Both values are rounded to two decimals, one per line.
print(f"RMSE: {rmse_test:.2f}", f"R² Score: {r2_test:.2f}", sep="\n")

RMSE: 39728.45
R² Score: 0.13
