In [1]:
# Switch the kernel's working directory to the parent folder so the relative
# paths used later in this notebook ("data/processed/...", "models/...") resolve from there.
# NOTE(review): this cell is not idempotent - every re-run moves one more level up.
# Run it exactly once per kernel session.
%cd ..

/Users/paolaalejandraleonguarneros/Documents/GitHub/MLOps


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor, callback
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib
import os
import xgboost as xgb
from sklearn.model_selection import GridSearchCV


In [3]:
# Load the processed training table; the path is relative to the current working directory.
train_csv = "data/processed/train.csv"
df = pd.read_csv(train_csv)


In [4]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,yr,hr,holiday,workingday,temp,atemp,hum,windspeed,cnt,season_2,...,mnth_12,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,weathersit_2,weathersit_3,weathersit_4
0,1.0,14.0,0,1.0,0.458333,0.4545,0.38,0.228047,0.204341,False,...,False,False,False,False,False,True,False,False,False,False
1,0.0,4.0,0,0.0,0.625,0.6061,0.65,0.193018,0.008234,True,...,False,False,False,False,False,False,False,True,False,False
2,0.0,15.0,0,0.0,0.520833,0.5,0.68,0.228047,0.261976,False,...,False,False,False,False,False,False,False,True,False,False
3,1.0,18.0,0,1.0,0.1875,0.2273,0.86,0.105325,0.122006,False,...,False,True,False,False,False,False,False,False,False,True
4,0.0,18.0,0,1.0,0.666667,0.6364,0.39,0.193018,0.247006,False,...,False,False,False,False,False,True,False,False,False,False


In [5]:
# Prepare the inputs for an XGBoost model that predicts 'cnt'.

# Target: the 'cnt' column. Features: every other column of the frame.
y = df['cnt'].copy()
X = df.drop(columns=['cnt']).copy()

# Cast boolean columns to integers; all other columns keep their dtype.
bool_cols = X.select_dtypes(include='bool').columns
X = X.astype({col: int for col in bool_cols})


In [6]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Median imputation: statistics are learned on the training split only and then
# applied to both splits. No scaling is performed here.
imputer = SimpleImputer(strategy='median')
imputer.fit(X_train)

X_train_imp = imputer.transform(X_train)
X_test_imp = imputer.transform(X_test)

In [None]:
# Hyper-parameter grid: 3 values for each of six parameters and a fixed
# n_estimators, i.e. 3**6 = 729 candidate combinations.
param_grid = dict(
    max_depth=[5, 7, 9],
    learning_rate=[0.01, 0.1, 0.3],
    n_estimators=[100],
    min_child_weight=[1, 3, 5],
    gamma=[0, 0.1, 0.2],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.6, 0.7, 0.8],
)

# Base regressor whose hyper-parameters are tuned by the search
xgb_reg = XGBRegressor(objective='reg:squarederror', random_state=42)

# 5-fold cross-validated exhaustive search, scored by negative MSE and run on
# all available cores
grid_search = GridSearchCV(
    xgb_reg,
    param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
    verbose=2,
)

# Fit the grid search on the *imputed* training matrix so the tuned model is
# trained with the same preprocessing that is applied to X_test_imp when it is
# evaluated, and that is stored in the fitted `imputer`.
grid_search.fit(X_train_imp, y_train)

# Scoring is negative MSE, so negate and take the square root to report the
# cross-validated error of the best candidate as an RMSE.
print("Best parameters:", grid_search.best_params_)
print("Best score:", np.sqrt(-grid_search.best_score_))

# Best estimator found by the search
best_model = grid_search.best_estimator_

# In-sample sanity check only: predict on the training split and label the
# metric accordingly. Predicting on the full dataset would mix training and
# held-out rows into a single, misleading "RMSE".
y_pred = best_model.predict(X_train_imp)
print("RMSE (train):", np.sqrt(mean_squared_error(y_train, y_pred)))

Fitting 5 folds for each of 729 candidates, totalling 3645 fits
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_depth=5, min_child_weight=1, n_estimators=100, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.6, gamma=0, learning_rate=0.01, max_de

In [10]:
# Score the grid-search winner on the imputed held-out split
y_pred_test = best_model.predict(X_test_imp)

# Error metrics on the test split: RMSE is the square root of the MSE
test_mse = mean_squared_error(y_test, y_pred_test)
rmse = np.sqrt(test_mse)
mae = mean_absolute_error(y_test, y_pred_test)

print("Test Set Metrics:")
print("RMSE: {:.6f}".format(rmse))
print("MAE:  {:.6f}".format(mae))

Test Set Metrics:
RMSE: 0.046920
MAE:  0.026440


In [None]:
# XGBoost model trained with early stopping.
#
# Early stopping needs a validation set. It is carved out of the training split
# (not the test split), so the test data stays untouched until the final
# evaluation and is never used to choose the number of boosting rounds.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train_imp,
    y_train,
    test_size=0.2,
    random_state=42,
)

# n_estimators is an upper bound; training stops once the validation metric has
# not improved for `early_stopping_rounds` consecutive rounds (50 is a tunable
# choice).
# NOTE(review): passing early_stopping_rounds to the constructor assumes
# xgboost >= 1.6 - confirm against the installed version.
model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
    verbosity=1,
    early_stopping_rounds=50,
)

# Fit on the reduced training part and monitor the validation part
model.fit(
    X_fit,
    y_fit,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

# Predictions and metrics on the held-out test split
y_pred = model.predict(X_test_imp)

# mean_squared_error returns the MSE; take the square root so the value printed
# as "RMSE" really is a root-mean-squared error (same convention as the metrics
# reported for the grid-search model).
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE (test): {rmse:.4f}")
print(f"R2   (test): {r2:.4f}")
# best_iteration may be unavailable (e.g. when no early stopping was used), so
# fall back to 'N/A' instead of failing.
print(f"Best n_estimators: {getattr(model, 'best_iteration', 'N/A')}")

# Persist the artifacts (model + preprocessor + feature column order) as a
# single joblib file under models/, creating the directory if it is missing.
artifacts = {
    'model': model,
    'imputer': imputer,
    'feature_columns': list(X.columns),
}

os.makedirs('models', exist_ok=True)
joblib.dump(artifacts, 'models/xgb_cnt_pipeline.joblib')

print("Modelo y pipeline guardados en models/xgb_cnt_pipeline.joblib")