# 05 - Advanced Models

Modelos avançados (p. ex., Gradient Boosting e, se disponíveis, XGBoost/LightGBM) e comparação de desempenho por validação cruzada (RMSE).



In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.ensemble import GradientBoostingRegressor

from src.data_ingestion import load_wine_dataframe
from src.data_processing import DataPreprocessor

# XGBoost is an optional dependency: try to import it and record the outcome
# in `has_xgb` so later code can decide whether to use XGBRegressor.
try:
    from xgboost import XGBRegressor
except Exception:
    # Any failure raised during the import (not only a missing package)
    # disables the XGBoost model instead of aborting the run.
    has_xgb = False
else:
    has_xgb = True

# Dataset repository id and file name. Each one can be overridden through the
# environment variable named below; otherwise the hard-coded default is used.
# NOTE(review): the HF_ prefix presumably refers to Hugging Face -- confirm.
HF_REPO = os.environ.get("HF_DATASET_REPO", "henriquebap/wine-ml-dataset")
FILENAME = os.environ.get("HF_DATASET_FILENAME", "WineQT.csv")

# Input columns used as model features, in a fixed order.
FEATURES = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# Load the raw data, run it through the project's preprocessor, then split
# the result into the feature matrix X and the target y.
df = load_wine_dataframe(
    repo_id=HF_REPO,
    filename=FILENAME,
)
pre = DataPreprocessor(
    feature_columns=FEATURES,
    target_column="quality",
)
# NOTE(review): the preprocessor is fitted on the full dataset before any
# cross-validation split; if it learns statistics from the data (e.g.
# scaling), confirm that leakage across folds is acceptable.
df_p = pre.fit_transform(df)
X, y = df_p[FEATURES], df_p["quality"]

# Candidate regressors, keyed by the short label used in the results table.
models = {"GBR": GradientBoostingRegressor(random_state=42)}

# XGBoost joins the comparison only when its import succeeded.
if has_xgb:
    models["XGB"] = XGBRegressor(
        n_estimators=400,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.8,
        random_state=42,
    )

# One shuffled 5-fold splitter with a fixed seed is shared by every model.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Per-fold scores for each model. The scorer yields *negated* RMSE values,
# which is why the mean is sign-flipped below (the std needs no flip).
fold_scores = {
    label: cross_val_score(
        estimator, X, y, cv=cv, scoring="neg_root_mean_squared_error"
    )
    for label, estimator in models.items()
}
results = [
    {
        "model": label,
        "rmse_mean": -neg_rmse.mean(),
        "rmse_std": neg_rmse.std(),
    }
    for label, neg_rmse in fold_scores.items()
]

# Last expression of the notebook cell: the table sorted by mean RMSE
# (ascending, so the lowest-error model comes first).
pd.DataFrame(results).sort_values(by="rmse_mean")
