# 06 - Hyperparameter Tuning

Tuning com GridSearchCV/RandomizedSearchCV para modelos selecionados.



In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor

from huggingface_hub import hf_hub_download
from pathlib import Path

# helpers

def load_selected_features(df: pd.DataFrame) -> list:
    """Return the feature column names to use for modelling.

    Reads the first column of ``reports/eda/selected_features.csv``
    (resolved against the current working directory) and keeps the entries
    that are columns of *df*, excluding the ``quality`` target. Falls back
    to every numeric column of *df* except ``quality`` when the file is
    missing, cannot be read, or yields no usable names.

    Args:
        df: Dataset whose columns are used to validate the selection.

    Returns:
        Feature names, without duplicates, in the order they were listed.
    """
    p = Path('reports/eda/selected_features.csv')
    if p.exists():
        try:
            listed = pd.read_csv(p).iloc[:, 0].dropna().astype(str).tolist()
        except Exception as e:
            # Still best-effort (the numeric-column fallback below applies),
            # but report the failure instead of hiding it so a broken file
            # does not go unnoticed.
            print('⚠️ Falha ao ler selected_features.csv; usando colunas numéricas:', e)
        else:
            # dict.fromkeys drops repeated names while preserving order, so a
            # duplicated row in the CSV cannot produce duplicated columns.
            feats = list(dict.fromkeys(
                c for c in listed if c in df.columns and c != 'quality'
            ))
            if feats:
                return feats
    return [c for c in df.select_dtypes(include=[np.number]).columns if c != 'quality']


def load_processed_df():
    """Load the processed dataset, preferring the Hugging Face Hub copy.

    The dataset repository is taken from the ``HF_PROCESSED_REPO``
    environment variable (default ``henriquebap/wine-ml-processed``) and the
    access token from ``HF_TOKEN``. If downloading or parsing
    ``processed/full.csv`` fails for any reason, the function falls back to
    ``data/processed/df_capped.csv`` under the parent of the current working
    directory.

    Returns:
        The loaded data as a pandas DataFrame.

    Raises:
        FileNotFoundError: If the Hub load failed and the local fallback
            file does not exist.
    """
    token = os.getenv('HF_TOKEN', None)
    repo_id = os.getenv('HF_PROCESSED_REPO', 'henriquebap/wine-ml-processed')
    try:
        remote_csv = hf_hub_download(
            repo_id=repo_id,
            filename='processed/full.csv',
            repo_type='dataset',
            token=token,
        )
        frame = pd.read_csv(remote_csv)
        print('✅ Dados processados carregados do HF Hub')
    except Exception as err:
        print('⚠️ Fallback para dados locais:', err)
        local_csv = Path.cwd().parent / 'data' / 'processed' / 'df_capped.csv'
        # Guard clause: without a local copy there is nothing left to try.
        if not local_csv.exists():
            raise FileNotFoundError('df_capped.csv não encontrado localmente nem no Hub')
        frame = pd.read_csv(local_csv)
    return frame


# Load the processed data and split it into predictors and target.
_df = load_processed_df()
FEATURES = load_selected_features(_df)
X, y = _df[FEATURES], _df["quality"]

# Base estimator and the hyperparameter grid searched exhaustively around it
# (3 * 4 * 3 = 36 combinations).
rf = RandomForestRegressor(random_state=42)
param_grid = dict(
    n_estimators=[200, 400, 600],
    max_depth=[None, 8, 12, 16],
    min_samples_split=[2, 5, 10],
)

# Shuffled 5-fold split with a fixed seed so every candidate is scored on
# the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)
search.fit(X, y)

# The scorer is the negated RMSE, so flip the sign before reporting it.
print("Best params:", search.best_params_)
print("Best RMSE:", -search.best_score_)


✅ Dados processados carregados do HF Hub
Best params: {'max_depth': 8, 'min_samples_split': 10, 'n_estimators': 200}
Best RMSE: 0.6285949224086554
