# 04 - Baseline Models

Comparação rápida de modelos baseline (LinearRegression, Ridge, RandomForest; XGBoost opcional) com validação cruzada (KFold de 5 divisões, métrica RMSE).



In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

from src.data_ingestion import load_wine_dataframe
from src.data_processing import DataPreprocessor

# Dataset location; each value can be overridden through its environment
# variable and otherwise falls back to the default given here.
HF_REPO = os.environ.get("HF_DATASET_REPO", "henriquebap/wine-ml-dataset")
FILENAME = os.environ.get("HF_DATASET_FILENAME", "WineQT.csv")

# Column names used as model inputs, in the order they are selected.
FEATURES = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# Fetch the raw table from the configured repository/file.
df = load_wine_dataframe(filename=FILENAME, repo_id=HF_REPO)

# Fit the project preprocessor and apply it to the same data.
# NOTE(review): the preprocessor is fitted on the full dataset before the
# cross-validation split below; if it learns statistics from the data
# (e.g. scaling or imputation) that would leak information across folds.
# Confirm what DataPreprocessor.fit_transform actually does.
pre = DataPreprocessor(target_column="quality", feature_columns=FEATURES)
df_p = pre.fit_transform(df)

# Split the processed table into the feature matrix and the target.
X, y = df_p[FEATURES], df_p["quality"]

# Baseline regressors to compare, keyed by the label shown in the results
# table. The forest uses a fixed seed so repeated runs give the same scores.
linear_baseline = LinearRegression()
ridge_baseline = Ridge(alpha=1.0)
forest_baseline = RandomForestRegressor(n_estimators=300, random_state=42)

models = dict(
    Linear=linear_baseline,
    Ridge=ridge_baseline,
    RF=forest_baseline,
)

# Shuffled 5-fold split with a fixed seed, shared by every model so that all
# of them are scored on the same folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

results = []
for name, model in models.items():
    # The scorer returns negated RMSE (one value per fold), because
    # scikit-learn scorers follow a "greater is better" convention.
    scores = cross_val_score(
        estimator=model,
        X=X,
        y=y,
        scoring='neg_root_mean_squared_error',
        cv=cv,
    )
    mean_rmse = -scores.mean()  # flip the sign back to a positive RMSE
    fold_spread = scores.std()  # spread is unaffected by the sign
    results.append(dict(model=name, rmse_mean=mean_rmse, rmse_std=fold_spread))

# Last expression of the cell: the table, best (lowest mean RMSE) first.
pd.DataFrame(results).sort_values(by="rmse_mean")
