# 07 - Model Evaluation

Avaliação final com hold-out: métricas (RMSE, MAE, R²), gráfico de resíduos e importância das features.



In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor

import sys
import os

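# Append the parent of the current working directory (the project root when
# this runs from the notebooks folder) to sys.path so `src` can be imported.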
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.data_ingestion import load_wine_dataframe
from src.data_processing import DataPreprocessor

# Dataset repository id and file name handed to load_wine_dataframe; each can
# be overridden through its environment variable.
HF_REPO = os.environ.get("HF_DATASET_REPO", "henriquebap/wine-ml-dataset")
FILENAME = os.environ.get("HF_DATASET_FILENAME", "WineQT.csv")

# Input columns used as model features; "quality" is the target.
FEATURES = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

# Load the raw table and run the project's preprocessor over it.
# NOTE(review): fit_transform is applied to the full frame before the
# train/test split; if DataPreprocessor learns statistics from the data
# (e.g. scaling parameters) this leaks hold-out information — confirm against
# src.data_processing.
df = load_wine_dataframe(repo_id=HF_REPO, filename=FILENAME)
pre = DataPreprocessor(feature_columns=FEATURES, target_column="quality")
df_p = pre.fit_transform(df)

X, y = df_p[FEATURES], df_p["quality"]

# 80/20 hold-out split with a fixed seed so the evaluation is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 400-tree random forest on the training split (fixed seed).
model = RandomForestRegressor(n_estimators=400, random_state=42)
model.fit(X_train, y_train)

# Score the hold-out split.
pred = model.predict(X_test)
# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this form works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

print({"rmse": rmse, "mae": mae, "r2": r2})

# Residuals plot: hold-out residuals (actual - predicted) against predictions.
resid = y_test - pred
sns.scatterplot(x=pred, y=resid)
plt.axhline(0, color='red', linestyle='--')  # zero-error reference line
# Label both axes explicitly: `pred` carries no name, and seaborn may otherwise
# label the y-axis with the Series name carried over from y_test instead of
# describing it as a residual.
plt.xlabel('Predicted quality')
plt.ylabel('Residual (actual - predicted)')
plt.title('Residuals vs Predictions')
plt.show()

# Feature importance: the fitted model's importances keyed by feature name,
# sorted from largest to smallest.
importances = pd.Series(
    data=model.feature_importances_,
    index=FEATURES,
).sort_values(ascending=False)
importances  # bare expression so the notebook cell displays the ranking
