# Wine Quality (Red) — Analysis Notebook

Este notebook executa todo o fluxo pedido no trabalho: EDA, pré-processamento, treinamento e avaliação de modelos (Random Forest e SVM). Salve `winequality-red.csv` na pasta `data/` ou ajuste o caminho em `DATA_PATH`.

In [None]:
# Setup: imports e configurações iniciais
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, classification_report, confusion_matrix)
import joblib

# Seed shared by the randomized steps of this notebook so runs are reproducible.
RANDOM_STATE = 42

# Location of the input CSV; adjust if the dataset lives elsewhere.
DATA_PATH = Path('data') / 'winequality-red.csv'

# Folder that receives the files written by this notebook; created up front.
OUTPUT_DIR = Path('outputs_notebook')
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

print('Configured. DATA_PATH =', DATA_PATH)

In [None]:
# Load the semicolon-delimited dataset and preview its first rows
df = pd.read_csv(DATA_PATH, delimiter=';')
df.head(5)

In [None]:
# Basic information: dimensions, column names and per-column null counts
print('Shape:', df.shape)
print('\nColumns:', list(df.columns))
print('\nMissing values per column:')
print(df.isna().sum())

In [None]:
# Descriptive statistics, transposed so that each column of df becomes a row
df.describe().transpose()

In [None]:
# Distribution of the target variable (quality)
# NOTE(review): mtick is imported here but never referenced in the visible notebook.
import matplotlib.ticker as mtick
# Number of wines per quality score, ordered by score rather than by frequency
counts = df['quality'].value_counts().sort_index()
plt.figure(figsize=(8,4))
# Scores are passed as strings, so the bars are drawn as categories; the
# annotations below therefore use the enumerate index as the x coordinate.
plt.bar(counts.index.astype(str), counts.values)
plt.xlabel('Quality score')
plt.ylabel('Count')
plt.title('Distribution of wine quality (raw scores)')
# Write each count centered just above the top of its bar
for i,v in enumerate(counts.values):
    plt.text(i, v, str(v), ha='center', va='bottom')
plt.tight_layout()
plt.show()

In [None]:
# Convert to a binary classification target: good (>= 7) vs not good (< 7)
threshold = 7
df['good'] = (df['quality'] >= threshold).astype(int)
print(df['good'].value_counts())

# value_counts() orders its result by frequency, not by label. Pin the order
# to [0, 1] so each bar always matches its hard-coded label, even if 'good'
# becomes the majority class (e.g. with a lower threshold) or a class is
# absent (it is then drawn with height 0).
class_counts = df['good'].value_counts().reindex([0, 1], fill_value=0)
plt.figure(figsize=(6, 4))
plt.bar(['not_good', 'good'], class_counts.values)
plt.title(f'Binary classes (threshold = {threshold})')
plt.ylabel('Count')
plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap drawn with plain matplotlib
# The derived binary column is dropped; the raw 'quality' score stays in.
corr = df.drop(columns=['good']).corr()

fig, ax = plt.subplots(figsize=(10, 8))
cax = ax.imshow(corr, aspect='auto', interpolation='nearest')

# One tick per column on both axes, labelled with the column names
tick_positions = np.arange(corr.shape[1])
ax.set_xticks(tick_positions)
ax.set_xticklabels(corr.columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(corr.columns)

ax.set_title('Correlation matrix (features)')
fig.colorbar(cax, ax=ax)
plt.tight_layout()
plt.show()

In [None]:
# Pairwise scatter matrix over a subset of the columns (histograms on the diagonal)
subset = [
    'fixed acidity',
    'volatile acidity',
    'citric acid',
    'residual sugar',
    'alcohol',
    'quality',
]
pd.plotting.scatter_matrix(df.loc[:, subset], diagonal='hist', figsize=(12, 12))
plt.suptitle('Pairwise scatter (subset of features)')
plt.tight_layout()
plt.show()

In [None]:
# Preprocessing: separate features from the target, then split train/test
# Both the raw score and the derived label are removed from the features.
X = df.drop(columns=['quality', 'good'])
y = df['good']

# Stratified 80/20 split so both parts keep the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=y,
)
print('Train shape:', X_train.shape, 'Test shape:', X_test.shape)
print('Train class distribution:\n', y_train.value_counts())


In [None]:
# Model pipelines and hyper-parameter grids for the grid search

# Random Forest: single-step pipeline, with class_weight='balanced' set on
# the classifier.
rf_pipeline = Pipeline([('clf', RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced'))])
rf_grid = {
    'clf__n_estimators': [100, 200],
    'clf__max_depth': [None, 10, 20],
    'clf__min_samples_split': [2, 5]
}

# SVM: the scaler is a pipeline step, so it is fitted together with the
# classifier each time the pipeline is fitted.
svm_pipeline = Pipeline([('scaler', StandardScaler()), ('svc', SVC(probability=True, random_state=RANDOM_STATE))])

# One sub-grid per kernel: gamma is only used by the 'rbf' kernel here, the
# linear kernel ignores it. A single flat grid would fit every linear
# candidate once per gamma value for identical results.
svm_grid = [
    {
        'svc__kernel': ['rbf'],
        'svc__C': [0.1, 1, 10],
        'svc__gamma': ['scale', 'auto'],
    },
    {
        'svc__kernel': ['linear'],
        'svc__C': [0.1, 1, 10],
    },
]

print('Pipelines and grids defined.')

In [None]:
# Utility functions to train with a grid search and to evaluate the result
def run_gridsearch(pipeline, grid, X_train, y_train, scoring='f1', cv=5):
    """Tune ``pipeline`` over ``grid`` with shuffled, stratified k-fold CV.

    Args:
        pipeline: Estimator (here a scikit-learn pipeline) to tune.
        grid: Parameter grid handed to ``GridSearchCV``.
        X_train: Training features.
        y_train: Training labels.
        scoring: Scoring name used to rank the candidates.
        cv: Number of folds for the ``StratifiedKFold`` splitter.

    Returns:
        The fitted ``GridSearchCV`` object (created with ``refit=True``).
    """
    # Seeded shuffling keeps the fold assignment identical between runs.
    folds = StratifiedKFold(n_splits=cv, shuffle=True, random_state=RANDOM_STATE)
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=grid,
        scoring=scoring,
        cv=folds,
        n_jobs=-1,
        verbose=1,
        refit=True,
    )
    search.fit(X_train, y_train)
    return search

def evaluate(estimator, X_test, y_test):
    """Score a fitted binary classifier on held-out data.

    Args:
        estimator: Fitted classifier exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X_test: Held-out features.
        y_test: True labels for ``X_test``.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``classification_report`` (nested dict), ``confusion_matrix``
        (nested list) and ``roc_auc``. ``roc_auc`` is ``None`` when the
        estimator exposes neither ``predict_proba`` nor ``decision_function``.
    """
    y_pred = estimator.predict(X_test)

    # Look the scoring method up instead of wrapping the calls in a broad
    # ``except Exception``: a missing method still falls through to the next
    # option, but a genuine failure inside the call is no longer hidden.
    y_score = None
    score_fn = getattr(estimator, 'predict_proba', None)
    if score_fn is not None:
        # Column 1 holds the score of the second (positive) class.
        y_score = score_fn(X_test)[:, 1]
    else:
        score_fn = getattr(estimator, 'decision_function', None)
        if score_fn is not None:
            # ROC AUC depends only on the ranking of the scores, so the raw
            # decision values are used as-is (no rescaling to [0, 1] needed).
            y_score = score_fn(X_test)

    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, zero_division=0),
        'recall': recall_score(y_test, y_pred, zero_division=0),
        'f1': f1_score(y_test, y_pred, zero_division=0),
        'classification_report': classification_report(y_test, y_pred, zero_division=0, output_dict=True),
        'confusion_matrix': confusion_matrix(y_test, y_pred).tolist(),
        'roc_auc': roc_auc_score(y_test, y_score) if y_score is not None else None,
    }
    return metrics

In [None]:
# Train the Random Forest with a grid search, then score it on the test split
rf_gs = run_gridsearch(
    pipeline=rf_pipeline,
    grid=rf_grid,
    X_train=X_train,
    y_train=y_train,
    scoring='f1',
    cv=5,
)
print('Best RF params:', rf_gs.best_params_)
print('Best RF CV score (f1):', rf_gs.best_score_)

# Evaluate the best estimator found by the search on the held-out data
rf_metrics = evaluate(estimator=rf_gs.best_estimator_, X_test=X_test, y_test=y_test)
rf_metrics

In [None]:
# Train the SVM with a grid search (scaling happens inside the pipeline)
svm_gs = run_gridsearch(
    pipeline=svm_pipeline,
    grid=svm_grid,
    X_train=X_train,
    y_train=y_train,
    scoring='f1',
    cv=5,
)
print('Best SVM params:', svm_gs.best_params_)
print('Best SVM CV score (f1):', svm_gs.best_score_)

# Evaluate the best estimator found by the search on the held-out data
svm_metrics = evaluate(estimator=svm_gs.best_estimator_, X_test=X_test, y_test=y_test)
svm_metrics

In [None]:
# Comparison and summary: one row per model, scalar metrics only
def _scalar_metrics(metrics):
    """Keep the scalar scores; the report and confusion matrix are left out."""
    wanted = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
    return {name: value for name, value in metrics.items() if name in wanted}


summary = pd.DataFrame([
    {'model': 'random_forest', **_scalar_metrics(rf_metrics)},
    {'model': 'svm', **_scalar_metrics(svm_metrics)},
])
summary.to_csv(OUTPUT_DIR / 'summary_results.csv', index=False)
summary

In [None]:
# Plot confusion matrices
import numpy as np
def plot_cm(cm, title, fname):
    """Draw a confusion matrix, save it under ``OUTPUT_DIR`` and show it.

    Args:
        cm: Confusion matrix of integer counts (nested list or array). Rows
            are labelled as true classes, columns as predicted classes.
        title: Title placed above the plot.
        fname: File name of the saved image, relative to ``OUTPUT_DIR``.
    """
    matrix = np.array(cm)
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)

    # Both axes use the same two class labels.
    ax.set_xticks([0, 1])
    ax.set_xticklabels(['not_good', 'good'])
    ax.set_yticks([0, 1])
    ax.set_yticklabels(['not_good', 'good'])
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(title)

    # Cells above half of the largest count get white text, the rest black,
    # so the number stays readable against the cell color.
    cutoff = matrix.max() / 2.
    for row, col in np.ndindex(matrix.shape[0], matrix.shape[1]):
        count = matrix[row, col]
        text_color = 'white' if count > cutoff else 'black'
        ax.text(col, row, format(count, 'd'), ha='center', va='center',
                color=text_color)

    plt.tight_layout()
    fig.savefig(OUTPUT_DIR / fname)
    plt.show()

# One confusion-matrix image per model
plot_cm(
    rf_metrics['confusion_matrix'],
    title='Confusion Matrix - Random Forest',
    fname='cm_rf.png',
)
plot_cm(
    svm_metrics['confusion_matrix'],
    title='Confusion Matrix - SVM',
    fname='cm_svm.png',
)

In [None]:
# Feature importances of the tuned Random Forest (best-effort: any failure
# is reported with a message instead of stopping the notebook)
try:
    # 'clf' is the name given to the classifier step of the RF pipeline
    rf_est = rf_gs.best_estimator_.named_steps.get('clf')
    importances = rf_est.feature_importances_
    fi = (
        pd.DataFrame({'feature': X.columns, 'importance': importances})
        .sort_values(by='importance', ascending=False)
    )
    display(fi)
    fi.to_csv(OUTPUT_DIR / 'feature_importances_rf.csv', index=False)
except Exception as e:
    print('Could not extract feature importances:', e)

In [None]:
# Save the final models (best estimator of each grid search)
joblib.dump(value=rf_gs.best_estimator_, filename=OUTPUT_DIR / 'rf_best.joblib')
joblib.dump(value=svm_gs.best_estimator_, filename=OUTPUT_DIR / 'svm_best.joblib')
print('Models saved to', OUTPUT_DIR)

## Conclusões parciais

- Execute a análise acima e compare as métricas de F1, recall e precisão.
- Se a classe `good` estiver desbalanceada, considere técnicas adicionais (SMOTE, undersampling, ajuste de class_weight).
- Documente no relatório (README e slides) as escolhas de pré-processamento, métrica de otimização e resultados.

### Próximos passos sugeridos

1. Experimentar mais modelos (XGBoost, Logistic Regression).
2. Realizar análise de sensibilidade no threshold para a binarização.
3. Testar pipelines com seleção de features e validação cruzada aninhada (nested CV).
