# CP02_001 — Regressão (Appliances) & Classificação (Smart Grid Stability)
Este notebook atende ao enunciado CP02_001: regressão com **Appliances Energy Prediction** e classificação com **Smart Grid Stability**. Métricas: R², RMSE, MAE (regressão); acurácia, F1-score, matriz de confusão (classificação).

**Estrutura de dados esperada:**
- `data/appliances/energydata_complete.csv`
- `data/smart_grid/smart_grid_stability.csv`


In [None]:

# Imports comuns
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Fixed seed passed to the train/test splits and to the seeded estimators below
# so that repeated runs of the notebook give the same results.
RANDOM_STATE = 42

def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    Computed directly with NumPy rather than through
    ``mean_squared_error(..., squared=False)``: the ``squared`` keyword was
    deprecated in scikit-learn 1.4 and later removed, so relying on it makes
    the notebook fail on current releases.

    Args:
        y_true: Array-like of shape (n_samples,) or (n_samples, n_outputs)
            with the ground-truth values.
        y_pred: Array-like of the same shape with the predicted values.

    Returns:
        The RMSE as a Python float. For 2-D input the RMSE is computed per
        output column and the columns are averaged uniformly.

    Raises:
        ValueError: If the inputs are empty or their shapes do not match.
    """
    truth = np.asarray(y_true, dtype=float)
    guess = np.asarray(y_pred, dtype=float)
    # Treat 1-D input as a single output column so that (n,) and (n, 1)
    # inputs compare element-wise instead of broadcasting to (n, n).
    if truth.ndim == 1:
        truth = truth.reshape(-1, 1)
    if guess.ndim == 1:
        guess = guess.reshape(-1, 1)
    if truth.shape != guess.shape:
        raise ValueError(
            f"y_true and y_pred have inconsistent shapes: {truth.shape} vs {guess.shape}"
        )
    if truth.shape[0] == 0:
        raise ValueError("rmse requires at least one sample")
    per_output = np.sqrt(np.mean((truth - guess) ** 2, axis=0))
    return float(per_output.mean())


## Parte 1 — Regressão (Appliances Energy Prediction)
Modelos: **LinearRegression**, **DecisionTreeRegressor**, **RandomForestRegressor**. Métricas: **R²**, **RMSE**, **MAE**.

In [None]:

# Regression cell: load the Appliances data (or build a synthetic stand-in when
# the CSV is absent), then fit three regressors and score them on a held-out split.
appliances_path = "data/appliances/energydata_complete.csv"
if os.path.exists(appliances_path):
    df = pd.read_csv(appliances_path)
    # Only numeric columns are kept as feature candidates, which leaves out
    # any non-numeric column (e.g. a textual timestamp).
    candidates = list(df.select_dtypes(include=[np.number]).columns)
    if "Appliances" not in df.columns:
        raise ValueError("Coluna alvo 'Appliances' não encontrada no CSV.")
    y = df["Appliances"]
    # Features are every numeric column except the target itself.
    X = df[[col for col in candidates if col != "Appliances"]]
else:
    print("AVISO: arquivo real não encontrado:", appliances_path)
    print("Gerando dados sintéticos somente para demonstrar o pipeline.")
    rng = np.random.RandomState(0)
    n_rows = 2000
    # The draws below must stay in this order: they all consume the same
    # seeded generator, so reordering would change the synthetic data.
    X = pd.DataFrame({
        "T1": rng.normal(20, 3, n_rows),
        "RH_1": rng.normal(45, 8, n_rows),
        "T_out": rng.normal(15, 5, n_rows),
        "Press_mm_hg": rng.normal(760, 10, n_rows),
        "Windspeed": rng.normal(3.5, 1, n_rows),
    })
    noise = rng.normal(0, 2, n_rows)
    # Linear combination of three features plus Gaussian noise.
    y = (0.8*X["T1"] - 0.5*X["RH_1"] + 0.2*X["T_out"] + noise)*10 + 100

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "RandomForestRegressor": RandomForestRegressor(
        n_estimators=200, random_state=RANDOM_STATE
    ),
}

# One result row per model: R2, RMSE and MAE on the test split.
results = []
for name, model in models.items():
    pred = model.fit(X_train, y_train).predict(X_test)
    results.append(dict(
        modelo=name,
        R2=r2_score(y_test, pred),
        RMSE=rmse(y_test, pred),
        MAE=mean_absolute_error(y_test, pred),
    ))

# Best R2 first; the bare expression on the last line renders the table in the notebook.
res_appliances = pd.DataFrame(results).sort_values(by="R2", ascending=False)
res_appliances


## Parte 2 — Classificação (Smart Grid Stability)
Modelos: **DecisionTreeClassifier**, **KNeighborsClassifier**, **LogisticRegression**. Métricas: **acurácia**, **F1 (média ponderada)** e **matriz de confusão** (exibida para o modelo com melhor F1).

In [None]:

# Classification cell: load the Smart Grid Stability data (or build a synthetic
# stand-in when the CSV is absent), fit three classifiers, compare them by
# accuracy and weighted F1, and plot the confusion matrix of the best one.
smart_path = "data/smart_grid/smart_grid_stability.csv"
if not os.path.exists(smart_path):
    print("AVISO: arquivo real não encontrado:", smart_path)
    print("Gerando dados sintéticos de estabilidade de rede.")
    rng = np.random.RandomState(1)
    X = pd.DataFrame({
        "P_active": rng.normal(50, 15, 2000),
        "P_reactive": rng.normal(20, 5, 2000),
        "Voltage": rng.normal(220, 5, 2000),
        "Current": rng.normal(10, 3, 2000),
    })
    # Binary target (0 = stable, 1 = unstable)
    y = ((X["P_active"] - 0.5*X["P_reactive"] + rng.normal(0,5,2000)) > 50).astype(int)
else:
    df = pd.read_csv(smart_path)
    # Look for a typical target column name (case-insensitive).
    target_candidates = [c for c in df.columns if c.lower() in ["stabf","stable","target","label","class"]]
    if not target_candidates:
        raise ValueError("Não encontrei a coluna alvo (ex.: 'stabf', 'stable').")
    target_col = target_candidates[0]
    y = df[target_col]
    # Encode a textual target as integer codes. Checking "not numeric" instead
    # of `dtype == object` also covers pandas' dedicated string dtypes.
    if not pd.api.types.is_numeric_dtype(y):
        y = y.astype("category").cat.codes
    # In the UCI grid-stability data, `stabf` is the categorical form of the
    # continuous `stab` column; keeping either one among the features leaks
    # the answer to the classifiers, so both are removed from X.
    leak_cols = [c for c in df.columns if c != target_col and c.lower() in ("stab", "stabf")]
    # Keep only numeric features, as done for the regression data, so that a
    # stray text column does not break the estimators.
    X = df.drop(columns=[target_col, *leak_cols]).select_dtypes(include=[np.number])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

clf_models = {
    "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    # KNN is distance-based, so it gets the same standardization as the
    # logistic regression; otherwise large-scale features dominate the metric.
    "KNN": Pipeline([("scaler", StandardScaler()), ("clf", KNeighborsClassifier(n_neighbors=7))]),
    "LogisticRegression": Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression(max_iter=1000))])
}

rows = []
test_preds = {}  # predictions per model, reused for the confusion matrix
for name, clf in clf_models.items():
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    test_preds[name] = pred
    rows.append({
        "modelo": name,
        "accuracy": accuracy_score(y_test, pred),
        "f1": f1_score(y_test, pred, average="weighted"),
    })

res_smart = pd.DataFrame(rows).sort_values("f1", ascending=False)
display(res_smart)

# Confusion matrix of the best model (highest weighted F1)
best_name = res_smart.iloc[0]["modelo"]
best = clf_models[best_name]
pred_best = test_preds[best_name]
# Fix the label order explicitly so the matrix rows/columns and the tick labels agree.
cm = confusion_matrix(y_test, pred_best, labels=best.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=best.classes_)
disp.plot()
plt.title(f"Matriz de confusão — {best_name}")
plt.show()
