# CP02_002 — Classificação (Solar Radiation) & Regressão (Wind Turbine SCADA)
Este notebook atende ao enunciado CP02_002 com os **splits exigidos**:
- Solar (classificação): **70/30**, alvo = Alta/Baixa por **mediana**.
- Eólica (regressão): **80/20**.


In [None]:

# Imports
import os, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import r2_score, mean_squared_error

import matplotlib.pyplot as plt

# Seed passed to every split and model below via random_state.
RANDOM_STATE = 42


def rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that keyword was deprecated in scikit-learn 1.4
    and removed in 1.6, while this form works on every release.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The RMSE as a non-negative float.

    Note:
        For 2-D (multi-output) targets this is the square root of the
        output-averaged MSE. This notebook only uses single-target data.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred))


## Exercício 1 — Classificação (Solar Radiation)
- **Alvo**: Alta (1) / Baixa (0) pela **mediana** da coluna de radiação (valores ≥ mediana contam como Alta).
- **Split**: 70/30.
- **Modelos**: DecisionTree, RandomForest, **SVM com StandardScaler**.
- **Métricas**: acurácia + matriz de confusão.

In [None]:

# Exercise 1: load (or synthesize) the solar data, derive a binary target from
# the median radiation, then train and compare three classifiers on a 70/30
# stratified split.
solar_path = "data/solar/solar_radiation.csv"
if not os.path.exists(solar_path):
    # Fall back to synthetic data so the notebook still runs end to end.
    print("AVISO: arquivo real não encontrado:", solar_path)
    print("Gerando dados sintéticos com coluna 'Radiation'.")
    rng = np.random.RandomState(7)
    df = pd.DataFrame({
        "Temp": rng.normal(25, 5, 4000),
        "Pressure": rng.normal(1010, 7, 4000),
        "Humidity": rng.normal(55, 12, 4000),
        "WindSpeed": rng.normal(3.0, 1.0, 4000),
        "Radiation": rng.gamma(3., 100., 4000)
    })
else:
    df = pd.read_csv(solar_path)
    # Look for the radiation column under a few common names (case-insensitive).
    rad_cols = [c for c in df.columns if c.lower() in ["radiation","solar_radiation","radiacao","radiacao_solar","radiación"]]
    if not rad_cols:
        raise ValueError("Não encontrei coluna de radiação (ex.: 'Radiation').")
    # Normalize the name to 'Radiation' so the rest of the cell is uniform.
    df = df.rename(columns={rad_cols[0]:"Radiation"})

# Rows without a radiation reading cannot be labelled; without this they would
# silently fall into the "low" class because NaN >= median evaluates to False.
df = df.dropna(subset=["Radiation"])

# Binary target: 1 (high) when radiation >= median, 0 (low) otherwise.
median_rad = df["Radiation"].median()
y = (df["Radiation"] >= median_rad).astype(int)

# Keep only numeric predictors: real CSVs may carry date/time string columns
# that the estimators below cannot consume. Mirrors the wind-turbine cell.
X = df.drop(columns=["Radiation"]).select_dtypes(include=[np.number])
if X.shape[1] == 0:
    raise ValueError("Nenhuma coluna numérica disponível para usar como feature.")

# 70/30 split, stratified so both classes keep their proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=RANDOM_STATE, stratify=y)

models = {
    "DecisionTree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "RandomForest": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE),
    # The SVM is wrapped in a pipeline so features are standardized first.
    "SVM": Pipeline([("scaler", StandardScaler()), ("svc", SVC())])
}

# Fit every model and record its test-set accuracy.
rows = []
for name, clf in models.items():
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    rows.append({"modelo": name, "accuracy": accuracy_score(y_test, pred)})

res_solar = pd.DataFrame(rows).sort_values("accuracy", ascending=False)
display(res_solar)

# Confusion matrix for the most accurate model (first row after sorting).
best_name = res_solar.iloc[0]["modelo"]
best = models[best_name]
pred_best = best.predict(X_test)
cm = confusion_matrix(y_test, pred_best)
ConfusionMatrixDisplay(cm).plot()
plt.title(f"Matriz de confusão — Solar ({best_name})")
plt.show()

print("Mediana usada para o alvo (Radiation):", median_rad)


## Exercício 2 — Regressão (Wind Turbine SCADA)
- **Split**: 80/20.
- **Modelos**: Linear Regression, Decision Tree Regressor, Random Forest Regressor.
- **Métricas**: R² + RMSE.

In [None]:

# Exercise 2: load (or synthesize) the wind-turbine SCADA data, then fit three
# regressors on an 80/20 split and rank them by R2 (RMSE reported alongside).
wind_path = "data/wind/wind_turbine_scada.csv"
if os.path.exists(wind_path):
    dfw = pd.read_csv(wind_path)
    # Light cleanup: header names may carry stray surrounding whitespace.
    dfw.columns = dfw.columns.str.strip()
    target_col = "LV ActivePower (kW)"
    if target_col not in dfw.columns:
        raise ValueError(f"Coluna alvo '{target_col}' não encontrada no CSV.")
else:
    # No real file: build a synthetic frame so the notebook still runs.
    print("AVISO: arquivo real não encontrado:", wind_path)
    print("Gerando dados sintéticos com colunas típicas.")
    rng = np.random.RandomState(9)
    n_rows = 5000
    # Draw order matters for reproducibility: speed, power curve, direction.
    wind_speed = rng.uniform(0, 25, n_rows)
    theoretical_power = rng.uniform(0, 3500, n_rows)
    wind_direction = rng.uniform(0, 360, n_rows)
    dfw = pd.DataFrame({
        "Wind Speed (m/s)": wind_speed,
        "Theoretical_Power_Curve (kWh)": theoretical_power,
        "Wind Direction (°)": wind_direction,
    })
    # Target: linear mix of power curve and wind speed plus Gaussian noise.
    noise = rng.normal(0, 100, n_rows)
    dfw["LV ActivePower (kW)"] = (
        0.5 * dfw["Theoretical_Power_Curve (kWh)"]
        + 30 * dfw["Wind Speed (m/s)"]
        + noise
    )

# Every column except the target is a candidate; only numeric ones are used.
feature_candidates = dfw.columns[dfw.columns != "LV ActivePower (kW)"].tolist()
X = dfw[feature_candidates].select_dtypes(include=[np.number])
y = dfw["LV ActivePower (kW)"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=RANDOM_STATE,
)

reg_models = {
    "LinearRegression": LinearRegression(),
    "DecisionTreeRegressor": DecisionTreeRegressor(random_state=RANDOM_STATE),
    "RandomForestRegressor": RandomForestRegressor(
        n_estimators=300, random_state=RANDOM_STATE
    ),
}

# Fit each regressor and collect its test-set metrics.
rows = []
for name, reg in reg_models.items():
    pred = reg.fit(X_train, y_train).predict(X_test)
    rows.append(
        dict(modelo=name, R2=r2_score(y_test, pred), RMSE=rmse(y_test, pred))
    )

res_wind = pd.DataFrame(rows).sort_values(by="R2", ascending=False)
res_wind
