In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.model_selection import RepeatedKFold, train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso

sns.set_theme()

In [2]:
# Load the cleaned dataset and derive the calendar features in one chained
# expression, so re-running the cell always starts from the file on disk.
data = pd.read_csv("../data/cleaned.csv", parse_dates=["data"]).assign(
    # Explicit datetime conversion on top of parse_dates.
    data=lambda frame: pd.to_datetime(frame["data"]),
    # Day of week as an integer (pandas convention: Monday=0 ... Sunday=6).
    dia_semana=lambda frame: frame["data"].dt.dayofweek,
    # 1 when the day of week is 5 or 6 (Saturday/Sunday), 0 otherwise.
    fim_de_semana=lambda frame: frame["dia_semana"].isin([5, 6]).astype(int),
)

# Feature columns used by every model in this notebook.
num_cols = [
    "regularidade",
    "duracao",
    "sono_leve_perc",
    "sono_profundo_perc",
    "REM_perc",
    "tempo_acordado",
    "vezes_acordado",
    "dia_semana",
    "fim_de_semana",
]

# Target vector and feature matrix.
y = data["pontuacao"]
X = data[num_cols]

In [None]:
# Preview only the first rows instead of rendering the whole feature matrix.
X.head()

In [None]:
# Preprocessing shared by the model pipelines: every feature is numeric, so a
# single StandardScaler is applied to all columns listed in num_cols.
preproc = ColumnTransformer([
    ('num', StandardScaler(), num_cols)
])

# Shared cross-validation scheme: 5 folds repeated 10 times with a fixed seed,
# so every model is scored on the same set of splits.
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)

In [None]:
def optimize_model(model, param_dist, model_name, X, y, preprocessor=None, cv=None):
    """Tune ``model`` inside a preprocessing pipeline with a randomized search.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn regressor; it becomes the ``"model"`` pipeline step.
    param_dist : dict
        Search space passed to ``RandomizedSearchCV``; keys must use the
        ``model__`` prefix to address the regressor inside the pipeline.
    model_name : str
        Label used only in the printed report.
    X : pandas.DataFrame
        Feature matrix (column names are needed by the default preprocessor).
    y : array-like
        Target values.
    preprocessor : transformer, optional
        ``"preproc"`` pipeline step. Defaults to a ``StandardScaler`` applied
        to every column of ``X``.
    cv : cross-validation splitter, optional
        Defaults to ``RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)``.

    Returns
    -------
    best_model : Pipeline
        Best pipeline found by the search (``search.best_estimator_``).
    best_mae : float
        Cross-validated mean absolute error of that pipeline (positive number).
    """
    # Build the defaults locally so the function does not depend on notebook
    # globals being defined (and up to date) at call time.
    if preprocessor is None:
        preprocessor = ColumnTransformer([
            ("num", StandardScaler(), list(X.columns))
        ])
    if cv is None:
        cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)

    pipeline = Pipeline([
        ("preproc", preprocessor),
        ("model", model)
    ])

    search = RandomizedSearchCV(
        pipeline, param_distributions=param_dist,
        n_iter=10, cv=cv, scoring="neg_mean_absolute_error",
        random_state=42, n_jobs=-1
    )

    search.fit(X, y)
    best_model = search.best_estimator_
    # The scorer is the *negative* MAE, so flip the sign to report a plain MAE.
    best_mae = -search.best_score_

    print(f"Melhores hiperparâmetros {model_name}: {search.best_params_}")
    print(f"MAE (CV): {best_mae:.4f}\n")

    return best_model, best_mae

In [3]:
# Hold out 20% of the rows as the test set; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Lasso

In [7]:
# Untuned Lasso baseline: default hyperparameters, fitted on the raw features.
lasso_model = Lasso()

# Cross-validated MAE on the training split. The model-comparison cell reads
# `mae_lasso`, so it has to be defined here. cross_val_score clones the
# estimator, so `lasso_model` itself is still unfitted after this call.
lasso_cv_scores = cross_val_score(
    lasso_model, X_train, y_train,
    cv=RepeatedKFold(n_splits=5, n_repeats=10, random_state=42),
    scoring="neg_mean_absolute_error"
)
# The scorer returns negative MAE values; flip the sign.
mae_lasso = -lasso_cv_scores.mean()
print(f"MAE (CV): {mae_lasso:.4f}\n")

# Fit on the training split and predict the hold-out set
lasso_model.fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)

# Evaluation metrics on the hold-out set
mae_test = mean_absolute_error(y_test, y_pred)
mse_test = mean_squared_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)

print("\n=== AVALIAÇÃO NO TEST SET ===")
print(f"MAE: {mae_test:.4f}")
print(f"MSE: {mse_test:.4f}")
print(f"MAPE: {mape_test:.4f}")
print(f"RMSE: {rmse_test:.4f}")
print(f"R²: {r2_test:.4f}")



=== AVALIAÇÃO NO TEST SET ===
MAE: 3.7763
MSE: 20.8802
MAPE: 0.0494
RMSE: 4.5695
R²: 0.6992


### Ridge

In [None]:
# Tune on the training split only, so the hold-out rows never influence
# hyperparameter selection and the test metrics below stay unbiased.
ridge_model, mae_ridge = optimize_model(
    model=Ridge(random_state=42),
    param_dist={
        "model__alpha": [0.1, 1.0, 10.0, 50],
        "model__solver": ["auto", "svd", "cholesky", "lsqr"]
    },
    model_name="Ridge",
    X=X_train,
    y=y_train
)

# Fit the selected pipeline on the training split and predict the hold-out set
ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

# Evaluation metrics on the hold-out set
mae_test = mean_absolute_error(y_test, y_pred)
mse_test = mean_squared_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)

print("\n=== AVALIAÇÃO NO TEST SET ===")
print(f"MAE: {mae_test:.4f}")
print(f"MSE: {mse_test:.4f}")
print(f"MAPE: {mape_test:.4f}")
print(f"RMSE: {rmse_test:.4f}")
print(f"R²: {r2_test:.4f}")

### HistGradient Boosting Regressor

In [None]:
# Tune on the training split only, so the hold-out rows never influence
# hyperparameter selection and the test metrics below stay unbiased.
hgb_model, mae_hgb = optimize_model(
    model=HistGradientBoostingRegressor(random_state=42),
    param_dist={
        "model__max_iter": [100, 300, 500],
        "model__max_depth": [3, 5, 10, None],
        "model__min_samples_leaf": [20, 50, 100],
        "model__learning_rate": [0.01, 0.1, 0.2]
    },
    model_name="HGB",
    X=X_train,
    y=y_train
)

# Fit the selected pipeline on the training split and predict the hold-out set
hgb_model.fit(X_train, y_train)
y_pred = hgb_model.predict(X_test)

# Evaluation metrics on the hold-out set
mae_test = mean_absolute_error(y_test, y_pred)
mse_test = mean_squared_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)

print("\n=== AVALIAÇÃO NO TEST SET ===")
print(f"MAE: {mae_test:.4f}")
print(f"MSE: {mse_test:.4f}")
print(f"MAPE: {mape_test:.4f}")
print(f"RMSE: {rmse_test:.4f}")
print(f"R²: {r2_test:.4f}")

In [None]:
# Untuned HistGradientBoosting reference: default hyperparameters, fixed seed.
model = HistGradientBoostingRegressor(random_state=42)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

# Hold-out metrics. RMSE is derived from MSE, so MSE is computed first.
mse_test = mean_squared_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
mae_test = mean_absolute_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
r2_test = r2_score(y_test, y_pred)

print("\n=== AVALIAÇÃO NO TEST SET ===")
print(f"MAE: {mae_test:.4f}")
print(f"MSE: {mse_test:.4f}")
print(f"MAPE: {mape_test:.4f}")
print(f"RMSE: {rmse_test:.4f}")
print(f"R²: {r2_test:.4f}")

### Random Forest Regressor

In [None]:
# Tune on the training split only, so the hold-out rows never influence
# hyperparameter selection.
random_forest_model, mae_random_forest = optimize_model(
    model=RandomForestRegressor(random_state=42),
    param_dist={
        "model__n_estimators": [100, 300, 500],
        "model__max_depth": [None, 5, 10, 20],
        "model__min_samples_leaf": [1, 2, 4],
        "model__max_features": ["sqrt", "log2", 0.5]
    },
    # Label used in the printed report (was a copy-pasted "HGB").
    model_name="Random Forest",
    X=X_train,
    y=y_train
)

### Linear Regression

In [None]:
# Linear regression has no hyperparameters to tune, so it is only
# cross-validated inside the shared preprocessing pipeline.
linear_pipeline = Pipeline([
    ("preproc", preproc),
    ("model", LinearRegression())
])

# Score on the training split only, keeping the hold-out rows unseen.
# The scorer returns negative MAE values, hence the sign flip.
linear_scores = -cross_val_score(
    linear_pipeline, X_train, y_train, cv=cv,
    scoring="neg_mean_absolute_error"
)
mae_linear = linear_scores.mean()
print(f"Linear Regression MAE (CV): {mae_linear:.4f}\n")

# Keep the *fitted pipeline* rather than a bare, unfitted LinearRegression:
# downstream cells call .predict() and read .named_steps["model"].
linear_model = linear_pipeline.fit(X_train, y_train)


### Comparação de Modelos

In [None]:
# Cross-validated MAE of every candidate (lower is better)
models_comparison = {
    "Linear Regression": mae_linear,
    "Lasso": mae_lasso,
    "Ridge": mae_ridge,
    "HistGradientBoosting": mae_hgb,
    "Random Forest": mae_random_forest
}

# Matching estimators, keyed by the same names
trained_models = {
    "Linear Regression": linear_model,
    "Lasso": lasso_model,
    "Ridge": ridge_model,
    "HistGradientBoosting": hgb_model,
    "Random Forest": random_forest_model
}

# Pick the candidate with the smallest cross-validated MAE
best_model_name = min(models_comparison, key=models_comparison.get)
best_model = trained_models[best_model_name]

print("=== COMPARAÇÃO DE MODELOS ===")
# Dedicated loop names: iterating with `model` would overwrite the
# notebook-level `model` estimator with a plain string.
for candidate_name, candidate_mae in models_comparison.items():
    print(f"{candidate_name}: {candidate_mae:.4f}")

print(f"\nMELHOR MODELO: {best_model_name}")


In [None]:
# Hold-out split: a seeded random 80/20 shuffle (train_test_split shuffles by
# default), so rows are NOT kept in chronological order. Pass shuffle=False for
# a time-ordered split -- assuming the rows are sorted by date; TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

# Refit the model selected by the comparison (not a hard-coded one), so the
# metrics, plots and summary below all describe `best_model_name`.
best_model.fit(X_train, y_train)
y_pred = best_model.predict(X_test)

# Evaluation metrics on the hold-out set
mae_test = mean_absolute_error(y_test, y_pred)
mse_test = mean_squared_error(y_test, y_pred)
mape_test = mean_absolute_percentage_error(y_test, y_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_pred)

print("\n=== AVALIAÇÃO NO TEST SET ===")
print(f"MAE: {mae_test:.4f}")
print(f"MSE: {mse_test:.4f}")
print(f"MAPE: {mape_test:.4f}")
print(f"RMSE: {rmse_test:.4f}")
print(f"R²: {r2_test:.4f}")

In [None]:
# Scatter plot: predictions against actual values, with the y = x reference line
fig_pred, ax_pred = plt.subplots(figsize=(10, 6))
ax_pred.scatter(y_test, y_pred, alpha=0.7)
target_range = [y_test.min(), y_test.max()]
ax_pred.plot(target_range, target_range, 'r--', lw=2)
ax_pred.set_xlabel('Valor Real')
ax_pred.set_ylabel('Predição')
ax_pred.set_title(f'Predições vs Valores Reais - {best_model_name}')
ax_pred.grid(True, alpha=0.3)
plt.show()

# Residual plot: residuals (actual - predicted) against the predictions
residuals = y_test - y_pred
fig_res, ax_res = plt.subplots(figsize=(10, 6))
ax_res.scatter(y_pred, residuals, alpha=0.7)
ax_res.axhline(y=0, color='r', linestyle='--')
ax_res.set_xlabel('Predições')
ax_res.set_ylabel('Resíduos')
ax_res.set_title('Análise de Resíduos')
ax_res.grid(True, alpha=0.3)
plt.show()

In [None]:
def predict_sono_quality(model, features_dict):
    """Predict the sleep score for a single new observation.

    Parameters
    ----------
    model : object with a ``predict`` method
        Fitted estimator or pipeline. When it exposes ``feature_names_in_``,
        the input is validated against it and reordered to match.
    features_dict : dict
        Mapping of feature name -> value for one observation.

    Returns
    -------
    The first element of ``model.predict(...)`` for the one-row input.

    Raises
    ------
    ValueError
        If the model declares its expected features and some are missing
        from ``features_dict``.
    """
    # One-row DataFrame; column order follows the dict's insertion order.
    input_df = pd.DataFrame([features_dict])

    # Align with the columns the model declares, when it declares any: report
    # missing features explicitly, ignore extra keys, and fix the column order
    # so the result does not depend on how the caller ordered the dict.
    expected = getattr(model, "feature_names_in_", None)
    if expected is not None:
        missing = [col for col in expected if col not in input_df.columns]
        if missing:
            raise ValueError(f"Features ausentes para a previsão: {missing}")
        input_df = input_df[list(expected)]

    prediction = model.predict(input_df)[0]

    print(f"Previsão de pontuação de sono: {prediction:.1f}/100")
    return prediction


In [8]:
# Resolve the final estimator whether `best_model` is a Pipeline (it then
# exposes `named_steps`) or a bare estimator.
pipeline_steps = getattr(best_model, "named_steps", None)
final_estimator = pipeline_steps["model"] if pipeline_steps is not None else best_model

# Linear models expose signed coefficients; tree ensembles such as random
# forests expose non-negative feature_importances_.
if hasattr(final_estimator, "coef_"):
    importances = np.ravel(final_estimator.coef_)
elif hasattr(final_estimator, "feature_importances_"):
    importances = final_estimator.feature_importances_
else:
    print("Modelo não possui coeficientes nem feature_importances_")
    importances = None

if importances is not None:
    # Rank features by absolute importance and keep at most the top 15
    feat_imp = (
        pd.DataFrame({"feature": num_cols, "importance": importances})
        .sort_values("importance", key=abs, ascending=False)
        .head(15)
    )

    # Red for negative contributions, green for positive ones
    colors = ['red' if x < 0 else 'green' for x in feat_imp['importance']]

    # Plain matplotlib bars with per-bar colors: passing `palette` to
    # sns.barplot without `hue` is deprecated in seaborn 0.13.
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(feat_imp["feature"], feat_imp["importance"], color=colors)
    ax.invert_yaxis()  # most important feature at the top
    ax.set_title("Importância das Variáveis", fontsize=14, fontweight='bold')
    ax.set_xlabel("Importância", fontsize=12)
    ax.set_ylabel("Variáveis", fontsize=12)
    ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    fig.tight_layout()
    plt.show()

NameError: name 'best_model' is not defined

In [None]:
# Usage example: one observation with exactly the features listed in
# `num_cols`, in the same order (the models were fitted on those columns).
exemplo_dados = {
    "regularidade": 45,
    "duracao": 500,
    "sono_leve_perc": 65.0,
    "sono_profundo_perc": 15.0,
    "REM_perc": 20.0,
    "tempo_acordado": 10,
    "vezes_acordado": 2,
    "dia_semana": 5,      # Saturday (Monday=0), consistent with fim_de_semana=1
    "fim_de_semana": 1
}
predict_sono_quality(best_model, exemplo_dados)

In [None]:
# Final report: selected model, its cross-validated MAE, and the hold-out metrics.
separator = "=" * 50
summary_lines = [
    separator,
    "RESUMO DA MODELAGEM",
    separator,
    f"Melhor modelo: {best_model_name}",
    f"MAE Cross-Validation: {models_comparison[best_model_name]:.4f}",
    f"MAE Test Set: {mae_test:.4f}",
    f"R² Score: {r2_test:.4f}",
    f"Número de features: {len(num_cols)}",
    separator,
]
print("\n".join(summary_lines))