In [4]:
import pandas as pd
import statsmodels.api as sm
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# 1. Load the California housing dataset.
# X holds the features as a DataFrame whose columns carry the dataset's feature names.
data = fetch_california_housing()
X = pd.DataFrame(data.data, columns=data.feature_names)
# Name the target Series so downstream reports (e.g. the statsmodels summary)
# show the real variable name instead of the generic "y" placeholder.
y = pd.Series(data.target, name=data.target_names[0])

In [6]:
# 2. Split into train and test sets (20% held out; fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 3. Standardize the features (not mandatory, but recommended).
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into the fitted model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 4. Add the intercept column (statsmodels does not add one automatically).
# has_constant="add" always prepends the column. The default ("skip") silently
# adds nothing when the array already contains a constant column, which would
# leave the train and test design matrices with different widths.
X_train_scaled = sm.add_constant(X_train_scaled, has_constant="add")
X_test_scaled = sm.add_constant(X_test_scaled, has_constant="add")

# 5. Define and fit the GLM (Gaussian family, identity link).
# Identity is the Gaussian family's default link, so it is not passed
# explicitly; this also avoids the lowercase `links.identity` alias that
# recent statsmodels releases deprecate.
glm_model = sm.GLM(y_train, X_train_scaled, family=sm.families.Gaussian())
glm_results = glm_model.fit()

# 6. Prediction and evaluation on the held-out test split
y_pred = glm_results.predict(X_test_scaled)

# Model summary. Scaling returns bare arrays, so the feature names are passed
# back in explicitly; otherwise the coefficients are labelled x1..x8.
print(glm_results.summary(xname=["const", *X_train.columns]))

print("MSE:", mean_squared_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))



                 Generalized Linear Model Regression Results                  
Dep. Variable:                      y   No. Observations:                16512
Model:                            GLM   Df Residuals:                    16503
Model Family:                Gaussian   Df Model:                            8
Link Function:               identity   Scale:                         0.51822
Method:                          IRLS   Log-Likelihood:                -17998.
Date:                Fri, 01 Aug 2025   Deviance:                       8552.1
Time:                        17:53:16   Pearson chi2:                 8.55e+03
No. Iterations:                     3   Pseudo R-squ. (CS):             0.7941
Covariance Type:            nonrobust                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.0719      0.006    369.848      0.0