In [22]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.preprocessing import StandardScaler

In [37]:
df = pd.read_csv('housing.csv')
df.head()
display(df.shape)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


(20640, 10)

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [39]:
#retira todos os dados NA
df.dropna(inplace=True)

#para essa primeira regressão vamos deixar apenas as features númericas
df.drop('ocean_proximity', axis=1, inplace=True)

In [42]:
#posteriormente vamos deixar o target em pandas series, logo vamos tirar o index do df
df.reset_index(inplace=True)
df.drop(columns={'index'}, inplace=True)

## Regressão com Statsmodels

In [44]:
y = df['median_house_value']
X = df.drop('median_house_value', axis=1)

model = sm.OLS(y, X)
res = model.fit()
print(res.summary())


                                 OLS Regression Results                                
Dep. Variable:     median_house_value   R-squared (uncentered):                   0.900
Model:                            OLS   Adj. R-squared (uncentered):              0.900
Method:                 Least Squares   F-statistic:                          2.300e+04
Date:                Tue, 09 Aug 2022   Prob (F-statistic):                        0.00
Time:                        09:56:23   Log-Likelihood:                     -2.5833e+05
No. Observations:               20433   AIC:                                  5.167e+05
Df Residuals:                   20425   BIC:                                  5.167e+05
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [50]:
scaler = StandardScaler()
normalized_X = scaler.fit_transform(X)
normalized_X = pd.DataFrame(normalized_X, columns=X.columns)
normalized_X['intercept'] = 1

model = sm.OLS(y, normalized_X)
res = model.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.637
Model:                            OLS   Adj. R-squared:                  0.637
Method:                 Least Squares   F-statistic:                     4478.
Date:                Tue, 09 Aug 2022   Prob (F-statistic):               0.00
Time:                        10:02:12   Log-Likelihood:            -2.5682e+05
No. Observations:               20433   AIC:                         5.137e+05
Df Residuals:                   20424   BIC:                         5.137e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
longitude          -8.561e+04   1436

## Regressão com Scikit-learn

In [76]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=1)

model = LinearRegression()
model.fit(Xtrain, ytrain)

pred = model.predict(Xtest)

print(f'Predições: {model.coef_}')

print(f'\n MSE: {np.sqrt(mean_squared_error(ytest, pred))}')


LinearRegression(fit_intercept=False)

Predições: [-2.27250252e+03 -8.50385577e+03  1.79022001e+03 -1.48661494e+01
  7.70389006e+01 -3.95783751e+01  1.31123358e+02  4.54072583e+04]

 MSE: 74742.289948614


Agora vamos realizar a regressão normalizando os dados:

In [78]:
#normalizando os dados
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)

model = LinearRegression(fit_intercept=False)
model.fit(Xtrain_scaled, ytrain)

pred = model.predict(Xtest_scaled)

print(f'Predições: {model.coef_}')

print(f' MSE: {np.sqrt(mean_squared_error(ytest, pred))}')

LinearRegression(fit_intercept=False)

Predições: [-85397.44078889 -90605.02604213  14613.21444982 -16963.88280731
  46959.63767603 -42860.88059306  17598.48808835  75423.80895867]
 MSE: 217746.61332666338


Podemos ver que o erro médio melhorou e muito, tanto por ter normalizado os dados, assim como, pelo fato de ter retirado o 'intercept'

### Conclusões

1. Ao utilizar o Statsmodels, vemos que é possível ter uma análise bem mais detalhada sobre o modelo, mas não conseguimos fazer predições como no Scikit-Learn.
2. Colocar um intercept no statsmodel melhorou o R quadrado quando os dados são normalizados
3. Foi possível encontrar os coeficientes parecidos tanto no statsmodel como no scikit-learn;
4. E normalizar os dados no scikit-learn diminuiu o erro medio quadrado do modelo.



