In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl 
import math
import scipy
from scipy.stats.stats import pearsonr

import sys
sys.path.append("../")
from credit_tools.tools import *

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [12]:
from sklearn.datasets import load_boston
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target
dataset

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [13]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [17]:
y = dataset['target']
X = dataset['RM']
X = sm.add_constant(X)
X.head()

Unnamed: 0,const,RM
0,1.0,6.575
1,1.0,6.421
2,1.0,7.185
3,1.0,6.998
4,1.0,7.147


In [18]:
linear_regression = sm.OLS(y, X)
linear_regression

<statsmodels.regression.linear_model.OLS at 0x1242346d0>

In [19]:
fitted_model = linear_regression.fit()
fitted_model

<statsmodels.regression.linear_model.RegressionResultsWrapper at 0x1242854d0>

In [20]:
fitted_model.summary()

0,1,2,3
Dep. Variable:,target,R-squared:,0.484
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,471.8
Date:,"Thu, 21 May 2020",Prob (F-statistic):,2.49e-74
Time:,07:25:25,Log-Likelihood:,-1673.1
No. Observations:,506,AIC:,3350.0
Df Residuals:,504,BIC:,3359.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-34.6706,2.650,-13.084,0.000,-39.877,-29.465
RM,9.1021,0.419,21.722,0.000,8.279,9.925

0,1,2,3
Omnibus:,102.585,Durbin-Watson:,0.684
Prob(Omnibus):,0.0,Jarque-Bera (JB):,612.449
Skew:,0.726,Prob(JB):,1.02e-133
Kurtosis:,8.19,Cond. No.,58.4


In [22]:
fitted_model.params

const   -34.670621
RM        9.102109
dtype: float64

In [24]:
betas = np.array(fitted_model.params)
betas

array([-34.67062078,   9.10210898])

In [26]:
fitted_values = fitted_model.predict(X)
fitted_values

0      25.175746
1      23.774021
2      30.728032
3      29.025938
4      30.382152
         ...    
501    25.339584
502    21.034286
503    28.825691
504    27.169108
505    20.215096
Length: 506, dtype: float64

In [29]:
mean_sum_squared_errors = np.sum((dataset['target'] - dataset['target'].mean())**2)
regr_sum_squared_errors = np.sum((dataset['target'] - fitted_values)**2)
(mean_sum_squared_errors - regr_sum_squared_errors) / mean_sum_squared_errors

0.4835254559913342

In [31]:
(pearsonr(dataset['RM'], dataset['target'])[0])**2

0.483525455991334

In [32]:
9.1021 * 4.55 - 34.6706

6.743955

In [34]:
9.1021 * 5.55 - 34.6706

15.846055

In [35]:
(np.min(dataset['RM']), np.max(dataset['RM']))

(3.561, 8.78)

In [55]:
residuals = dataset['target'] - fitted_values
normalized_residuals = standardize(residuals)
normalized_residuals

0     -1.175746
1     -2.174021
2      3.971968
3      4.374062
4      5.817848
         ...   
501   -2.939584
502   -0.434286
503   -4.925691
504   -5.169108
505   -8.315096
Length: 506, dtype: float64