In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn import datasets

In [11]:
# Load the Boston house-prices dataset bundled with scikit-learn.
# NOTE(review): load_boston has been removed from newer scikit-learn releases —
# confirm the environment pins a version that still ships it.
data = datasets.load_boston()

# Show, one after another: the dataset description, the feature (column)
# names, and the target values.
print(data.DESCR, data.feature_names, data.target, sep="\n")

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [12]:
# Independent variables (features): one column per name in data.feature_names
df = pd.DataFrame(data=data.data, columns=data.feature_names)
print(df)

         CRIM    ZN  INDUS  CHAS    NOX     RM    AGE     DIS   RAD    TAX  \
0     0.00632  18.0   2.31   0.0  0.538  6.575   65.2  4.0900   1.0  296.0   
1     0.02731   0.0   7.07   0.0  0.469  6.421   78.9  4.9671   2.0  242.0   
2     0.02729   0.0   7.07   0.0  0.469  7.185   61.1  4.9671   2.0  242.0   
3     0.03237   0.0   2.18   0.0  0.458  6.998   45.8  6.0622   3.0  222.0   
4     0.06905   0.0   2.18   0.0  0.458  7.147   54.2  6.0622   3.0  222.0   
5     0.02985   0.0   2.18   0.0  0.458  6.430   58.7  6.0622   3.0  222.0   
6     0.08829  12.5   7.87   0.0  0.524  6.012   66.6  5.5605   5.0  311.0   
7     0.14455  12.5   7.87   0.0  0.524  6.172   96.1  5.9505   5.0  311.0   
8     0.21124  12.5   7.87   0.0  0.524  5.631  100.0  6.0821   5.0  311.0   
9     0.17004  12.5   7.87   0.0  0.524  6.004   85.9  6.5921   5.0  311.0   
10    0.22489  12.5   7.87   0.0  0.524  6.377   94.3  6.3467   5.0  311.0   
11    0.11747  12.5   7.87   0.0  0.524  6.009   82.9  6.2267   

In [13]:
# Dependent variable: the MEDV column that the regression models below predict
target = pd.DataFrame({"MEDV": data.target})
print(target)

     MEDV
0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
5    28.7
6    22.9
7    27.1
8    16.5
9    18.9
10   15.0
11   18.9
12   21.7
13   20.4
14   18.2
15   19.9
16   23.1
17   17.5
18   20.2
19   18.2
20   13.6
21   19.6
22   15.2
23   14.5
24   15.6
25   13.9
26   16.6
27   14.8
28   18.4
29   21.0
..    ...
476  16.7
477  12.0
478  14.6
479  21.4
480  23.0
481  23.7
482  25.0
483  21.8
484  20.6
485  21.2
486  19.1
487  20.6
488  15.2
489   7.0
490   8.1
491  13.6
492  20.1
493  21.8
494  24.5
495  23.1
496  19.7
497  18.3
498  21.2
499  17.5
500  16.8
501  22.4
502  20.6
503  23.9
504  22.0
505  11.9

[506 rows x 1 columns]


# Single variable regression

In [17]:
# Regress MEDV on RM alone, without a constant (intercept) term
y = target["MEDV"]
X = df["RM"]

# Fit by ordinary least squares, then compute the fitted values for the same X
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)

# Display the regression summary table
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared (uncentered):,0.901
Model:,OLS,Adj. R-squared (uncentered):,0.901
Method:,Least Squares,F-statistic:,4615.0
Date:,"Fri, 04 Oct 2019",Prob (F-statistic):,3.7399999999999996e-256
Time:,19:57:16,Log-Likelihood:,-1747.1
No. Observations:,506,AIC:,3496.0
Df Residuals:,505,BIC:,3500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,3.6534,0.054,67.930,0.000,3.548,3.759

0,1,2,3
Omnibus:,83.295,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,152.507
Skew:,0.955,Prob(JB):,7.649999999999999e-34
Kurtosis:,4.894,Cond. No.,1.0


In [19]:
# Regress MEDV on RM again, this time with a constant (intercept) term
y = target["MEDV"]
X = sm.add_constant(df["RM"])  # adds a "const" column so OLS also estimates a y-intercept

# Fit by ordinary least squares, then compute the fitted values for the same X
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)

# Display the regression summary table
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared:,0.484
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,471.8
Date:,"Fri, 04 Oct 2019",Prob (F-statistic):,2.49e-74
Time:,19:58:22,Log-Likelihood:,-1673.1
No. Observations:,506,AIC:,3350.0
Df Residuals:,504,BIC:,3359.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-34.6706,2.650,-13.084,0.000,-39.877,-29.465
RM,9.1021,0.419,21.722,0.000,8.279,9.925

0,1,2,3
Omnibus:,102.585,Durbin-Watson:,0.684
Prob(Omnibus):,0.0,Jarque-Bera (JB):,612.449
Skew:,0.726,Prob(JB):,1.02e-133
Kurtosis:,8.19,Cond. No.,58.4


# Multivariable regression

In [20]:
# Regress MEDV on two predictors at once: RM and LSTAT.
# NOTE(review): no constant is added here, unlike the preceding single-variable
# cell, so this fit has no intercept — confirm that is intended.
y = target["MEDV"]
X = df.loc[:, ["RM", "LSTAT"]]

# Fit by ordinary least squares, then compute the fitted values for the same X
model = sm.OLS(endog=y, exog=X).fit()
predictions = model.predict(X)

# Display the regression summary table
model.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared (uncentered):,0.948
Model:,OLS,Adj. R-squared (uncentered):,0.948
Method:,Least Squares,F-statistic:,4637.0
Date:,"Fri, 04 Oct 2019",Prob (F-statistic):,0.0
Time:,19:58:40,Log-Likelihood:,-1582.9
No. Observations:,506,AIC:,3170.0
Df Residuals:,504,BIC:,3178.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,4.9069,0.070,69.906,0.000,4.769,5.045
LSTAT,-0.6557,0.031,-21.458,0.000,-0.716,-0.596

0,1,2,3
Omnibus:,145.153,Durbin-Watson:,0.834
Prob(Omnibus):,0.0,Jarque-Bera (JB):,442.157
Skew:,1.351,Prob(JB):,9.7e-97
Kurtosis:,6.698,Cond. No.,4.72


In [21]:
# Print the fitted MEDV values held in `predictions` (assigned by the most
# recently run regression cell); pandas abbreviates the middle rows.
print(predictions)

0      28.997322
1      25.513780
2      32.613488
3      32.410653
4      31.574564
5      28.135001
6      21.349471
7      17.728003
8       8.004490
9      18.247910
10     17.881457
11     20.783929
12     18.595095
13     23.774772
14     23.184607
15     23.072772
16     24.807718
17     19.772662
18     19.106479
19     20.705104
20     13.547812
21     20.200811
22     17.862764
23     15.487734
24     18.379950
25     16.647500
26     18.812336
27     18.340874
28     23.476883
29     24.892926
         ...    
476    19.567156
477     9.691747
478    18.526222
479    21.968367
480    23.586260
481    28.046188
482    30.050926
483    21.440782
484    20.060874
485    24.034662
486    20.177839
487    21.467057
488    14.919601
489    10.847902
490     5.528510
491    17.508797
492    20.603890
493    20.128276
494    20.166819
495    16.281134
496    12.585880
497    19.184680
498    21.062507
499    17.424886
500    20.177169
501    26.010226
502    24.076146
503    30.5322