In [1]:
#Import statsmodel, pandas, and numpy
import numpy as np
from sklearn.datasets import load_boston
import pandas as pd
from pandas import DataFrame
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression as lm

In [2]:
#Import Boston datasets from scikit-learn datasets and load it
from sklearn.datasets import load_boston
boston = load_boston()

In [3]:
#Describe the dataset
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
#Define the data/predictors as the pre-set feature names
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [5]:
#Put the target (housing value -- MEDV) in another DataFrame
#Print first five rows
df2 = pd.DataFrame(boston.target,columns=['MEDV'])
df2.head()

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [6]:
#Define 'RM' as X or the feature and 'MEDV' as y or the label
x = df['RM']
y = df2['MEDV']

In [7]:
#Build the model and define 'predictions'
lr=sm.OLS(y,x).fit()
p=lr.predict(x)
print('The predicted values are\n',p)

The predicted values are
 0      24.020779
1      23.458163
2      26.249323
3      25.566146
4      26.110495
         ...    
501    24.086539
502    22.358504
503    25.485772
504    24.820863
505    22.029703
Length: 506, dtype: float64


In [8]:
#Print out the statistics
lr.summary()

0,1,2,3
Dep. Variable:,MEDV,R-squared (uncentered):,0.901
Model:,OLS,Adj. R-squared (uncentered):,0.901
Method:,Least Squares,F-statistic:,4615.0
Date:,"Thu, 06 Oct 2022",Prob (F-statistic):,3.7399999999999996e-256
Time:,01:14:00,Log-Likelihood:,-1747.1
No. Observations:,506,AIC:,3496.0
Df Residuals:,505,BIC:,3500.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
RM,3.6534,0.054,67.930,0.000,3.548,3.759

0,1,2,3
Omnibus:,83.295,Durbin-Watson:,0.493
Prob(Omnibus):,0.0,Jarque-Bera (JB):,152.507
Skew:,0.955,Prob(JB):,7.649999999999999e-34
Kurtosis:,4.894,Cond. No.,1.0


In [9]:
#Use 'RM' and 'LSTAT' as features and keep 'MEDV' as the target - Define the model, define 'predictions', and print the summary -What does a coefficient of determination of R2= .948 mean?
x1 = df[['RM','LSTAT']]
y1 = df2['MEDV']

lr1=sm.OLS(y1,x1).fit()
p1=lr1.predict(x1)
print('The predicted values are\n',p1)

print(lr1.summary())
print(lr1.params)

The predicted values are
 0      28.997322
1      25.513780
2      32.613488
3      32.410653
4      31.574564
         ...    
501    26.010226
502    24.076146
503    30.532203
504    29.088325
505    24.421412
Length: 506, dtype: float64
                                 OLS Regression Results                                
Dep. Variable:                   MEDV   R-squared (uncentered):                   0.948
Model:                            OLS   Adj. R-squared (uncentered):              0.948
Method:                 Least Squares   F-statistic:                              4637.
Date:                Thu, 06 Oct 2022   Prob (F-statistic):                        0.00
Time:                        01:14:00   Log-Likelihood:                         -1582.9
No. Observations:                 506   AIC:                                      3170.
Df Residuals:                     504   BIC:                                      3178.
Df Model:                           2                  

The coefficient of determination is a number between 0 and 1 that measures how well a statistical model predicts an outcome.
In this instance, it means that variations in the independent variables "RM" and "LSTAT" may account for variations in the MEDV variable 94.8% of the time.

Is there any autocorrelation? If not, why?

-> As the value of Durbin-Watson (0.834) is leass than 2, there is a positive auto correlation.

Are the data coming from a normal distribution? Why?

-> Because the Prob(JB) value is 9.70e-97, which is nearly zero but not quite, this data does not come from a normal distribution. Additionally, the data is only regularly distributed if the value of rob(JB) is equal to 0. Since the data has a Skew value larger than 0.5, we can see that it is skewed to the right.

In [10]:
#Define the data/predictors as 'RM' and 'LSTAT'
df3 = df[['RM','LSTAT']]

In [11]:
#Put the target (housing value -- MEDV) in another DataFrame
df4=pd.DataFrame(boston.target,columns=['MEDV'])

In [12]:
#Define the linear model as lm and fit
reg = lm().fit(df3, df4)

In [13]:
#Print the predictions
reg.predict(df3)

array([[28.94101368],
       [25.48420566],
       [32.65907477],
       [32.40652   ],
       [31.63040699],
       [28.05452701],
       [21.28707846],
       [17.78559653],
       [ 8.10469338],
       [18.24650673],
       [17.99496223],
       [20.73221309],
       [18.5534842 ],
       [23.64474107],
       [23.10895823],
       [22.9239452 ],
       [24.65257604],
       [19.73611045],
       [18.9297215 ],
       [20.57377596],
       [13.51732408],
       [20.14832175],
       [17.90896697],
       [15.48764606],
       [18.35281036],
       [16.56210901],
       [18.74440281],
       [18.34995811],
       [23.51018847],
       [24.94888935],
       [13.23095259],
       [21.20092715],
       [11.15596625],
       [15.89983805],
       [16.63398622],
       [22.65107562],
       [21.07107521],
       [22.81275431],
       [22.53014238],
       [29.46686594],
       [33.15564849],
       [30.0244275 ],
       [26.33937234],
       [25.50630935],
       [23.42747337],
       [21

In [14]:
#Print the coefficient of determination
reg.score(df3,df4)

0.6385616062603404

In [15]:
#Print intercept and coefficients
print("Intercept: ",reg.intercept_)
print("Coefficients: ",reg.coef_)

Intercept:  [-1.35827281]
Coefficients:  [[ 5.09478798 -0.64235833]]
