# Conducting linear regression in Python

In [1]:
import numpy as np
import scipy.stats as stats


from matplotlib import rcParams

In [1]:
import sklearn

# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, where the
# import below raises ImportError. In that case, rebuild an equivalent
# Bunch-style object from the original StatLib file (the replacement recipe given
# in scikit-learn's deprecation notice) so that `boston.data`,
# `boston['feature_names']`, `boston['target']` and `boston['DESCR']` keep working.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    import pandas as pd
    from sklearn.utils import Bunch

    BOSTON_URL = "http://lib.stat.cmu.edu/datasets/boston"
    # After a 22-line header, every record spans two physical lines:
    # 11 values on the first, then B, LSTAT and MEDV on the second.
    raw_df = pd.read_csv(BOSTON_URL, sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
        DESCR="Boston house prices dataset, loaded from " + BOSTON_URL,
    )

## Data Set Characteristics
(Taken from boston['DESCR'])
- CRIM:     per capita crime rate by town        
- ZN:       proportion of residential land zoned for lots over 25,000 sq.ft.        
- INDUS:    proportion of non-retail business acres per town        
- CHAS:     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)        
- NOX:      nitric oxides concentration (parts per 10 million)        
- RM:       average number of rooms per dwelling        
- AGE:      proportion of owner-occupied units built prior to 1940        
- DIS:      weighted distances to five Boston employment centres       
- RAD:      index of accessibility to radial highways        
- TAX:      full-value property-tax rate per \$10,000
- PTRATIO:  pupil-teacher ratio by town       
- B:        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
- LSTAT:    % lower status of the population        
- MEDV:     Median value of owner-occupied homes in \$1000's

In [12]:
import pandas as pd

# Wrap the raw feature matrix in a DataFrame. No column labels are supplied
# here, so the preview shows the default integer column names.
boston_df = pd.DataFrame(data=boston.data)
boston_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [13]:
# Build the labelled frame directly from the raw arrays instead of renaming
# `boston_df` in place. Once MEDV has been appended the frame has 14 columns, so
# re-running an in-place assignment of the 13 feature names would fail with a
# length mismatch; rebuilding makes this cell safe to run any number of times.
boston_df = pd.DataFrame(boston.data, columns=boston['feature_names'])
# Append the regression target (median home value) as the last column.
boston_df['MEDV'] = boston['target']
boston_df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


## Conducting the regression

In [18]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
# The formula interface (smf.ols) adds an intercept term automatically, as the
# "Intercept" row of the summary output shows. The caveat about statsmodels not
# including an intercept applies to the array-based sm.OLS interface, where a
# constant column must be added explicitly (statsmodels provides sm.add_constant).

# Regress median home value (MEDV) on per-capita crime rate (CRIM) and the
# Charles River dummy variable (CHAS).
model = smf.ols(formula='MEDV ~ CRIM + CHAS', data=boston_df)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                   MEDV   R-squared:                       0.174
Model:                            OLS   Adj. R-squared:                  0.171
Method:                 Least Squares   F-statistic:                     53.14
Date:                Mon, 18 Nov 2019   Prob (F-statistic):           1.16e-21
Time:                        10:35:46   Log-Likelihood:                -1791.7
No. Observations:                 506   AIC:                             3589.
Df Residuals:                     503   BIC:                             3602.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     23.6140      0.419     56.409      0.0

# Recreating R's OLS diagnostic plots
1. Residuals vs. Fitted
2. Normal QQ plot
3. Scale - Location Plot
4. Residuals vs. Leverage

### 1. Residuals vs. Fitted

In [19]:
# Residuals of the fitted OLS model, kept for the diagnostic plots in this section.
residuals = results.resid



<bound method RegressionModel.predict of <statsmodels.regression.linear_model.OLS object at 0x000001F6E229CD88>>

In [None]:

import matplotlib.pyplot as plt