# Linear regression using the diabetes dataset

In [2]:
# install required packages; run once, then comment out again
# (the PyPI package is named scikit-learn, not sklearn)
# %pip install scikit-learn
# %pip install statsmodels

In [3]:
# import related modules
import pandas as pd
import numpy as np
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

ModuleNotFoundError: No module named 'statsmodels'

## Method 1: use a statistics package

In [None]:
# Load the diabetes dataset and show a compact overview of its contents.
# Displaying the whole object would dump every array into the notebook
# output, so only the feature names and array shapes are printed.
diabetes = datasets.load_diabetes()
print("features:", diabetes.feature_names)
print("data shape:", diabetes.data.shape, "| target shape:", diabetes.target.shape)

In [22]:
# Load the data: X holds the predictors and y the response (both ndarrays).
diabetes = datasets.load_diabetes()
X = diabetes.data
y = diabetes.target

# Sanity check: predictors and response must describe the same samples.
assert X.shape[0] == y.shape[0], "X and y must have the same number of rows"
print(type(X), X.shape)
print(type(y), y.shape)

# Add a column of ones to X so the model can estimate the intercept beta_0.
# The OLS model itself is fitted and summarized in the following cell.
X2 = sm.add_constant(X)

<class 'numpy.ndarray'> (442, 10)
<class 'numpy.ndarray'> (442,)


In [35]:
# Ordinary least squares: regress the response y on the design matrix X2
# (predictors plus the constant column), then print the regression table.
ols = sm.OLS(endog=y, exog=X2)
est = ols.fit()  # fitted results object
print(est.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.518
Model:                            OLS   Adj. R-squared:                  0.507
Method:                 Least Squares   F-statistic:                     46.27
Date:                Wed, 31 Aug 2022   Prob (F-statistic):           3.83e-62
Time:                        22:53:12   Log-Likelihood:                -2386.0
No. Observations:                 442   AIC:                             4794.
Df Residuals:                     431   BIC:                             4839.
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        152.1335      2.576     59.061      0.0

## Method 2: break the computation down into pieces

### Linear Algebra: coefficients and prediction

In [45]:
# Fit the same model with scikit-learn's LinearRegression.
# fit() returns the estimator, so construction and fitting are chained.
lm = LinearRegression().fit(X, y)

# Stack the intercept in front of the slope coefficients so the vector
# reads beta_0, beta_1, ..., beta_p.
params = np.concatenate(([lm.intercept_], lm.coef_))
predictions = lm.predict(X)

print(params)
print(predictions[:10])

[ 152.13348416  -10.0098663  -239.81564367  519.84592005  324.3846455
 -792.17563855  476.73902101  101.04326794  177.06323767  751.27369956
   67.62669218]
[206.11667725  68.07103297 176.88279035 166.91445843 128.46225834
 106.35191443  73.89134662 118.85423042 158.80889721 213.58462442]


In [48]:
# Solve least squares by hand with the normal equations:
#   beta = (A^T A)^{-1} A^T b
# where A is the design matrix (a column of ones followed by X) and b is y.
newX = np.hstack((np.ones((len(X), 1)), X))
ATA_inv = np.linalg.inv(newX.T.dot(newX))
ATb = newX.T.dot(y)
params = ATA_inv.dot(ATb)

# Fitted values for every observation: A @ beta.
predictions = newX.dot(params)

print(params)
print(predictions[:10])

[ 152.13348416  -10.0098663  -239.81564367  519.84592005  324.3846455
 -792.17563855  476.73902101  101.04326794  177.06323767  751.27369956
   67.62669218]
[206.11667725  68.07103297 176.88279035 166.91445843 128.46225834
 106.35191443  73.89134662 118.85423042 158.80889721 213.58462442]


### Statistics: standard errors, t-statistics, p-values

In [50]:
# Inference for the fitted coefficients: standard errors, t-statistics and
# two-sided p-values, collected in a summary table.

# Residual degrees of freedom: number of rows minus number of columns of
# the design matrix (observations minus estimated parameters).
dof = newX.shape[0] - newX.shape[1]

# Residual sum of squares divided by the residual degrees of freedom.
MSE = np.sum((y - predictions) ** 2) / dof

# Coefficient variances: MSE times the diagonal of (A^T A)^{-1}.
var_b = MSE * np.linalg.inv(np.dot(newX.T, newX)).diagonal()
sd_b = np.sqrt(var_b)
ts_b = params / sd_b

# Two-sided p-values from the t distribution. The survival function is used
# instead of 1 - cdf, which loses precision once cdf is very close to 1.
p_values = 2 * stats.t.sf(np.abs(ts_b), dof)

# Round only for display. The full-precision arrays are left untouched, so
# re-running this cell always starts from the same inputs.
myDF3 = pd.DataFrame({
    "Coefficients": np.round(params, 4),
    "Standard Errors": np.round(sd_b, 3),
    "t values": np.round(ts_b, 3),
    "Probabilities": np.round(p_values, 3),
})
print(myDF3)


    Coefficients  Standard Errors  t values  Probabilities
0       152.1335            2.576    59.061          0.000
1       -10.0099           59.749    -0.168          0.867
2      -239.8156           61.222    -3.917          0.000
3       519.8459           66.533     7.813          0.000
4       324.3846           65.422     4.958          0.000
5      -792.1756          416.680    -1.901          0.058
6       476.7390          339.030     1.406          0.160
7       101.0433          212.531     0.475          0.635
8       177.0632          161.476     1.097          0.273
9       751.2737          171.900     4.370          0.000
10       67.6267           65.984     1.025          0.306
