In [3]:
%load_ext autoreload
%autoreload 2

import sys
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from IPython.display import display, HTML

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_rows',300)
pd.set_option('display.max_columns',50)
pd.set_option('display.max_colwidth',80)
pd.set_option('display.precision',4)
%matplotlib inline

## Linear Regression
Underlying model: $y(n)=C+\sum w_ix_i(n)+e(n)$, where
- $c$ is a constant, $w_i$ are coefficients for $x_i$, they are to be estimated
- $e(n)$ is noise components are $n$th instance
- N is total number of samples, K is model complexity
- minimize cost function $\min\sum_i (\hat y(n) - y(n))^2$, which is a Maximum Likelihood estimator under additive Gaussian white noise

In [4]:
# simulated data model
complexity, C, N = 3, 0.5, 15

w=np.random.randn(complexity)*2
x=np.random.randn(N, complexity)
y=x.dot(w)
sigma = 0.3
e=sigma*np.random.randn(*y.shape)
y=y+e+C
print(f"C={C}")
print(f"w={w}")
print(f"sigma={sigma}")

C=0.5
w=[-0.58054175  1.37679722  1.27579791]
sigma=0.3


## Linear regression or least squares estimators

In [6]:
# numpy least squares
from numpy import linalg
print("numpy.linalg.lstsq")
# X_stack = np.hstack([np.ones((x.shape[0], 1)), x])
X_stack = np.concatenate([np.ones((x.shape[0], 1)), x], axis=1)
model = linalg.lstsq(X_stack, y, rcond=None)
print(model[0], "\n")

# scipy optimize
from scipy.optimize import lsq_linear
print("scipy.optimize.lsq_linear")
model = lsq_linear(X_stack, y)
print(model.x, "\n")

# scikit learn linear regressor
print("scikit-learn LinearRegressor")
model = LinearRegression().fit(x, y)
model.intercept_
model.coef_
model.score(x, y)

# statsmodels OLS solver
print("statsmodels OLS solver")
X = sm.add_constant(x)
model = sm.OLS(y, X).fit()
model.params
summary=model.summary()
HTML(summary.tables[0].as_html())
HTML(summary.tables[1].as_html())

numpy.linalg.lstsq
[ 0.59756759 -0.42428839  1.3495217   1.26195729] 

scipy.optimize.lsq_linear
[ 0.59756759 -0.42428839  1.3495217   1.26195729] 

scikit-learn LinearRegressor


0.5975675885841535

array([-0.42428839,  1.3495217 ,  1.26195729])

0.9774602640345035

statsmodels OLS solver


array([ 0.59756759, -0.42428839,  1.3495217 ,  1.26195729])

  "anyway, n=%i" % int(n))


0,1,2,3
Dep. Variable:,y,R-squared:,0.977
Model:,OLS,Adj. R-squared:,0.971
Method:,Least Squares,F-statistic:,159.0
Date:,"Sun, 10 Nov 2019",Prob (F-statistic):,2.44e-09
Time:,23:19:08,Log-Likelihood:,0.75902
No. Observations:,15,AIC:,6.482
Df Residuals:,11,BIC:,9.314
Df Model:,3,,
Covariance Type:,nonrobust,,


0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,0.5976,0.075,7.945,0.000,0.432,0.763
x1,-0.4243,0.076,-5.618,0.000,-0.591,-0.258
x2,1.3495,0.073,18.558,0.000,1.189,1.510
x3,1.2620,0.076,16.582,0.000,1.094,1.429


## Understand OLS summary
### Goodness of fit $R^2$: 
The variance of dependent variable $y(n)$ (sum of squares total **SST**), can be partition into two terms: sum of squares regression **SSR** and sum of squares error **SSE**. Denote y estimates $\hat y(n)=w^tX(n)$, y mean $\bar y=\frac{1}{N}\sum_n^N y(n)$ <br>
$$\begin{align}
SST=&\sum_i^N (y-\bar y)^2 & \\
SSR=&\sum_i^N(w^TX-\bar y)^2 & \\
SSE=&\sum_{i}^{N}(y-\hat y)^2 & \\
\end{align}$$<br>
The coefficient of determination $R^2$, is defined as ration of SSR over SST. $R^2$ ranges [0, 1], 1 being a perfect fit. <br>
$$\begin{align}
R^2=\frac{SSR}{SST}=1-\frac{SSE}{SST}
\end{align}$$

### F-test
F-test evaluates the significance of the entire regression, where the null hypothesis is that all the regressors (coefficients) except yhe constant are 0, i.e., $w_i=0,\ \forall i$. <br>
F-statistics under the null hypothesis follows F-distribution with (K-1, N-k) degree of freedom
$$F=\frac{SSR/(K-1)}{SSE/(N-K)}\approx F(\frac{SSR}{SSE})_{K-1,N-K}$$
Rewrite F-statistic in terms of $R^2$, notice for fixed $R^2$, F-statistics decreases with increasing number of predictor $K$.
$$F=\frac{(N-K)R^2}{(K-1)(1-R^2)}$$

### Invidual test statistics (t-test)
Determine whether a weight/regressor is statistically significant. Each coefficient estimate, follows t-distribution.  
The t-statistics for kth regression coefficient under $H_0$ that $x_k$ and $y$ are independent follow t-distribution with n-K degrees of freedom.