# Linear Regression

In [1]:
from plotly.graph_objects import Figure, Scatter, Image, Heatmap, Surface
from plotly.express       import imshow
from plotly.subplots      import make_subplots
from math                 import sqrt, pi, log, exp
from numpy                import array, linspace, logspace, arange, eye, zeros, trace, zeros_like
import numpy              as np
from numpy.random         import random, randn, seed
from numpy.linalg         import inv, solve, det
from scipy.stats          import multivariate_normal
from scipy.optimize       import minimize
rand = random

## Model specification

In [2]:
# define the features for polynomials
# `x` must be a vector
def phi(x, d):
    return x.reshape(x.shape[0],1) ** arange(d).reshape(1,d)
phi(randn(5), 3)       # test

array([[ 1.        ,  2.21220551,  4.8938532 ],
       [ 1.        , -0.49828706,  0.24829   ],
       [ 1.        , -0.51575872,  0.26600706],
       [ 1.        , -0.5306504 ,  0.28158985],
       [ 1.        ,  2.10023289,  4.41097818]])

In [3]:
def f(x, w):
    d = w.shape[0]
    return phi(x, d) @ w
f(randn(5), randn(3))

array([-0.19910085, -0.22345963, -0.20407738, -0.40085203, -0.59566625])

In [4]:
def sample(w_true, n, noise_std=0.05):
    x = rand(n)                   # uniformly distributed locations
    y = f(x, w_true) + noise_std*randn(n)  # noisy measurements
    return x, y
sample(array([0.1, 0.2]), 5)

(array([0.84814429, 0.6362167 , 0.80912203, 0.86446342, 0.3028006 ]),
 array([0.3102733 , 0.17797421, 0.27526456, 0.32244527, 0.21160289]))

In [5]:
def linear_regression(x, y, d):
    X = phi(x, d)
    # maximum likelihood estimator
    #w_ml = inv(X.T @ X) @ X.T @ y
    # x = inv(A)@y
    # x = solve(A, y)
    w_ml = solve(X.T @ X, X.T @ y) # same as previous line
    return w_ml

In [6]:
# implement mean-squared error
def mse(x, y, w):
    return ((y-f(x,w))**2).mean()

## Toy example

In [28]:
# create some toy data
seed(100)     # seed(100) fits the story!
n_train = 20
sigma   = 0.2
w_true  = array([0.1, -2.5, 3.0])
print(f"w_true = {w_true}")
x_train, y_train = sample(w_true, n_train, sigma)
n_pred  = 50
x_pred  = linspace(0.0, 1.0, n_pred)
# show the results for different `d`
d_candidates = [2, 3, 4, 10]
fig = Figure()
fig.add_trace(Scatter(x=x_train, y=y_train, mode='markers', name='train'))
fig.add_trace(Scatter(x=x_pred, y=f(x_pred, w_true), mode='lines', name='true', line=dict(dash='dot')))
for d in d_candidates:
    w_ml    = linear_regression(x_train, y_train, d)
    y_pred  = f(x_pred, w_ml)
    fig.add_trace(Scatter(x=x_pred,  y=y_pred,  mode='lines', name=f'd={d}'))
    fig.update_layout(title='Linear regression', xaxis_title='x', yaxis_title='y')
fig.show()

w_true = [ 0.1 -2.5  3. ]


In [8]:
# 1st attempt: evaluating on the training to choose d
for d in range(1,20):
    w = linear_regression(x_train, y_train, d)
    print(f"d={d} MSE={mse(x_train, y_train, w)}")

d=1 MSE=0.09742401962079222
d=2 MSE=0.06118542380520238
d=3 MSE=0.028473223805358373
d=4 MSE=0.02834045503079169
d=5 MSE=0.02756703235622109
d=6 MSE=0.027524245255540213
d=7 MSE=0.02671174573186778
d=8 MSE=0.026706917870023966
d=9 MSE=0.023841757831569683
d=10 MSE=0.02290630415590059
d=11 MSE=0.02241895121941217
d=12 MSE=0.016550811979378813
d=13 MSE=0.016442908745360023
d=14 MSE=0.015897715074530454
d=15 MSE=0.01504846190006274
d=16 MSE=0.023460907040262123
d=17 MSE=0.014366628885646299
d=18 MSE=0.014426536181103861
d=19 MSE=0.01449900950645001


In [9]:
# 2nd attempt: evaluate on a evaluation set to choose d
n_eval = 100
x_eval, y_eval = sample(w_true, n_eval, sigma)
for d in range(1,10):
    w = linear_regression(x_train, y_train, d)
    print(f"d={d} MSE={mse(x_eval, y_eval, w)}")
d_best = 3   # chosen by looking at the numbers, for seed(100)

d=1 MSE=0.13999274282384647
d=2 MSE=0.10338869813720887
d=3 MSE=0.045577180338687205
d=4 MSE=0.04578727655757514
d=5 MSE=0.04588901870295339
d=6 MSE=0.04639759067573342
d=7 MSE=0.051333321451431664
d=8 MSE=0.051277214493647465
d=9 MSE=0.06965181778966137


In [10]:
# 3rd attempt: calculate the test error for the best d
n_test = 100
x_test, y_test = sample(w_true, n_test, sigma)
w = linear_regression(x_train, y_train, d_best)
print(f"d={d_best} MSE={mse(x_test, y_test, w)}")

d=3 MSE=0.05191382568119687


Observe that it is slightly larger than the evaluation error for $d=3$.

# Bayesian linear regression

In [11]:
# create some toy data
seed(0)          # set the seed for numpy
d = 2
n_train = 100
n_pred  = 50
sigma2  = 0.04
Sigma   = sigma2 * eye(n_train)   # measurement noise
w_true  = 2.0 * randn(d)            # true weights
x_train, y_train = sample(w_true, n_train, sqrt(sigma2))
x_pred  = linspace(0.0, 1.0, n_pred)

In [12]:
y_train.shape

(100,)

In [13]:
# prior for `w`
d  = 2
w0 = zeros(d)
tau2 = 1.0    # variance
V0 = tau2 * eye(d)

In [14]:
# posterior for `w` after seeing data points `x`, `y`
# `x` and `y` must have shape (n,)
def posterior_w(x, y, w0, V0, Sigma):
    d = w0.shape[0]
    X = phi(x, d)
    invV0    = inv(V0)
    invSigma = inv(Sigma)
    Vn = inv(X.T @ invSigma @ X + invV0)
    wn = Vn @ (invV0 @ w0 + X.T @ invSigma @ y)
    return wn, Vn
posterior_w(x_train, y_train, w0, V0, Sigma)

(array([3.56464937, 0.79513394]),
 array([[ 0.00145199, -0.00224126],
        [-0.00224126,  0.00477428]]))

## Implement negative log marginal likelihood
Why the negative one?  Because we would like to minimize it.

In [15]:
def neg_log_marginal_likelihood(y, x, d, theta):
    sigma2, tau2 = theta[0], theta[1]     # unpack
    n = y.shape[0]
    X = phi(x, d)
    A = sigma2*eye(n) + tau2* X @ X.T
    #return -0.5* y @ inv(A) @ y - 0.5*log(det(A)) - 0.5*n*log(2*pi)
    lml = -0.5* y @ solve(A, y) - 0.5*log(det(A)) - 0.5*n*log(2*pi)
    return -lml
nlml = lambda theta: neg_log_marginal_likelihood(y_train, x_train, d, theta)

In [16]:
# calculate the negative log marginal likelihood on a grid
d = 2
m = 20
sigma2s = logspace(-3, 2, m)
tau2s   = logspace(-3, 2, m)
nlml_values = array([[nlml(array([sigma2, tau2])) for sigma2 in sigma2s] for tau2 in tau2s])
fig = Figure(data=[Surface(z=nlml_values)])
fig.show()

In [17]:
imshow(np.log(10+nlml_values), 
       labels={'x':'index in sigma2s', 'y':'index in tau2s'})

In [18]:
sigma2s[6], tau2s[15], nlml(array([sigma2s[6],tau2s[15]]))

(0.0379269019073225, 8.858667904100823, -9.593776635421193)

## Implement derivative of log marginal likelihood

In [19]:
def grad_neg_log_marginal_likelihood(y, x, d, theta):
    sigma2, tau2 = theta[0], theta[1]     # unpack
    n = y.shape[0]
    y = y.reshape(n, 1)
    X = phi(x, d)
    A = sigma2*eye(n) + tau2* X @ X.T
    B = inv(A)
    BYYB  = -0.5 * (B @ y @ y.T @ B - B)
    grad0 = trace(BYYB)
    grad1 = trace(BYYB @ X @ X.T)
    return array([grad0, grad1])

In [20]:
def finite_diff(f, x, delta):
    """estimate the gradient by finite-differencing method"""
    grad_f, dx = zeros_like(x), zeros_like(x)
    for i in range(x.size):
        dx.flat[i]     = delta
        grad_f.flat[i] = f(x+dx) - f(x-dx)
        dx.flat[i]     = 0.0
    return grad_f / (2*delta)

In [21]:
grad_nlml = lambda theta: grad_neg_log_marginal_likelihood(y_train, x_train, d, theta)
theta = rand(2)
print(finite_diff(nlml, theta, 1e-5))
print(grad_nlml(theta))

[116.41782274 -28.28340633]
[116.41782272 -28.28340632]


## Implement gradient descent

In [22]:
def gradient_descent(f, grad_f, x0, learning_rate=0.1, maxiter=100):
    x  = x0
    for iter in range(maxiter):
        x = x - learning_rate * grad_f(x)
        print(f(x), x)
    return x

In [23]:
# test gradient_descent
g = lambda x: x**2
grad_g = lambda x: 2*x
gradient_descent(g, grad_g, 20.0, 0.1, 10)

256.0 16.0
163.84000000000003 12.8
104.8576 10.24
67.108864 8.192
42.94967296 6.5536
27.487790694400005 5.24288
17.592186044416007 4.194304000000001
11.258999068426245 3.3554432000000007
7.205759403792796 2.6843545600000005
4.61168601842739 2.1474836480000006


2.1474836480000006

## Apply gradient descent to marginal likelihood

In [24]:
d = 3
neg_log_marginal_likelihood(y_train, x_train, d, array([0.037, 8.8]))

-8.03151006246135

In [25]:
theta0 = array([0.1, 0.1])
# note we are using different learning rates for each parameter
theta1 = gradient_descent(nlml, grad_nlml, theta0, learning_rate=array([0.00001, 0.01]), maxiter = 1000)

5.930176135526821 [0.09721068 6.45512564]
5.072136493457322 [0.09428772 6.45448959]
4.200099748993665 [0.09134033 6.45385267]
3.315641897971986 [0.08837116 6.45321487]
2.420905532917075 [0.08538368 6.45257618]
1.5187300612864902 [0.0823824  6.45193659]
0.6128008333363226 [0.07937311 6.4512961 ]
-0.29218707540320565 [0.07636316 6.45065469]
-1.1903607996141403 [0.07336179 6.45001237]
-2.0745174745129162 [0.07038048 6.44936913]
-2.9360224195243774 [0.06743337 6.44872498]
-3.7648114255738676 [0.06453758 6.44807992]
-4.549572480584089 [0.0617135  6.44743398]
-5.278198937042205 [0.05898491 6.44678717]
-5.9386022587463 [0.05637863 6.44613953]
-6.519923646027095 [0.05392373 6.44549108]
-7.01406494435723 [0.05164991 6.44484189]
-7.417269538036123 [0.04958509 6.44419202]
-7.731285704681753 [0.04775224 6.44354153]
-7.963580467281886 [0.04616602 6.4428905 ]
-8.126293899585377 [0.04483006 6.44223902]
-8.234126652856418 [0.04373584 6.44158718]
-8.301862449660902 [0.04286374 6.44093506]
-8.3423671673

-8.400132819411823 [0.04029768 6.3212585 ]
-8.400172925856168 [0.04029767 6.32062518]
-8.400213018286564 [0.04029767 6.31999197]
-8.400253096719169 [0.04029767 6.31935886]
-8.400293161138606 [0.04029767 6.31872587]
-8.40033321153966 [0.04029767 6.31809299]
-8.400373247932023 [0.04029767 6.31746022]
-8.400413270302224 [0.04029767 6.31682756]
-8.400453278662312 [0.04029767 6.31619501]
-8.400493272997934 [0.04029767 6.31556257]
-8.400533253303735 [0.04029767 6.31493024]
-8.400573219591834 [0.04029767 6.31429803]
-8.400613171841556 [0.04029767 6.31366592]
-8.400653110069726 [0.04029767 6.31303393]
-8.400693034262545 [0.04029766 6.31240204]
-8.400732944421065 [0.04029766 6.31177027]
-8.40077284054506 [0.04029766 6.31113861]
-8.400812722623897 [0.04029766 6.31050706]
-8.400852590665451 [0.04029766 6.30987562]
-8.400892444656833 [0.04029766 6.30924429]
-8.400932284606185 [0.04029766 6.30861307]
-8.400972110504725 [0.04029766 6.30798197]
-8.401011922357569 [0.04029766 6.30735098]
-8.4010517201

-8.407760716028363 [0.04029752 6.19867507]
-8.407798023310391 [0.04029752 6.19806424]
-8.407835316198273 [0.04029752 6.19745354]
-8.407872594696173 [0.04029752 6.19684295]
-8.40790985879849 [0.04029752 6.19623247]
-8.407947108509816 [0.04029752 6.19562212]
-8.40798434382502 [0.04029752 6.19501188]
-8.408021564740807 [0.04029752 6.19440176]
-8.408058771259292 [0.04029752 6.19379176]
-8.408095963373455 [0.04029752 6.19318188]
-8.40813314109333 [0.04029752 6.19257211]
-8.408170304399292 [0.04029752 6.19196247]
-8.408207453307895 [0.04029751 6.19135294]
-8.408244587812263 [0.04029751 6.19074353]
-8.40828170789878 [0.04029751 6.19013424]
-8.40831881358119 [0.04029751 6.18952506]
-8.408355904856464 [0.04029751 6.18891601]
-8.408392981712737 [0.04029751 6.18830707]
-8.408430044154926 [0.04029751 6.18769825]
-8.408467092182903 [0.04029751 6.18708955]
-8.40850412579745 [0.04029751 6.18648097]
-8.408541144986572 [0.04029751 6.1858725 ]
-8.408578149762505 [0.04029751 6.18526416]
-8.40861514010819

-8.41476179896867 [0.04029737 6.08183448]
-8.414796275842946 [0.04029737 6.08124728]
-8.414830738094835 [0.04029737 6.0806602 ]
-8.414865185742869 [0.04029737 6.08007325]
-8.41489961876458 [0.04029737 6.07948642]
-8.414934037174064 [0.04029737 6.07889972]
-8.414968440969261 [0.04029737 6.07831314]
-8.41500283013987 [0.04029737 6.07772669]
-8.415037204701733 [0.04029737 6.07714036]
-8.415071564640286 [0.04029737 6.07655415]
-8.415105909955642 [0.04029737 6.07596808]
-8.415140240655262 [0.04029737 6.07538212]
-8.415174556730932 [0.04029736 6.07479629]
-8.415208858185153 [0.04029736 6.07421058]
-8.41524314501713 [0.04029736 6.073625  ]
-8.415277417228737 [0.04029736 6.07303955]
-8.4153116748097 [0.04029736 6.07245422]
-8.415345917770964 [0.04029736 6.07186901]
-8.415380146108006 [0.04029736 6.07128393]
-8.415414359816268 [0.04029736 6.07069897]
-8.415448558898461 [0.04029736 6.07011414]
-8.415482743359007 [0.04029736 6.06952944]
-8.415516913185357 [0.04029736 6.06894485]
-8.41555106838409

-8.421270966984494 [0.04029722 5.96859826]
-8.421302556547033 [0.04029722 5.96803618]
-8.42133413144478 [0.04029722 5.96747423]
-8.421365691668043 [0.04029722 5.96691242]
-8.42139723722569 [0.04029722 5.96635073]
-8.421428768113785 [0.04029722 5.96578917]
-8.421460284335424 [0.04029722 5.96522775]
-8.42149178588518 [0.04029722 5.96466645]
-8.421523272761448 [0.04029722 5.96410529]
-8.421554744986182 [0.04029722 5.96354425]
-8.421586202533021 [0.04029722 5.96298335]
-8.421617645415381 [0.04029722 5.96242258]
-8.421649073619648 [0.04029721 5.96186194]
-8.42168048716887 [0.04029721 5.96130143]
-8.421711886045998 [0.04029721 5.96074105]
-8.421743270258347 [0.04029721 5.9601808 ]
-8.421774639807367 [0.04029721 5.95962068]
-8.421805994685045 [0.04029721 5.95906069]
-8.421837334900758 [0.04029721 5.95850084]
-8.421868660444233 [0.04029721 5.95794111]
-8.421899971326937 [0.04029721 5.95738152]
-8.42193126754411 [0.04029721 5.95682206]
-8.421962549095753 [0.04029721 5.95626272]
-8.4219938159774

In [26]:
neg_log_marginal_likelihood(y_train, x_train, d, theta0)

66.29993876669016

In [27]:
neg_log_marginal_likelihood(y_train, x_train, d, theta1)

-8.427354809605077