In [7]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as la
import scipy as sp
import scipy.optimize as opt

In [8]:
%precision 3
np.set_printoptions(precision=3)

# Regularization with weight decay

In [5]:
with open('in.dta.txt') as f:
    data = np.array([list(map(float, s.split())) for s in f])
    x_in = data[:,0:2]
    y_in = data[:,2]

with open('out.dta.txt') as f:
    data = np.array([list(map(float, s.split())) for s in f])
    x_out = data[:,0:2]
    y_out = data[:,2]

In [22]:
def Phi(x):
    x1, x2 = x[0], x[1]
    return 1, x1, x2, x1**2, x2**2, x1*x2, np.abs(x1-x2), np.abs(x1+x2)

z_in = np.apply_along_axis(Phi, 1, x_in)
z_out = np.apply_along_axis(Phi, 1, x_out)

w_pinv = la.pinv(z_in) @ y_in

In [23]:
def err_class(w, z, y):
    g = np.sign(z @ w)
    errors = np.abs(np.sign(g - y))
    return np.mean(errors)

def err_inout(w):
    ec_in = err_class(w, z_in, y_in)
    ec_out = err_class(w, z_out, y_out)
    return ec_in, ec_out

err_inout(w_pinv)

(0.029, 0.084)

In [37]:
NOGRAD = True

def err_sq(w, nograd=False):
    n = len(y_in)
    err = la.norm(z_in @ w - y_in) / n
    if nograd:
        return err
    grad = (z_in @ w - y_in) @ z_in / n
    return err, grad

err_sq(w_pinv, True)

0.066

In [38]:
def err_sq_decay(w, k, nograd=False):
    l = 10 ** k
    n = len(y_in)
    err, grad = err_sq(w)
    err_dec = la.norm(w) * l / n
    if nograd:
        return err + err_dec
    grad_dec = w * l / n
    grad_dec[0] = 0
    return err + err_dec, grad + grad_dec

err_sq_decay(w_pinv, 3, True)

166.110

### No Decay

In [49]:
w0 = np.zeros(w_pinv.shape)
res1 = opt.minimize(err_sq, w0, args=(), jac=True,
                    method='BFGS', options=dict(disp=True))
*err_inout(res1.x), res1.x, w_pinv

Optimization terminated successfully.
         Current function value: 0.066375
         Iterations: 57
         Function evaluations: 71
         Gradient evaluations: 71


(0.029,
 0.084,
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.481,  4.159,  0.317]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

In [48]:
w0 = np.zeros(w_pinv.shape)
res1 = opt.minimize(err_sq, w0, args=(NOGRAD,), jac=False,
                    method='Nelder-Mead',
                    options=dict(disp=True, fatol=1e-9, xatol=1e-9,
                                 maxiter=5000, maxfev=5000))
*err_inout(res1.x), res1.x, w_pinv

Optimization terminated successfully.
         Current function value: 0.072364
         Iterations: 2419
         Function evaluations: 3595


(0.057,
 0.072,
 array([-1.719, -0.101,  0.05 , -2.055, -1.875, -0.106,  3.202,  1.42 ]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

### Small Decay

In [82]:
res1 = opt.minimize(err_sq_decay, w0, args=(-3,), jac=True,
                    method='BFGS', options=dict(disp=True, maxiter=10000))
*err_inout(res1.x), res1.x, w_pinv

         Current function value: 0.066541
         Iterations: 57
         Function evaluations: 158
         Gradient evaluations: 146


(0.029,
 0.080,
 array([-1.643, -0.143,  0.102, -2.025, -1.816,  2.459,  4.142,  0.318]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

In [77]:
res1 = opt.minimize(err_sq_decay, w0, args=(-3,NOGRAD), jac=False,
                    method='Nelder-Mead',
                    options=dict(disp=True, fatol=1e-9, xatol=1e-9,
                                 maxiter=5000, maxfev=5000))
*err_inout(res1.x), res1.x, w_pinv

Optimization terminated successfully.
         Current function value: 0.066989
         Iterations: 2295
         Function evaluations: 3414


(0.029,
 0.064,
 array([-1.697, -0.142,  0.089, -2.1  , -1.963,  1.824,  3.995,  0.662]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

### Big Decay

In [89]:
res2 = opt.minimize(err_sq_decay, w0, args=(3,), jac=True,
                    method='Newton-CG',
                    options=dict(disp=True, maxiter=10000, xtol=1e-8))
*err_inout(res2.x), res2.x, w_pinv

         Current function value: 0.169031
         Iterations: 0
         Function evaluations: 18
         Gradient evaluations: 8
         Hessian evaluations: 0


(1.000,
 1.000,
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

In [90]:
res3 = opt.minimize(err_sq_decay, w0, args=(3,NOGRAD), jac=False,
                    method='Nelder-Mead',
                    options=dict(disp=True, fatol=1e-8, xatol=1e-8,
                                 maxiter=5000, maxfev=5000))
w_dec3 = res3.x
*err_inout(res3.x), res3.x, w_pinv

Optimization terminated successfully.
         Current function value: 0.169031
         Iterations: 343
         Function evaluations: 512


(1.000,
 1.000,
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

In [95]:
def my_opt(k=-9, w=None, eta=0.01, tol=1e-9, max_iter=8000):
    res = type('result', (object,), {})()
    res.k = k
    res.eta = eta
    res.tol = tol
    res.max_iter = max_iter
    res.ok = False
    res.hist = []

    w = w or w0
    err = eprev = np.Inf

    while len(res.hist) < max_iter:
        err, grad = err_sq_decay(w, k)
        res.hist.append(err)
        if np.abs(err - eprev) < tol:
            res.ok = True
            break
        w = w - eta * grad
        eprev = err

    res.w = w
    res.err = err
    res.niter = len(res.hist)
    res.erin, res.erout = err_inout(w)
    return res

In [96]:
res4 = my_opt(k=3)
res4.erin, res4.erout, res4.ok, res4.err, res4.niter, res4.w

(0.429,
 0.472,
 True,
 4.028,
 1807,
 array([ 0.134, -0.002,  0.002,  0.002,  0.003, -0.008,  0.015, -0.007]))

In [112]:
for k in (3,2,1,0,-1,-2):
    res = opt.minimize(err_sq_decay, w0, args=(k,), jac=True,
                       method='BFGS', options=dict(disp=False))
    print('k:%2d  erout:%.3f  status:%s ni:%s' % (k, err_inout(res.x)[1], res.status, res.nit))

k: 3  erout:1.000  status:2 ni:0
k: 2  erout:1.000  status:2 ni:0
k: 1  erout:1.000  status:2 ni:0
k: 0  erout:0.136  status:2 ni:9
k:-1  erout:0.056  status:2 ni:47
k:-2  erout:0.080  status:2 ni:57


In [116]:
for k in (3,2,1,0,-1,-2):
    res = my_opt(k=k, eta=0.01, max_iter=200000)
    print('k:{k:2.0f}  erout:{erout:.3f}  ok:{ok:d} ni:{niter}'.format(**res.__dict__))

k: 3  erout:0.472  ok:1 ni:1807
k: 2  erout:0.376  ok:1 ni:1782
k: 1  erout:0.112  ok:1 ni:6185
k: 0  erout:0.132  ok:1 ni:777
k:-1  erout:0.056  ok:1 ni:78168
k:-2  erout:0.080  ok:1 ni:104762


## Poblem #10

In [19]:
def nn_weights(layer1_size, input_nodes=10, output_nodes=1, hidden_nodes=36):
    zero_nodes = 2
    layer2_size = hidden_nodes - layer1_size - zero_nodes
    return input_nodes*layer1_size+(layer1_size+1)*layer2_size+(layer2_size+1)*output_nodes
max(nn_weights(x) for x in range(99))

510

In [22]:
res = opt.minimize(lambda x: -nn_weights(x), 1)
nn_weights(res.x[0])

510.000