In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import numpy.linalg as la
import scipy as sp
import scipy.optimize as opt
import numba

In [3]:
%precision 3
np.set_printoptions(precision=3)

# Regularization with weight decay

In [4]:
with open('in.dta.txt') as f:
    data = np.array([list(map(float, s.split())) for s in f])
    x_in = data[:,0:2]
    y_in = data[:,2]

with open('out.dta.txt') as f:
    data = np.array([list(map(float, s.split())) for s in f])
    x_out = data[:,0:2]
    y_out = data[:,2]

In [5]:
def Phi(x):
    x1, x2 = x[0], x[1]
    return 1, x1, x2, x1**2, x2**2, x1*x2, np.abs(x1-x2), np.abs(x1+x2)

z_in = np.apply_along_axis(Phi, 1, x_in)
z_out = np.apply_along_axis(Phi, 1, x_out)

w_pinv = la.pinv(z_in) @ y_in

In [6]:
@numba.jit
def err_class(w, z, y):
    g = np.sign(z @ w)
    errors = np.abs(np.sign(g - y))
    return np.mean(errors)

@numba.jit
def err_inout(w):
    ec_in = err_class(w, z_in, y_in)
    ec_out = err_class(w, z_out, y_out)
    return ec_in, ec_out

err_inout(w_pinv)

(0.029, 0.084)

In [7]:
@numba.njit
def err_sq_nograd(w):
    return err_sq_grad(w)[0]

@numba.njit
def err_sq_grad(w):
    n = len(y_in)
    err = la.norm(z_in @ w - y_in) / n
    grad = (z_in @ w - y_in) @ z_in / n
    return err, grad

err_sq_grad(w_pinv)

(0.066, array([ -8.755e-16,  -1.936e-16,  -4.316e-16,  -5.457e-16,  -4.383e-16,
         -1.071e-16,  -6.657e-16,  -8.953e-16]))

## No Decay
#### Task 2

In [8]:
w0 = np.zeros(w_pinv.shape)
res1 = opt.minimize(err_sq_grad, w0, args=(), jac=True,
                    method='BFGS', options=dict(disp=True))
*err_inout(res1.x), res1.x, w_pinv

Optimization terminated successfully.
         Current function value: 0.066375
         Iterations: 57
         Function evaluations: 71
         Gradient evaluations: 71


(0.029,
 0.084,
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.481,  4.159,  0.317]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

In [9]:
w0 = np.zeros(w_pinv.shape)
res1 = opt.minimize(err_sq_nograd, w0, args=(), jac=False,
                    method='Nelder-Mead',
                    options=dict(disp=True, fatol=1e-9, xatol=1e-9,
                                 maxiter=5000, maxfev=5000))
*err_inout(res1.x), res1.x, w_pinv

Optimization terminated successfully.
         Current function value: 0.072364
         Iterations: 2416
         Function evaluations: 3600


(0.057,
 0.072,
 array([-1.719, -0.101,  0.05 , -2.055, -1.875, -0.106,  3.202,  1.42 ]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

## Small Decay
#### Task 3

In [10]:
@numba.njit
def err_sq_decay_nograd(w, k):
    return err_sq_decay_grad(w, k)[0]

@numba.njit
def err_sq_decay_grad(w, k):
    l = 10. ** k
    n = len(y_in)
    err = la.norm(z_in @ w - y_in) / n
    err_dec = la.norm(w) * l / n
    grad = (z_in @ w - y_in) @ z_in / n
    grad_dec = w * l / n
    grad_dec[0] = 0
    return err + err_dec, grad + grad_dec

err_sq_decay_nograd(w_pinv, 3)

166.110

In [11]:
res1 = opt.minimize(err_sq_decay_grad, w0, args=(-3,), jac=True,
                    method='BFGS', options=dict(disp=True, maxiter=10000))
*err_inout(res1.x), res1.x, w_pinv

         Current function value: 0.066541
         Iterations: 57
         Function evaluations: 143
         Gradient evaluations: 131


(0.029,
 0.080,
 array([-1.643, -0.143,  0.102, -2.025, -1.816,  2.459,  4.142,  0.318]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

In [12]:
res1 = opt.minimize(err_sq_decay_nograd, w0, args=(-3,), jac=False,
                    method='Nelder-Mead',
                    options=dict(disp=True, fatol=1e-9, xatol=1e-9,
                                 maxiter=5000, maxfev=5000))
*err_inout(res1.x), res1.x, w_pinv

Optimization terminated successfully.
         Current function value: 0.066989
         Iterations: 2293
         Function evaluations: 3410


(0.029,
 0.064,
 array([-1.697, -0.142,  0.089, -2.1  , -1.963,  1.824,  3.995,  0.662]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

## Big Decay
#### Task 4

In [13]:
res2 = opt.minimize(err_sq_decay_grad, w0, args=(3,), jac=True,
                    method='Newton-CG',
                    options=dict(disp=True, maxiter=10000, xtol=1e-8))
*err_inout(res2.x), res2.x, w_pinv

         Current function value: 0.169031
         Iterations: 0
         Function evaluations: 18
         Gradient evaluations: 8
         Hessian evaluations: 0


(1.000,
 1.000,
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

In [14]:
res3 = opt.minimize(err_sq_decay_nograd, w0, args=(3,), jac=False,
                    method='Nelder-Mead',
                    options=dict(disp=True, fatol=1e-8, xatol=1e-8,
                                 maxiter=5000, maxfev=5000))
w_dec3 = res3.x
*err_inout(res3.x), res3.x, w_pinv

Optimization terminated successfully.
         Current function value: 0.169031
         Iterations: 343
         Function evaluations: 512


(1.000,
 1.000,
 array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 array([-1.647, -0.145,  0.102, -2.033, -1.828,  2.482,  4.159,  0.317]))

In [15]:
@numba.njit
def __my_opt(k, eta, tol, max_iter, w, result):
    err = eprev = np.Inf
    ok = False
    niter = 0
    while niter < max_iter:
        niter += 1
        err, grad = err_sq_decay_grad(w, k)
        if np.abs(err - eprev) < tol:
            ok = True
            break
        w = w - eta * grad
        eprev = err
    result[0] = ok
    result[1] = niter
    return w

def my_opt(k, eta = 0.01, tol = 1e-9, max_iter = 120000):
    result = np.array([0, 0])
    w = __my_opt(k, eta, tol, max_iter, w0.copy(), result)
    ok, niter = bool(result[0]), int(result[1])
    erin, erout = err_inout(w)
    return dict(k=k, eta=eta, tol=tol, max_iter=max_iter,
                w=w, ok=ok, niter=niter, erin=erin, erout=erout)

In [16]:
my_opt(k=3)

{'erin': 0.429,
 'erout': 0.472,
 'eta': 0.010,
 'k': 3,
 'max_iter': 120000,
 'niter': 1807,
 'ok': True,
 'tol': 0.000,
 'w': array([ 0.134, -0.002,  0.002,  0.002,  0.003, -0.008,  0.015, -0.007])}

### Task 5,6

In [22]:
for k in (3,2,1,0,-1,-2):
    res = my_opt(k)
    print('k:{k:2.0f}  erout:{erout:.3f}  ok:{ok:d} ni:{niter:6d}  w:{w}'.format(**res))

k: 3  erout:0.472  ok:1 ni:  1807  w:[ 0.134 -0.002  0.002  0.002  0.003 -0.008  0.015 -0.007]
k: 2  erout:0.376  ok:1 ni:  1782  w:[ 0.062 -0.012  0.019  0.015  0.03  -0.071  0.129 -0.062]
k: 1  erout:0.112  ok:1 ni:  6185  w:[-0.291  0.019  0.083  0.016  0.156 -0.28   0.62  -0.204]
k: 0  erout:0.132  ok:1 ni:   777  w:[-0.349  0.077  0.138 -0.05   0.21  -0.387  0.851 -0.37 ]
k:-1  erout:0.056  ok:1 ni: 78168  w:[-1.405 -0.039  0.095 -1.48  -1.108  1.065  3.038  0.404]
k:-2  erout:0.080  ok:1 ni:104762  w:[-1.619 -0.128  0.1   -1.955 -1.731  2.167  3.96   0.381]


In [18]:
%timeit my_opt(-2)

1.26 s ± 322 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [19]:
for k in (3,2,1,0,-1,-2):
    res = opt.minimize(err_sq_decay_grad, w0, args=(k,), jac=True,
                       method='BFGS', options=dict(disp=False))
    print('k:%2d  erout:%.3f  status:%s ni:%s' % (k, err_inout(res.x)[1], res.status, res.nit))

k: 3  erout:1.000  status:2 ni:0
k: 2  erout:1.000  status:2 ni:0
k: 1  erout:1.000  status:2 ni:0
k: 0  erout:0.136  status:2 ni:9
k:-1  erout:0.056  status:2 ni:47
k:-2  erout:0.080  status:2 ni:57


## Poblem #10

In [20]:
def nn_weights(layer1_size, input_nodes=10, output_nodes=1, hidden_nodes=36):
    zero_nodes = 2
    layer2_size = hidden_nodes - layer1_size - zero_nodes
    return input_nodes*layer1_size+(layer1_size+1)*layer2_size+(layer2_size+1)*output_nodes
max(nn_weights(x) for x in range(99))

510

In [21]:
res = opt.minimize(lambda x: -nn_weights(x), 1)
nn_weights(res.x[0])

510.000