In [None]:
import numpy as np
import matplotlib.pyplot as plt


def f(x):
    """Evaluate the quadratic ``x**2 - 4*x + 6``.

    Works element-wise for any operand that supports ``**``, ``*``, ``-``
    and ``+`` with numbers (plain scalars as well as NumPy arrays).
    """
    square_term = x ** 2
    linear_term = 4 * x
    return square_term - linear_term + 6

def grad_f(x):
    """Evaluate ``2*x - 4``, the derivative of ``x**2 - 4*x + 6``.

    Works element-wise for scalars and NumPy arrays.
    """
    doubled = 2 * x
    return doubled - 4


def steepest_descent(func, grad_func, x0, learning_rate = 0.01, max_iter = 10, verbose = True):
    """Run fixed-step gradient descent on a one-variable function.

    Each iteration applies ``x <- x - learning_rate * grad_func(x)``.

    Parameters
    ----------
    func : callable
        Objective; only used for the verbose printout and the returned value.
    grad_func : callable
        Derivative of ``func``.
    x0 : number
        Starting point.
    learning_rate : float
        Step size multiplied with the gradient.
    max_iter : int
        Number of update steps to perform.
    verbose : bool
        When true, print the iteration index, the new point and its
        objective value after every step.

    Returns
    -------
    tuple
        ``(x_final, func(x_final), paths)`` where ``paths`` is the list of
        visited points: the starting point followed by one entry per
        iteration, so ``paths[-1]`` is ``x_final`` and
        ``len(paths) == max_iter + 1``.
    """
    # Record the start so the path is never empty, even for max_iter == 0.
    paths = [x0]
    for i in range(max_iter):
        x1 = x0 - learning_rate * grad_func(x0)
        if verbose:
            print('{0:03d} : {1:4.3f}, {2:4.2E}'.format(i, x1, func(x1)))
        x0 = x1
        # Record the point *after* the update so the final iterate is included.
        paths.append(x0)
    return (x0, func(x0), paths)


# Sample f on a uniform grid over [-5, 5].
number_of_points = 101
x = np.linspace(-5., 5, number_of_points)
fx = f(x)

# Figure 1: the objective on its own.
plt.plot(x, fx, label='$f(x)$')
plt.grid()
plt.xlabel('$x$')
plt.ylabel('$f(x)$')
plt.title('Function $f(x)$')
plt.legend()
plt.show()

# Brute-force estimate of the minimiser: the grid point with the smallest
# sampled value of f.
xid = np.argmin(fx)
xopt = x[xid]

# Figure 2: the objective with the grid minimum marked.  The legend call is
# required for the labels given to the two artists to actually be displayed.
plt.plot(x, fx, label='$f(x)$')
plt.grid()
plt.xlabel('$x$')
plt.ylabel('$f(x)$')
plt.title('Function $f(x)$')
plt.plot(xopt, f(xopt), 'ro', label='$x_{opt}$')
plt.legend()
plt.show()

def _plot_descent_run(x0, learning_rate, x_min, x_max):
    """Run ``steepest_descent`` on ``f`` and show two diagnostic figures.

    Figure 1 draws ``f`` on ``[x_min, x_max]`` together with the visited
    points; figure 2 draws the objective value at each visited point against
    its position in the path.

    Returns ``(xopt, fopt, paths, x)`` where ``paths`` is the visited points
    as a NumPy array and ``x`` is the grid used for the curve.
    """
    xopt, fopt, paths = steepest_descent(f, grad_f, x0, learning_rate=learning_rate)

    x = np.linspace(x_min, x_max, 1000)
    paths = np.array(paths)

    # Objective curve with the iterates overlaid.
    plt.plot(x, f(x), label='$f(x)$')
    plt.grid()
    plt.xlabel('$x$')
    plt.ylabel('$f(x)$')
    plt.title('Function $f(x)$')
    plt.plot(paths, f(paths), 'o-', label='iterates')
    plt.legend()
    plt.show()

    # Objective value per recorded point; the horizontal axis is the index
    # into the path, not the variable x.
    plt.plot(f(paths), 'o-')
    plt.grid()
    plt.xlabel('Iteration')
    plt.ylabel('$Cost$')
    plt.title('Cost function')
    plt.show()

    return xopt, fopt, paths, x


# For this f the update is x <- (1 - 2*lr) * x + 4*lr, so the step size
# decides whether the iterates diverge, oscillate or converge.

# lr = 1.2: |1 - 2*lr| = 1.4 > 1, each step overshoots further.
xopt, fopt, paths, x = _plot_descent_run(0.0, 1.2, 0.5, 2.5)

# lr = 1.0: factor is exactly -1, the iterate bounces between two points.
xopt, fopt, paths, x = _plot_descent_run(1.0, 1.0, 0.5, 3.5)

# lr = 0.001: factor 0.998, progress per step is very small.
xopt, fopt, paths, x = _plot_descent_run(1.0, 0.001, 0.5, 3.5)

# lr = 0.9: factor -0.8, the iterate alternates sides while closing in.
xopt, fopt, paths, x = _plot_descent_run(3.0, 0.9, 0.5, 3.5)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm
from visualize import contour_with_quiver
from visualize import contour_with_path
from visualize import surf

# Bounds and spacing of the evaluation grid, identical on both axes.
xmin, xmax, xstep = -4.0, 4.0, .25
ymin, ymax, ystep = -4.0, 4.0, .25

# One extra step is added to each upper bound so the bound itself is part of
# the arange output.
x, y = np.meshgrid(
    np.arange(xmin, xmax + xstep, xstep),
    np.arange(ymin, ymax + ystep, ystep),
)


def f(x, y):
    """Paraboloid ``(x - 2)**2 + (y - 2)**2``."""
    return (x - 2) ** 2 + (y - 2) ** 2


z = f(x, y)
minima = np.array([2., 2.])

# Objective value at the stated minimum (only displayed when this is the last
# expression of a notebook cell).
f(minima[0], minima[1])

# Column-vector (2, 1) view of the minimum, the form handed to the plotting
# helpers below.
minima_ = minima[:, np.newaxis]
print(minima, minima_)
surf(f, x, y, minima=minima_)


def grad_f_x(x, y):
    """Partial derivative of ``f`` with respect to ``x``."""
    return 2 * (x - 2)


def grad_f_y(x, y):
    """Partial derivative of ``f`` with respect to ``y``."""
    return 2 * (y - 2)


contour_with_quiver(f, x, y, grad_f_x, grad_f_y, minima=minima_)

def steepest_descent_twod(func, gradx, grady, x0, max_iteration = 10, learning_rate = 0.25, verbose = True):
    """Run fixed-step gradient descent on a function of two variables.

    Parameters
    ----------
    func : callable
        Objective, called as ``func(x, y)``.
    gradx, grady : callable
        Partial derivatives, each called as ``g(x, y)``.
    x0 : array-like of length 2
        Starting point.
    max_iteration : int
        Number of update steps.
    learning_rate : float
        Step size multiplied with the gradient.
    verbose : bool
        When true, print the iteration index, the new point and its
        objective value after each step.

    Returns
    -------
    tuple
        ``(x_final, f_final, paths, fval_paths)``. ``paths`` has shape
        ``(2, max_iteration + 1)`` (row 0 holds the first coordinate, row 1
        the second, starting point included) and ``fval_paths`` holds the
        objective value at each of those points.
    """
    x0 = np.asarray(x0, dtype=float)
    # Evaluate the callables that were passed in, not module-level globals,
    # and seed fval here so it is defined even when max_iteration == 0.
    fval = func(*x0)
    paths = [x0]
    fval_paths = [fval]
    for i in range(max_iteration):
        grad = np.array([gradx(*x0), grady(*x0)])
        x1 = x0 - learning_rate * grad
        fval = func(*x1)
        if verbose:
            print(i, x1, fval)
        x0 = x1
        paths.append(x0)
        fval_paths.append(fval)
    # Stack the visited points and transpose to one row per coordinate.
    paths = np.array(paths).T
    fval_paths = np.array(fval_paths)
    return (x0, fval, paths, fval_paths)

# Descend from (-2, -2) with the default step size and iteration count.
x0 = np.array([-2., -2.])
xopt, fopt, paths, fval_paths = steepest_descent_twod(
    func=f, gradx=grad_f_x, grady=grad_f_y, x0=x0)

# Overlay the visited points on the contour plot; the minimum is passed as a
# (2, 1) column.
contour_with_path(f, x, y, paths=paths, minima=np.array([2, 2]).reshape(2, 1))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# matplotlib.use('TkAgg')

# Fixed seed so the generated noise is the same on every run.
np.random.seed(320)
x_train = np.linspace(-1, 1, 51)


def f(x):
    """Noise-free target line ``0.5 * x + 1.0``."""
    return 0.5*x+1.0


# np.random.rand draws uniformly from [0, 1), so the added noise lies in
# [0, 0.4) and is never negative.
y_train = f(x_train) + 0.4*np.random.rand(len(x_train))
plt.plot(x_train, y_train, 'o', label='Training data')
plt.grid()
plt.xlabel('$x$')
plt.ylabel('$y$')
# Without this call the label given above is never shown.
plt.legend()
plt.show()


# Re-seed, then apply one shared permutation to inputs and targets so the
# pairs stay aligned while their order is randomised.
np.random.seed(303)

shuffled_id = np.arange(0, len(x_train))
np.random.shuffle(shuffled_id)
x_train = x_train[shuffled_id]
y_train = y_train[shuffled_id]


def loss(w, x_set, y_set):
    """Mean of the halved squared residuals of the line ``w[0] * x + w[1]``.

    Parameters
    ----------
    w : sequence of length >= 2
        ``w[0]`` is the slope and ``w[1]`` the intercept.
    x_set, y_set : array-like
        Inputs and targets of equal length.

    Returns
    -------
    float
        ``sum(0.5 * (w[0] * x + w[1] - y) ** 2) / len(x_set)``.

    Raises
    ------
    ZeroDivisionError
        If ``x_set`` is empty.
    """
    x_arr = np.asarray(x_set, dtype=float)
    y_arr = np.asarray(y_set, dtype=float)
    n_samples = len(x_arr)
    if n_samples == 0:
        # Keep the failure mode of a plain division by zero instead of
        # letting NumPy return nan with only a warning.
        raise ZeroDivisionError("loss is undefined for an empty data set")
    # One vectorized pass replaces the per-sample Python loop.
    residual = w[0] * x_arr + w[1] - y_arr
    return np.sum(0.5 * residual ** 2) / n_samples


def loss_grad(w, x_set, y_set):
    """Gradient of ``loss`` with respect to the slope and the intercept.

    Parameters
    ----------
    w : sequence of length 2
        ``w[0]`` is the slope and ``w[1]`` the intercept.
    x_set, y_set : array-like
        Inputs and targets of equal length.

    Returns
    -------
    numpy.ndarray
        Length-2 array ``[mean(residual * x), mean(residual)]`` where
        ``residual = w[0] * x + w[1] - y``.
    """
    x_arr = np.asarray(x_set, dtype=float)
    y_arr = np.asarray(y_set, dtype=float)
    residual = w[0] * x_arr + w[1] - y_arr
    # Slope component weights each residual by its input; the intercept
    # component is the plain residual sum.
    totals = np.array([np.dot(residual, x_arr), np.sum(residual)])
    return totals / len(x_arr)


def generate_batches(batch_size, features, labels):
    """Split ``features`` and ``labels`` into consecutive slices.

    Returns a list of ``[feature_slice, label_slice]`` pairs, each covering
    ``batch_size`` consecutive items; the last pair is shorter when the
    length is not a multiple of ``batch_size``.  The two inputs must have
    the same length (checked with ``assert``).
    """
    assert len(features) == len(labels)

    total = len(features)
    return [
        [features[lo:lo + batch_size], labels[lo:lo + batch_size]]
        for lo in range(0, total, batch_size)
    ]


def draw(paths_1, paths_2, x_train, y_train, color_1, color_2, name_1, name_2):
    """Plot two optimisation paths as arrows over the loss contours.

    Parameters
    ----------
    paths_1, paths_2 : sequence of length-2 arrays
        Visited weight vectors ``[w0, w1]`` of each method, in order.
    x_train, y_train : array-like
        Data passed to ``loss`` to evaluate the surface.
    color_1, color_2 : matplotlib colour
        Arrow colour for each path.
    name_1, name_2 : str
        Legend entries for each path.
    """
    # Evaluate the loss on a 101 x 101 grid covering [-2, 5] on both axes.
    W0 = np.linspace(-2, 5, 101)
    W1 = np.linspace(-2, 5, 101)
    W0, W1 = np.meshgrid(W0, W1)
    LOSSW = np.zeros_like(W0)
    for i in range(W0.shape[0]):
        for j in range(W0.shape[1]):
            wij = np.array([W0[i, j], W1[i, j]])
            LOSSW[i, j] = loss(wij, x_train, y_train)

    fig, ax = plt.subplots(figsize=(6, 6))

    ax.contour(W0, W1, LOSSW, cmap=plt.cm.jet,
               levels=np.linspace(0, LOSSW.max(), 20))

    handles = []
    for path, color in ((paths_1, color_1), (paths_2, color_2)):
        # One row per coordinate; plain ndarray instead of np.matrix.
        pts = np.asarray(path, dtype=float).T
        # One arrow per step, from each point to its successor.
        arrows = ax.quiver(pts[0, :-1], pts[1, :-1],
                           np.diff(pts[0]), np.diff(pts[1]),
                           angles='xy', scale_units='xy', scale=1, color=color)
        handles.append(arrows)

    ax.set_xlabel('$w_0$')
    ax.set_ylabel('$w_1$')
    # Pass the quiver artists explicitly: a labels-only legend pairs labels
    # with artists by order and can pick up the contour lines instead.
    ax.legend(handles, [name_1, name_2])
    plt.show()


# Mini-batch settings used by every training loop below:
# batch size, step size and number of passes over the data.
batch_size = 10
lr = 0.01
max_epoch = 51

# Momentum: weight given to the previous velocity in each update.
alpha = .9

# ---- Plain mini-batch SGD ----
path_sgd = []
w0 = np.array([4.0, -1.0])
for epoch in range(max_epoch):
    # Report the full-data loss every tenth epoch.
    if not epoch % 10:
        print(epoch, w0, loss(w0, x_train, y_train))
    for x_batch, y_batch in generate_batches(batch_size, x_train, y_train):
        # Record the weights before the step is taken.
        path_sgd.append(w0)
        grad = loss_grad(w0, x_batch, y_batch)
        # A new array is bound each step, so recorded entries are not mutated.
        w0 = w1 = w0 - lr * grad

# ---- SGD with momentum ----
path_momentum = []
w0 = np.array([4.0, -1.0])
v = np.zeros_like(w0)
for epoch in range(max_epoch):
    if not epoch % 10:
        print(epoch, w0, loss(w0, x_train, y_train))
    for x_batch, y_batch in generate_batches(batch_size, x_train, y_train):
        path_momentum.append(w0)
        grad = loss_grad(w0, x_batch, y_batch)
        # Velocity: decayed previous velocity minus the scaled gradient.
        v = alpha * v - lr * grad
        w0 = w1 = w0 + v

draw(path_sgd, path_momentum, x_train, y_train,
     'k', 'r', 'SGD', 'Momentum')

# Step size for plain SGD in this comparison.
lr = 1.5

# Adagrad: global step size (set equal to the SGD step) and the small
# constant added to the denominator of the update.
epsilon = lr
delta = 1e-7

# ---- Plain mini-batch SGD ----
path_sgd = []
w0 = np.array([4.0, -1.0])
for epoch in range(max_epoch):
    # Report the full-data loss every tenth epoch.
    if not epoch % 10:
        print(epoch, w0, loss(w0, x_train, y_train))
    for x_batch, y_batch in generate_batches(batch_size, x_train, y_train):
        # Record the weights before the step is taken.
        path_sgd.append(w0)
        grad = loss_grad(w0, x_batch, y_batch)
        w0 = w1 = w0 - lr * grad

# ---- Adagrad ----
path_adagrad = []
w0 = np.array([4.0, -1.0])
r = np.zeros_like(w0)
for epoch in range(max_epoch):
    if not epoch % 10:
        print(epoch, w0, loss(w0, x_train, y_train))
    for x_batch, y_batch in generate_batches(batch_size, x_train, y_train):
        path_adagrad.append(w0)
        grad = loss_grad(w0, x_batch, y_batch)
        # Running sum of squared gradients, one entry per weight.
        r += grad ** 2
        # Per-weight step: gradient divided by the root of the running sum.
        delw = -epsilon * grad / (np.sqrt(r) + delta)
        w0 = w1 = w0 + delw

draw(path_sgd, path_adagrad, x_train, y_train,
     'k', 'r', 'SGD', 'Adagrad')

# Step size and denominator constant, used by both loops in this comparison.
epsilon = 0.25
delta = 1e-6

# RMSprop: decay factor of the squared-gradient moving average.
rho = 0.9

# ---- Adagrad ----
path_adagrad = []
w0 = np.array([4.0, -1.0])
r = np.zeros_like(w0)
for epoch in range(max_epoch):
    # Report the full-data loss every tenth epoch.
    if not epoch % 10:
        print(epoch, w0, loss(w0, x_train, y_train))
    for x_batch, y_batch in generate_batches(batch_size, x_train, y_train):
        # Record the weights before the step is taken.
        path_adagrad.append(w0)
        grad = loss_grad(w0, x_batch, y_batch)
        # Running sum of squared gradients, one entry per weight.
        r += grad ** 2
        delw = -epsilon * grad / (np.sqrt(r) + delta)
        w0 = w1 = w0 + delw

# ---- RMSprop ----
path_rmsprop = []
w0 = np.array([4.0, -1.0])
r = np.zeros_like(w0)
for epoch in range(max_epoch):
    if not epoch % 10:
        print(epoch, w0, loss(w0, x_train, y_train))
    for x_batch, y_batch in generate_batches(batch_size, x_train, y_train):
        path_rmsprop.append(w0)
        grad = loss_grad(w0, x_batch, y_batch)
        # Exponentially weighted average of squared gradients instead of a
        # plain running sum.
        r = rho * r + (1 - rho) * grad ** 2
        delw = -epsilon * grad / (np.sqrt(r) + delta)
        w0 = w1 = w0 + delw

draw(path_adagrad, path_rmsprop, x_train, y_train,
     'k', 'r', 'Adagrad', 'RMSprop')

# Step size shared by both loops, and the RMSprop denominator constant.
epsilon = 0.1
delta = 1e-6

# Adam: denominator constant and the decay factors of the first- and
# second-moment averages.
delta_adam = 1e-8
rho1 = 0.9
rho2 = 0.999

# ---- RMSprop (uses the decay factor `rho` defined earlier in the notebook) ----
path_rmsprop = []
w0 = np.array([4.0, -1.0])
r = np.zeros_like(w0)
for epoch in range(max_epoch):
    # Report the full-data loss every tenth epoch.
    if not epoch % 10:
        print(epoch, w0, loss(w0, x_train, y_train))
    for x_batch, y_batch in generate_batches(batch_size, x_train, y_train):
        # Record the weights before the step is taken.
        path_rmsprop.append(w0)
        grad = loss_grad(w0, x_batch, y_batch)
        r = rho * r + (1 - rho) * grad ** 2
        delw = -epsilon * grad / (np.sqrt(r) + delta)
        w0 = w1 = w0 + delw

# ---- Adam ----
path_adam = []
w0 = np.array([4.0, -1.0])
s = np.zeros_like(w0)
r = np.zeros_like(w0)
t = 0

for epoch in range(max_epoch):
    if not epoch % 10:
        print(epoch, w0, loss(w0, x_train, y_train))
    for x_batch, y_batch in generate_batches(batch_size, x_train, y_train):
        path_adam.append(w0)
        grad = loss_grad(w0, x_batch, y_batch)
        # Moving averages of the gradient and of its square.
        s = rho1 * s + (1 - rho1) * grad
        r = rho2 * r + (1 - rho2) * grad ** 2
        # Step counter drives the bias correction of both averages.
        t += 1
        shat = s / (1. - rho1**t)
        rhat = r / (1. - rho2**t)
        delw = -epsilon * shat / (delta_adam + np.sqrt(rhat))
        w0 = w1 = w0 + delw

draw(path_rmsprop, path_adam, x_train, y_train,
     'k', 'r', 'RMSprop', 'Adam')
