In [1]:
import matplotlib.pyplot as plt
import numpy as np

from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import LogNorm
from matplotlib import animation
from IPython.display import HTML

from scipy.optimize import minimize
from collections import defaultdict
from itertools import zip_longest
from functools import partial

import torch
from torch.autograd import Variable


In [2]:
def f_t(x):
    """Stochastic linear loss ``c * x``.

    The slope ``c`` is drawn anew on every call: 1010 with probability 0.01
    and -10 with probability 0.99.
    """
    slope = np.random.choice([1010, -10], p=[0.01, 0.99])
    scaled = x * float(slope)
    return scaled

def f_t_mod(x,t):
    """Deterministic counterpart of ``f_t``: returns ``c * x`` for step ``t``.

    ``c`` is 1010 whenever ``t % 101 == 1`` and -10 on every other step.
    """
    c = 1010 if t % 101 == 1 else -10
    return x * float(c)

# Beale's function
def f(x, y):
    """Evaluate Beale's function at ``(x, y)``; also accepts NumPy arrays."""
    term_a = 1.5 - x + x*y
    term_b = 2.25 - x + x*y**2
    term_c = 2.625 - x + x*y**3
    return term_a**2 + term_b**2 + term_c**2
# f = lambda x,y: (1010*x+1010*y)*0.01 - (10*x+10*y)*0.99 ## deterministic 2d version of function in paper


In [3]:
def regret(loss_sum,min_loss_sum,t):
    """Average regret after ``t`` steps.

    Returns the gap between the accumulated loss ``loss_sum`` and the
    accumulated comparator loss ``min_loss_sum``, divided by ``t``.
    """
    gap = loss_sum - min_loss_sum
    return gap / t
    

In [4]:
def l_r(x):
    """Decaying step size ``1e-3 / sqrt(x)`` for step index ``x``."""
    base_rate = 1e-3
    return base_rate / np.sqrt(x)

In [None]:
from amsgrad import Amsgrad

# Step counter shared with the closures below (they read it as a global).
t=1

# Running sums consumed by regret(): accumulated loss of each optimizer and
# accumulated loss of the comparator point.
total_loss_amsgrad = 0
total_min_loss_amsgrad = 0

total_loss_adam = 0
total_min_loss_adam = 0

# Both iterates start from x = -0.3.
x_var_adam = Variable(torch.FloatTensor([-.3]), requires_grad=True)
adam = torch.optim.Adam([x_var_adam], betas=(0.9,0.99), lr=l_r(1))

# NOTE(review): AMSGrad runs with beta1 = 0 while Adam uses beta1 = 0.9 --
# confirm the asymmetry is intended for this comparison.
x_var_amsgrad = Variable(torch.FloatTensor([-0.3]), requires_grad=True)
amsgrad = Amsgrad([x_var_amsgrad], betas=(0,0.99) , lr=l_r(1))

# Store plain Python floats: on PyTorch >= 0.4 `.data[0]` is a 0-dim tensor that
# shares storage with the parameter and would change under in-place updates.
x_var_adam_hist = [float(x_var_adam.data[0])]
x_var_amsgrad_hist = [float(x_var_amsgrad.data[0])]


regret_adam_hist=[]
regret_amsgrad_hist=[]

iters = 600000

def closure_adam():
    """Closure for ``adam.step``: evaluate the step-``t`` loss and update the totals.

    Reads the module-level ``adam``, ``x_var_adam`` and ``t``. Adds the loss to
    ``total_loss_adam`` and the loss of the fixed comparator x = -1 to
    ``total_min_loss_adam``.

    Returns:
        The loss tensor, after ``backward()`` has been called on it.
    """
    global total_loss_adam, total_min_loss_adam
    adam.zero_grad()
    loss = f_t_mod(x_var_adam,t)
    loss.backward()
    # Accumulate as a Python float; on PyTorch >= 0.4 `.data[0]` is a 0-dim
    # tensor and adding it would turn the running sum into a float32 tensor.
    total_loss_adam += float(loss.data[0])
    # Loss of x = -1 at this step, i.e. -c for the slope c used by f_t_mod.
    total_min_loss_adam += -1*(1010 if t%101==1 else -10)
    return loss

def closure_amsgrad():
    """Closure for ``amsgrad.step``: evaluate the step-``t`` loss and update the totals.

    Reads the module-level ``amsgrad``, ``x_var_amsgrad`` and ``t``. Adds the
    loss to ``total_loss_amsgrad`` and the loss of the fixed comparator x = -1
    to ``total_min_loss_amsgrad``.

    Returns:
        The loss tensor, after ``backward()`` has been called on it.
    """
    global total_loss_amsgrad, total_min_loss_amsgrad
    amsgrad.zero_grad()
    loss = f_t_mod(x_var_amsgrad,t)
    loss.backward()
    # Accumulate as a Python float; on PyTorch >= 0.4 `.data[0]` is a 0-dim
    # tensor and adding it would turn the running sum into a float32 tensor.
    total_loss_amsgrad += float(loss.data[0])
    # Loss of x = -1 at this step, i.e. -c for the slope c used by f_t_mod.
    total_min_loss_amsgrad += -1*(1010 if t%101==1 else -10)
    return loss

# Run both optimizers for `iters` steps. `t` is the module-level step counter
# that closure_adam / closure_amsgrad read to pick the loss for the current step.
t=1
for i in range(iters):
    
    #zero the gradients
    amsgrad.zero_grad()
    adam.zero_grad()
    
    #Perform an optimization step
    adam.step(closure_adam)
    amsgrad.step(closure_amsgrad)
    
    #Clamp the variables between -1 and 1
    x_var_adam.data = x_var_adam.data.clamp(-1,1)
    x_var_amsgrad.data = x_var_amsgrad.data.clamp(-1,1)
    
    #Calculate the regret and store it (as a plain float, whatever type the totals have)
    adam_regret = float(regret(total_loss_adam,total_min_loss_adam,t))
    regret_adam_hist.append(adam_regret)
    
    ams_regret = float(regret(total_loss_amsgrad,total_min_loss_amsgrad,t))
    regret_amsgrad_hist.append(ams_regret)

    #Store the x_t values as floats: on PyTorch >= 0.4 `.data[0]` is a view that
    #the next in-place optimizer update would overwrite inside the history list.
    x_var_adam_hist.append(float(x_var_adam.data[0]))
    x_var_amsgrad_hist.append(float(x_var_amsgrad.data[0]))
    
    t+=1
t=1
    
# Average regret R_t/t per iteration
x = list(range(0,iters))
plt.clf()
plt.xlabel("Iterations")
plt.ylabel("$R_t/t$")
plt.plot(x,regret_adam_hist, label="adam", c='b', ls='--')
plt.plot(x,regret_amsgrad_hist,label="amsgrad",c='g')
plt.axis([0, iters, 0, 3])
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
plt.legend(loc='best')
plt.show()

plt.clf()


# Iterate x_t per iteration (one extra entry for the starting point)
x = list(range(0,iters+1))
plt.plot(x, x_var_adam_hist, label="adam", c='b', ls='--')
plt.xlabel("Iterations")
plt.ylabel("$x_t$")
plt.plot(x, x_var_amsgrad_hist, label="amsgrad", c='g')
plt.ticklabel_format(style='sci', axis='x', scilimits=(0,0))
plt.legend(loc='best')
plt.show()

# Final iterates
print(float(x_var_amsgrad.data[0]))
print(float(x_var_adam.data[0]))

    

In [None]:
# Second experiment: both iterates start at x = 1 and both optimizers use
# beta1 = 0, beta2 = 1/(1 + 1010**2) and eps = 0.
# The closures defined earlier are reused; they pick up the rebound
# x_var_* / optimizer globals and keep adding to the running loss totals.
t=1
x_var_adam = Variable(torch.FloatTensor([1]), requires_grad=True)
adam = torch.optim.Adam([x_var_adam], betas=(0,1/(1+1010**2)), lr=l_r(t), eps=0)

x_var_amsgrad = Variable(torch.FloatTensor([1]), requires_grad=True)
amsgrad = Amsgrad([x_var_amsgrad], eps=0, lr=l_r(t), betas=(0,1/(1+1010**2)))

# Plain floats: on PyTorch >= 0.4 `.data[0]` is a 0-dim tensor view that the
# next in-place optimizer update would overwrite inside the history list.
x_var_adam_hist = [float(x_var_adam.data[0])]
x_var_amsgrad_hist = [float(x_var_amsgrad.data[0])]

iters = 6000000


t=1
for i in range(iters):
    amsgrad.zero_grad()
    adam.zero_grad()
    adam.step(closure_adam)
    amsgrad.step(closure_amsgrad)
    # Project the iterates back onto [-1, 1]
    x_var_adam.data = x_var_adam.data.clamp(-1,1)
    x_var_amsgrad.data = x_var_amsgrad.data.clamp(-1,1)
    
    
    x_var_adam_hist.append(float(x_var_adam.data[0]))
    x_var_amsgrad_hist.append(float(x_var_amsgrad.data[0]))
    t+=1
t=1
    
x = list(range(0,iters+1))
plt.clf()
plt.plot(x, x_var_adam_hist, label="adam", c='r')
plt.plot(x, x_var_amsgrad_hist, label="amsgrad", c='b')
plt.legend(loc='best')
plt.show()


    

In [None]:
# Plot window and grid spacing, identical for both axes.
xmin, xmax = -4.5, 4.5
ymin, ymax = -4.5, 4.5
xstep = ystep = .2

# np.arange excludes its stop value, hence the extra step on the upper bound.
_x_ticks = np.arange(xmin, xmax + xstep, xstep)
_y_ticks = np.arange(ymin, ymax + ystep, ystep)
x, y = np.meshgrid(_x_ticks, _y_ticks)

def getMinima(x,y):
    """Return the point ``(x, y)`` as a float column vector of shape ``(2, 1)``."""
    return np.array([float(x), float(y)]).reshape(-1, 1)

In [None]:
# Rebuild the evaluation grid and evaluate the objective on it.
x, y = np.meshgrid(
    np.arange(xmin, xmax + xstep, xstep),
    np.arange(ymin, ymax + ystep, ystep),
)
z = f(x, y)

In [None]:
# Surface plot of the objective with the point (3, 0.5) marked by a red star.
minima = getMinima(3,0.5)
fig = plt.figure(figsize=(8, 5))
ax = plt.axes(projection='3d', elev=50, azim=-50)

surface_style = dict(norm=LogNorm(), rstride=1, cstride=1,
                     edgecolor='none', alpha=.8, cmap=plt.cm.jet)
ax.plot_surface(x, y, z, **surface_style)
ax.plot(*minima, f(*minima), 'r*', markersize=10)

ax.set(xlabel='$x$', ylabel='$y$', zlabel='$z$')
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))

plt.show()

In [None]:
# Starting point (x, y) used to initialise the parameter tensor below.
x0 = np.array([3., 4.])

In [None]:
# Parameter tensor initialised at x0, and the Adam optimizer that updates it.
_start = torch.FloatTensor(x0)
w_adam = Variable(_start, requires_grad=True)
adam = torch.optim.Adam([w_adam], lr=1e-3)

In [None]:
def var_ft(x,y):
    """Stochastic 2-D linear loss ``c*x + c*y``.

    ``c`` is sampled on every call: 1010 with probability 0.01, otherwise -10.
    """
    slope = float(np.random.choice([1010, -10], p=[0.01, 0.99]))
    c = Variable(torch.FloatTensor(np.array([slope])))
    return c*x + c*y

def beales_var(var):
    """Beale's function on a 2-element tensor ``var = (x, y)``.

    Matches the NumPy version ``f`` used for plotting, so the optimizers
    descend the same surface that is drawn: the second and third terms use
    ``x * y**2`` and ``x * y**3`` (not ``(x*y)**2`` and ``(x*y)**3``).
    """
    x = var[0]
    y = var[1]

    return ((1.5 - x + x*y).pow(2)
            + (2.25 - x + x*y.pow(2)).pow(2)
            + (2.625 - x + x*y.pow(3)).pow(2))
#     return var_ft(x,y)

In [None]:
def get_parameterized_closure(optimizer,f,w):
    """Build a zero-argument closure suitable for ``optimizer.step(closure)``.

    Each call clears the optimizer's gradients, evaluates ``f(w)``,
    back-propagates it and returns the loss.
    """
    def closure():
        optimizer.zero_grad()
        value = f(w)
        value.backward()
        return value

    return closure

In [None]:
def optimize(optim, f, w, steps, length):
    """Run ``steps`` optimizer steps on ``f(w)`` and return a subsampled trajectory.

    Args:
        optim: optimizer whose ``step(closure)`` updates ``w``.
        f: objective called as ``f(w)``; its result is back-propagated.
        w: parameter tensor; its first two entries are recorded.
        steps: number of optimizer steps to take.
        length: number of trajectory points to return.

    Returns:
        ``(length, 2)`` float array holding the position of ``w`` after a step,
        taken at ``length`` evenly spaced step indices from the first to the
        last step.
    """
    closure = get_parameterized_closure(optim,f,w)
    path = np.empty((steps, 2), dtype=float)
    for step in range(steps):
        optim.step(closure)
        # Index the tensor data directly and convert to float: `w[0].data[0]`
        # raises on PyTorch >= 0.4, where `w[0]` is already 0-dimensional.
        path[step, 0] = float(w.data[0])
        path[step, 1] = float(w.data[1])
    keep = np.linspace(0, steps - 1, num=length, dtype=int)
    return path[keep]

In [None]:
# 10000 Adam steps on beales_var, subsampled to 100 points; the transpose puts
# the x coordinates in row 0 and the y coordinates in row 1.
path = optimize(adam,beales_var,w_adam,10000, 100).T

In [None]:
# Contour map of the objective with the sampled trajectory drawn as arrows.
fig, ax = plt.subplots(figsize=(10, 6))

contour_levels = np.logspace(0, 5, 35)
ax.contour(x, y, z, levels=contour_levels, norm=LogNorm(), cmap=plt.cm.jet)

# One arrow per consecutive pair of trajectory samples.
starts = path[:, :-1]
deltas = path[:, 1:] - starts
ax.quiver(starts[0], starts[1], deltas[0], deltas[1],
          scale_units='xy', angles='xy', scale=1, color='k')
ax.plot(*minima, 'r*', markersize=18)

ax.set(xlabel='$x$', ylabel='$y$')
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))
plt.show()

In [None]:
# Static backdrop for the 2-D trajectory animation: contours plus the marked minimum.
fig, ax = plt.subplots(figsize=(10, 6))

ax.contour(x, y, z, levels=np.logspace(0, 5, 35), norm=LogNorm(), cmap=plt.cm.jet)
ax.plot(*minima, 'r*', markersize=18)

# Empty artists that init()/animate() fill in frame by frame. The animated
# `path` was produced with the Adam optimizer, so the legend says so.
line, = ax.plot([], [], 'b', label='adam', lw=2)
point, = ax.plot([], [], 'bo')

ax.set_xlabel('$x$')
ax.set_ylabel('$y$')

ax.set_xlim((xmin, xmax))
ax.set_ylim((ymin, ymax))

ax.legend(loc='upper left')

In [None]:
def init():
    """Blank both animated artists and return them (``init_func`` of the 2-D animation)."""
    for artist in (line, point):
        artist.set_data([], [])
    return line, point

In [None]:
def animate(i):
    """Draw the first ``i`` trajectory samples and put the marker on the latest one."""
    trail = path[:, :i]
    head = path[:, i-1:i]
    line.set_data(*trail)
    point.set_data(*head)
    return line, point

In [None]:
# One frame per stored trajectory sample.
n_frames = path.shape[1]
anim = animation.FuncAnimation(
    fig, animate,
    init_func=init,
    frames=n_frames,
    interval=60,
    repeat_delay=5,
    blit=True,
)

In [None]:
# Encode the animation as an HTML5 video and display it inline.
HTML(anim.to_html5_video())

In [None]:
# Static backdrop for the 3-D trajectory animation: surface plus the marked minimum.
fig_3d = plt.figure(figsize=(8, 5))
ax = plt.axes(projection='3d', elev=50, azim=-50)

ax.plot_surface(x, y, z, norm=LogNorm(), rstride=1, cstride=1, edgecolor='none', alpha=.8, cmap=plt.cm.jet)
ax.plot(*minima, f(*minima), 'r*', markersize=10)

# Empty artists that init_3d()/animate_3d() fill in frame by frame. The
# animated `path` was produced with the Adam optimizer, so it is labelled as such.
line_3d, = ax.plot([], [], [], 'b', label='adam', lw=2)
point_3d, = ax.plot([], [], [], 'bo')

ax.set_xlabel('$x$')
ax.set_ylabel('$y$')
ax.set_zlabel('$z$')

ax.set_xlim((xmin, xmax))
ax.set_ylim((ymin, ymax))

In [None]:
def init_3d():
    """Blank the 3-D line and marker and return them (``init_func`` of the 3-D animation)."""
    line_3d.set_data([], [])
    line_3d.set_3d_properties([])
    point_3d.set_data([], [])
    point_3d.set_3d_properties([])
    # Return the artists that were actually modified (the 3-D ones, not the
    # 2-D `line`/`point`); with blit=True only returned artists are redrawn.
    return line_3d, point_3d

In [None]:
def animate_3d(i):
    """Draw the first ``i`` trajectory samples on the surface, marker on the latest one.

    The z coordinates are obtained by evaluating the objective ``f`` on the
    corresponding x/y samples of ``path``.
    """
    line_3d.set_data(path[0,:i], path[1,:i])
    line_3d.set_3d_properties(f(*path[::,:i]))
    point_3d.set_data(path[0,i-1:i], path[1,i-1:i])
    point_3d.set_3d_properties(f(*path[::,i-1:i]))
    # Return the artists that were actually modified (the 3-D ones, not the
    # 2-D `line`/`point`); with blit=True only returned artists are redrawn.
    return line_3d, point_3d

In [None]:
# One frame per stored trajectory sample.
n_frames = path.shape[1]
anim = animation.FuncAnimation(
    fig_3d, animate_3d,
    init_func=init_3d,
    frames=n_frames,
    interval=60,
    repeat_delay=5,
    blit=True,
)

In [None]:
# Encode the animation as an HTML5 video and display it inline.
HTML(anim.to_html5_video())

In [None]:
class TrajectoryAnimation(animation.FuncAnimation):
    """Animate several 2-D optimisation trajectories on one axes.

    Each positional argument is a path array whose two rows hold the x and y
    coordinates. Frame ``i`` shows the first ``i`` columns of every path with a
    marker on the most recent one.
    """

    def __init__(self, *paths, labels=[], fig=None, ax=None, frames=None,
                 interval=60, repeat_delay=5, blit=True, **kwargs):
        # Resolve whichever of fig / ax was not supplied.
        if fig is None and ax is None:
            fig, ax = plt.subplots()
        elif fig is None:
            fig = ax.get_figure()
        elif ax is None:
            ax = fig.gca()

        self.fig = fig
        self.ax = ax
        self.paths = paths

        if frames is None:
            # Run for as many frames as the longest path has columns.
            frames = max(path.shape[1] for path in paths)

        # One line per path; zip_longest yields label=None where labels runs short.
        self.lines = []
        for _, label in zip_longest(paths, labels):
            self.lines.append(ax.plot([], [], label=label, lw=2)[0])

        # One marker per line, drawn in that line's colour.
        self.points = []
        for line in self.lines:
            self.points.append(ax.plot([], [], 'o', color=line.get_color())[0])

        super().__init__(fig, self.animate, init_func=self.init_anim,
                         frames=frames, interval=interval, blit=blit,
                         repeat_delay=repeat_delay, **kwargs)

    def init_anim(self):
        """Clear every line and marker; used as the animation's ``init_func``."""
        artists = self.lines + self.points
        for artist in artists:
            artist.set_data([], [])
        return artists

    def animate(self, i):
        """Show the first ``i`` samples of each path with a marker on the newest."""
        for line, point, path in zip(self.lines, self.points, self.paths):
            trail = path[:, :i]
            head = path[:, i-1:i]
            line.set_data(*trail)
            point.set_data(*head)
        return self.lines + self.points

In [None]:
class TrajectoryAnimation3D(animation.FuncAnimation):
    """Animate several optimisation trajectories as lines in a 3-D axes.

    Args:
        *paths: arrays whose two rows hold the x and y coordinates of a trajectory.
        zpaths: one sequence of z values per path (keyword-only, required).
        labels: legend labels, matched to ``paths`` by position.
        fig, ax: target figure / axes; a new figure with a 3-D axes is created
            when both are omitted.
        frames: number of frames; defaults to the largest column count among ``paths``.
        interval, repeat_delay, blit, **kwargs: forwarded to ``FuncAnimation``.
    """

    def __init__(self, *paths, zpaths, labels=[], fig=None, ax=None, frames=None,
                 interval=60, repeat_delay=5, blit=True, **kwargs):

        if fig is None:
            if ax is None:
                # The lines below are created with (x, y, z) data and updated
                # via set_3d_properties, so the default axes must be 3-D
                # (plt.subplots() would create a 2-D axes).
                fig = plt.figure()
                ax = fig.add_subplot(111, projection='3d')
            else:
                fig = ax.get_figure()
        else:
            if ax is None:
                # NOTE(review): assumes the figure's current axes is already 3-D.
                ax = fig.gca()

        self.fig = fig
        self.ax = ax

        self.paths = paths
        self.zpaths = zpaths

        if frames is None:
            frames = max(path.shape[1] for path in paths)

        # One 3-D line per path; zip_longest yields label=None where labels runs short.
        self.lines = [ax.plot([], [], [], label=label, lw=2)[0]
                      for _, label in zip_longest(paths, labels)]

        super(TrajectoryAnimation3D, self).__init__(fig, self.animate, init_func=self.init_anim,
                                                  frames=frames, interval=interval, blit=blit,
                                                  repeat_delay=repeat_delay, **kwargs)

    def init_anim(self):
        """Clear every line; used as the animation's ``init_func``."""
        for line in self.lines:
            line.set_data([], [])
            line.set_3d_properties([])
        return self.lines

    def animate(self, i):
        """Show the first ``i`` samples of each path together with their z values."""
        for line, path, zpath in zip(self.lines, self.paths, self.zpaths):
            line.set_data(*path[::,:i])
            line.set_3d_properties(zpath[:i])
        return self.lines

In [None]:
from amsgrad import Amsgrad

# Every optimizer starts on Beale's function from the same point.
x0 = np.array([-3., 4.])

# Step size shared by plain SGD and Nesterov SGD.
sgd_lr = 1e-9


def _fresh_param():
    """Return a new trainable tensor initialised at x0."""
    return Variable(torch.FloatTensor(x0), requires_grad=True)


w_amsgrad = _fresh_param()
amsgrad = Amsgrad([w_amsgrad])

w_adam = _fresh_param()
adam = torch.optim.Adam([w_adam], lr=0.01)

w_adadelta = _fresh_param()
adadelta = torch.optim.Adadelta([w_adadelta])

w_adagrad = _fresh_param()
adagrad = torch.optim.Adagrad([w_adagrad], lr=1)

w_rmsprop = _fresh_param()
rmsprop = torch.optim.RMSprop([w_rmsprop])

w_sgd = _fresh_param()
sgd = torch.optim.SGD([w_sgd], lr=sgd_lr)

# NOTE(review): momentum SGD uses 1e-11 rather than sgd_lr -- confirm this is deliberate.
w_sgdm = _fresh_param()
sgdm = torch.optim.SGD([w_sgdm], lr=1e-11, momentum=0.9)

w_nesterov = _fresh_param()
nesterov = torch.optim.SGD([w_nesterov], lr=sgd_lr, momentum=0.9, nesterov=True)

# name -> [optimizer, objective, parameter tensor], in the order they are run.
algos = {
    'amsgrad': [amsgrad, beales_var, w_amsgrad],
    'adam': [adam, beales_var, w_adam],
    'adadelta': [adadelta, beales_var, w_adadelta],
    'adagrad': [adagrad, beales_var, w_adagrad],
    'rmsprop': [rmsprop, beales_var, w_rmsprop],
    'sgd': [sgd, beales_var, w_sgd],
    'sgd_momentum': [sgdm, beales_var, w_sgdm],
    'sgd_nesterov': [nesterov, beales_var, w_nesterov],
}


In [None]:
# Run every registered optimizer for 50000 steps, keeping 200 samples of each
# trajectory together with the objective values along it.
paths = []
methods = []
zpaths = []

for key, (optimizer, objective, param) in algos.items():
    path = optimize(optimizer, objective, param, 50000, 200).T
    paths.append(path)
    zpaths.append(f(*path))
    methods.append(key)
    

In [None]:
# Inspect where the plain-SGD parameter ended up after the run above.
print(w_sgd.data)

In [None]:
# Contour backdrop with the minimum marked; all trajectories are animated on top.
fig, ax = plt.subplots(figsize=(10, 6))

contour_levels = np.logspace(0, 5, 35)
ax.contour(x, y, z, levels=contour_levels, norm=LogNorm(), cmap=plt.cm.jet)
ax.plot(*minima, 'r*', markersize=10)

ax.set(xlabel='$x$', ylabel='$y$')
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))

anim = TrajectoryAnimation(*paths, labels=methods, ax=ax)

ax.legend(loc='upper left')

In [None]:
# Encode the animation as an HTML5 video and display it inline.
HTML(anim.to_html5_video())

In [None]:
# Surface backdrop with the minimum marked; all trajectories are animated on top.
fig = plt.figure(figsize=(8, 5))
ax = plt.axes(projection='3d', elev=50, azim=-50)

surface_style = dict(norm=LogNorm(), rstride=1, cstride=1,
                     edgecolor='none', alpha=.8, cmap=plt.cm.jet)
ax.plot_surface(x, y, z, **surface_style)
ax.plot(*minima, f(*minima), 'r*', markersize=10)

ax.set(xlabel='$x$', ylabel='$y$', zlabel='$z$')
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))

anim = TrajectoryAnimation3D(*paths, zpaths=zpaths, labels=methods, ax=ax)

ax.legend(loc='upper left')

In [None]:
# Encode the animation as an HTML5 video and display it inline.
HTML(anim.to_html5_video())

Lévi function (kept for reference; the cells below optimise the Rosenbrock function instead):

$$f(x,y)=\sin^{2}(3\pi x)+\left(x-1\right)^{2}\left(1+\sin^{2}(3\pi y)\right)+\left(y-1\right)^{2}\left(1+\sin^{2}(2\pi y)\right)$$

In [None]:
# Rosenbrock Function

pi = np.pi
# Lévi function 
# f  = lambda x, y: (np.sin(3*pi*x))**2 + ((x-1)**2)*(1+np.sin(3*pi*y)**2)+((y-1)**2)*(1+np.sin(2*pi*y)**2)
def f(x, y):
    """Evaluate the Rosenbrock function at ``(x, y)``; also accepts NumPy arrays."""
    valley = y - x**2
    return (1-x)**2 + 100*valley**2
def f_var(var):
    """Rosenbrock function on a 2-element tensor ``var = (x, y)``."""
    x, y = var[0], var[1]
    offset = (1-x).pow(2)
    valley = (y-x.pow(2)).pow(2)
    return offset + 100*valley
#     return (3*pi*x).sin().pow(2) + ((x-1).pow(2))*(1+(3*pi*y).sin().pow(2))+((y-1).pow(2))*(1+(2*pi*y).sin().pow(2))

In [None]:
# Sanity check: display the objective value at (1, 1).
f(1,1)

In [None]:
# Plot window [-10, 10] on both axes with a grid spacing of 0.4.
xmin, xmax, xstep = -10, 10, .4
ymin, ymax, ystep = -10, 10, .4

In [None]:
# Rebuild the evaluation grid for the new bounds and evaluate the objective on it.
x, y = np.meshgrid(
    np.arange(xmin, xmax + xstep, xstep),
    np.arange(ymin, ymax + ystep, ystep),
)
z = f(x, y)

In [None]:
# Mark (1, 1) as the minimum; the bare f(1,1) displays the objective value there.
minima = getMinima(1,1)
f(1,1)

In [None]:
# rand = lambda : np.add(np.random.rand(1,2)*16,-8*np.ones((1,2)))[0]
def rand():
    """Return the fixed starting point ``[-4., 3.]`` as a new array."""
    return np.array([-4., 3.])

In [None]:
# Display the first coordinate of the starting point returned by rand().
rand()[0]

In [None]:
from amsgrad import Amsgrad

# x0 = np.array([-3., 4.])
x0 = np.array([-4., 3.])

# Step size shared by plain SGD and Nesterov SGD.
sgd_lr = 1e-9


def _fresh_param():
    """Return a new trainable tensor initialised at the point given by rand()."""
    return Variable(torch.FloatTensor(rand()), requires_grad=True)


w_amsgrad = _fresh_param()
amsgrad = Amsgrad([w_amsgrad])

w_adam = _fresh_param()
adam = torch.optim.Adam([w_adam], lr=0.01)

w_adadelta = _fresh_param()
adadelta = torch.optim.Adadelta([w_adadelta])

w_adagrad = _fresh_param()
adagrad = torch.optim.Adagrad([w_adagrad], lr=1)

w_rmsprop = _fresh_param()
rmsprop = torch.optim.RMSprop([w_rmsprop])

w_sgd = _fresh_param()
sgd = torch.optim.SGD([w_sgd], lr=sgd_lr)

# NOTE(review): momentum SGD uses 1e-11 rather than sgd_lr -- confirm this is deliberate.
w_sgdm = _fresh_param()
sgdm = torch.optim.SGD([w_sgdm], lr=1e-11, momentum=0.9)

w_nesterov = _fresh_param()
nesterov = torch.optim.SGD([w_nesterov], lr=sgd_lr, momentum=0.9, nesterov=True)

# name -> [optimizer, objective, parameter tensor], in the order they are run.
algos = {
    'amsgrad': [amsgrad, f_var, w_amsgrad],
    'adam': [adam, f_var, w_adam],
    'adadelta': [adadelta, f_var, w_adadelta],
    'adagrad': [adagrad, f_var, w_adagrad],
    'rmsprop': [rmsprop, f_var, w_rmsprop],
    'sgd': [sgd, f_var, w_sgd],
    'sgd_momentum': [sgdm, f_var, w_sgdm],
    'sgd_nesterov': [nesterov, f_var, w_nesterov],
}


In [None]:
# Run every registered optimizer for 50000 steps, keeping 100 samples of each
# trajectory together with the objective values along it.
paths = []
methods = []
zpaths = []

for key, (optimizer, objective, param) in algos.items():
    path = optimize(optimizer, objective, param, 50000, 100).T
    paths.append(path)
    zpaths.append(f(*path))
    methods.append(key)
    

In [None]:
# Contour backdrop with the minimum marked; all trajectories are animated on top.
fig, ax = plt.subplots(figsize=(10, 6))

contour_levels = np.logspace(0, 5, 35)
ax.contour(x, y, z, levels=contour_levels, norm=LogNorm(), cmap=plt.cm.jet)
ax.plot(*minima, 'r*', markersize=10)

ax.set(xlabel='$x$', ylabel='$y$')
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))

anim = TrajectoryAnimation(*paths, labels=methods, ax=ax)

ax.legend(loc='upper left')

In [None]:
# Encode the animation as an HTML5 video and display it inline.
HTML(anim.to_html5_video())

In [None]:
# Surface backdrop with the minimum marked; all trajectories are animated on top.
fig = plt.figure(figsize=(8, 5))
ax = plt.axes(projection='3d', elev=50, azim=-50)

surface_style = dict(norm=LogNorm(), rstride=1, cstride=1,
                     edgecolor='none', alpha=.8, cmap=plt.cm.jet)
ax.plot_surface(x, y, z, **surface_style)
ax.plot(*minima, f(*minima), 'r*', markersize=10)

ax.set(xlabel='$x$', ylabel='$y$', zlabel='$z$')
ax.set(xlim=(xmin, xmax), ylim=(ymin, ymax))

anim = TrajectoryAnimation3D(*paths, zpaths=zpaths, labels=methods, ax=ax)

ax.legend(loc='upper left')

In [None]:
# Encode the animation as an HTML5 video and display it inline.
HTML(anim.to_html5_video())