<a href="https://colab.research.google.com/github/hjung31/park/blob/main/Assignment3_2021711420_%EB%B0%95%ED%98%84%EC%A0%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install d2l -q
from d2l import tensorflow as d2l

import numpy as np
import tensorflow as tf
import math
import plotly.graph_objects as go

In [None]:
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Plot data points.

    Defined in :numref:`sec_calculus`

    Args:
        X: x-values. A single 1-axis sequence, or a list of such sequences
            (one per curve). When `Y` is None, `X` is treated as the
            y-values and each curve is plotted against its index.
        Y: y-values, a single 1-axis sequence or a list of them.
        xlabel, ylabel, xlim, ylim, xscale, yscale, legend: forwarded
            unchanged to `d2l.set_axes`.
        fmts: matplotlib format strings, consumed one per curve; curves
            beyond `len(fmts)` are dropped by the `zip` below.
        figsize: accepted for signature compatibility; it is not applied
            because the figure-size call is disabled in this notebook.
        axes: axes to draw on; defaults to `d2l.plt.gca()`.
    """
    # A `[]` default would be one list object shared by every call.
    if legend is None:
        legend = []

    def has_one_axis(X):  # True if `X` (tensor or list) has 1 axis
        return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list)
                and not hasattr(X[0], "__len__"))

    if has_one_axis(X):
        X = [X]
    if Y is None:
        # Only y-values were given: pair each curve with an empty x-list.
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        # Reuse the x-values for every curve.
        X = X * len(Y)

    if axes is None:
        axes = d2l.plt.gca()
    axes.cla()
    for x, y, fmt in zip(X, Y, fmts):
        if len(x):
            axes.plot(x, y, fmt)
        else:
            axes.plot(y, fmt)
    # `set_axes` is not defined in this notebook; use the helper from d2l.
    d2l.set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)

In [None]:
def show_trace(results, f):
    """Plot `f` over a symmetric interval together with the visited points.

    Args:
        results: sequence of scalar x-values visited by an optimizer.
        f: callable evaluated on one value at a time.
    """
    lowest, highest = min(results), max(results)
    # Symmetric range wide enough to contain every visited point.
    bound = max(abs(lowest), abs(highest))
    grid = tf.range(-bound, bound, 0.01)

    curve = go.Scatter(x=grid, y=[f(point) for point in grid])
    path = go.Scatter(x=results, y=[f(point) for point in results])

    fig = go.Figure()
    for trace in (curve, path):
        fig.add_trace(trace)

    layout = dict(template='simple_white',
                  xaxis_title='x',
                  yaxis_title='f(x)',
                  showlegend=False)
    fig.update_layout(**layout)

    fig.show()

In [None]:
def train_2d(trainer, steps=20, f_grad=None):
    """Optimize a 2D objective function with a customized trainer.

    Args:
        trainer: callable `(x1, x2, s1, s2[, f_grad])` returning the updated
            `(x1, x2, s1, s2)`.
        steps: number of trainer calls to make.
        f_grad: optional extra argument; when truthy it is passed to
            `trainer` as a fifth positional argument.

    Returns:
        List of `(x1, x2)` pairs: the start point followed by the point
        after each step (length `steps + 1`).
    """
    # `s1` and `s2` are internal state variables that will be used later
    x1, x2, s1, s2 = -5, -2, 0, 0
    results = [(x1, x2)]
    # Pre-bind the counter so the report below also works when no step runs
    # (the loop variable would otherwise be unbound for steps == 0).
    epoch = 0
    for epoch in range(1, steps + 1):
        if f_grad:
            x1, x2, s1, s2 = trainer(x1, x2, s1, s2, f_grad)
        else:
            x1, x2, s1, s2 = trainer(x1, x2, s1, s2)
        results.append((x1, x2))
    print(f'epoch {epoch}, x1: {float(x1):f}, x2: {float(x2):f}')
    return results

def show_trace_2d(f, results):
    """Show the trace of 2D variables during optimization.

    Draws the visited `(x1, x2)` points as a line with markers on top of
    contour lines of `f` over a fixed grid.

    Args:
        f: callable `f(x1, x2)` evaluated on the meshgrid tensors.
        results: sequence of `(x1, x2)` pairs; each coordinate is either a
            plain number or a tensor whose first element is used.
    """
    x1_axis = tf.range(-5.5, 1.0, 0.1)
    x2_axis = tf.range(-3.0, 1.0, 0.1)
    x1, x2 = tf.meshgrid(x1_axis, x2_axis)

    def as_scalar(value):
        # Tensors are unwrapped to their first element; numbers pass through.
        if tf.is_tensor(value):
            return value.numpy()[0]
        return value

    path_x1 = [as_scalar(point[0]) for point in results]
    path_x2 = [as_scalar(point[1]) for point in results]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=path_x1,
                             y=path_x2,
                             mode='lines+markers',
                             marker_color='#ff7f0e'))
    fig.add_trace(go.Contour(z=f(x1, x2),
                             x=x1_axis,
                             y=x2_axis,
                             contours_coloring='lines'))

    layout = dict(template='simple_white',
                  xaxis_title='x1',
                  yaxis_title='x2',
                  showlegend=False)
    fig.update_layout(**layout)

    fig.show()

In [None]:
def train_ch11(trainer_fn, states, hyperparams, data_iter,
               feature_dim, num_epochs=2):
    """Train a linear-regression model with a hand-written optimizer.

    Args:
        trainer_fn: callable `(params, grads, states, hyperparams)` that
            updates the parameters in place.
        states: optimizer state passed through to `trainer_fn` unchanged.
        hyperparams: dict passed through to `trainer_fn` unchanged.
        data_iter: iterable of `(X, y)` minibatches; also passed to
            `tf.data.experimental.cardinality` and `d2l.evaluate_loss`.
        feature_dim: number of input features (rows of the weight matrix).
        num_epochs: number of passes over `data_iter`.

    Returns:
        `(timer.cumsum(), losses)` where `losses` holds the loss recorded
        at each evaluation checkpoint.
    """
    # Initialization
    w = tf.Variable(tf.random.normal(shape=(feature_dim, 1),
                                     mean=0, stddev=0.01), trainable=True)
    b = tf.Variable(tf.zeros(1), trainable=True)

    # Train
    net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss

    n, timer = 0, d2l.Timer()

    # Each entry is (fractional epoch, (loss,)); `animator_plotly` reads
    # the loss as entry[1][0].
    vals = []
    for _ in range(num_epochs):
        for X, y in data_iter:
            with tf.GradientTape() as g:
                l = tf.math.reduce_mean(loss(net(X), y))

            dw, db = g.gradient(l, [w, b])
            trainer_fn([w, b], [dw, db], states, hyperparams)
            n += X.shape[0]
            if n % 200 == 0:
                # Pause the timer so evaluation time is not counted.
                timer.stop()
                batches_seen = n / X.shape[0]
                epoch = batches_seen / tf.data.experimental.cardinality(
                    data_iter).numpy()
                vals.append(
                    (epoch, (d2l.evaluate_loss(net, data_iter, loss),)))
                timer.start()

    if not vals:
        # No checkpoint fired (the running example count never hit a
        # multiple of 200). Record one final measurement so the summary
        # below has a loss to report instead of `min()` raising on an
        # empty list.
        timer.stop()
        vals.append(
            (num_epochs, (d2l.evaluate_loss(net, data_iter, loss),)))

    losses = [entry[1][0] for entry in vals]
    print(f'loss: {min(losses):.3f}, {timer.avg():.3f} sec/epoch')
    animator_plotly(vals)
    return timer.cumsum(), losses

In [None]:
def train_sgd(lr, batch_size, num_epochs=2):
    """Run `train_ch11` with the `sgd` trainer at the given learning rate.

    Args:
        lr: learning rate, passed to the trainer as `hyperparams['lr']`.
        batch_size: minibatch size requested from `d2l.get_data_ch11`.
        num_epochs: number of passes over the data.

    Returns:
        Whatever `train_ch11` returns.
    """
    # Use the d2l loader, as every other cell in this notebook does; a bare
    # `get_data_ch11` is not defined here.
    data_iter, feature_dim = d2l.get_data_ch11(batch_size)
    # NOTE(review): `sgd` is not defined in the visible part of this
    # notebook; it must be a trainer with the signature
    # (params, grads, states, hyperparams) -- confirm it exists before use.
    return train_ch11(
        sgd, None, {'lr': lr}, data_iter, feature_dim, num_epochs)

In [None]:
def train_concise_ch11(trainer_fn, hyperparams, data_iter, num_epochs=2):
    """Train a one-unit Keras dense layer with a built-in optimizer.

    Args:
        trainer_fn: optimizer class or factory, called as
            `trainer_fn(**hyperparams)`.
        hyperparams: keyword arguments for `trainer_fn`.
        data_iter: iterable of `(X, y)` minibatches; also passed to
            `tf.data.experimental.cardinality` and `d2l.evaluate_loss`.
        num_epochs: number of passes over `data_iter`.

    The recorded losses are plotted with `animator_plotly`; nothing is
    returned.
    """
    # Initialization
    net = tf.keras.Sequential()
    net.add(tf.keras.layers.Dense(
        1, kernel_initializer=tf.random_normal_initializer(stddev=0.01)))
    optimizer = trainer_fn(**hyperparams)
    loss = tf.keras.losses.MeanSquaredError()

    n, timer = 0, d2l.Timer()
    # Each entry is (fractional epoch, (loss,)), as `animator_plotly` expects.
    vals = []
    for _ in range(num_epochs):
        for X, y in data_iter:
            with tf.GradientTape() as g:
                out = net(X)
                l = loss(y, out)
            # Differentiate after leaving the tape context so the backward
            # pass itself is not recorded (same pattern as `train_ch11`).
            params = net.trainable_variables
            grads = g.gradient(l, params)
            optimizer.apply_gradients(zip(grads, params))
            n += X.shape[0]
            if n % 200 == 0:
                # Pause the timer so evaluation time is not counted.
                timer.stop()
                p = n / X.shape[0]
                q = p / tf.data.experimental.cardinality(data_iter).numpy()
                # `MeanSquaredError` computes squared error without the 1/2
                # factor
                r = (d2l.evaluate_loss(net, data_iter, loss) / 2,)
                timer.start()
                vals.append((q, r))

    animator_plotly(vals)

In [None]:
def animator_plotly(vals):
    """Plot recorded losses against epoch.

    Args:
        vals: sequence of `(epoch, (loss, ...))` entries; only the first
            element of each inner tuple is plotted.
    """
    epochs = [entry[0] for entry in vals]
    losses = [entry[1][0] for entry in vals]

    fig = go.Figure()
    loss_curve = go.Scatter(x=epochs, y=losses)
    fig.add_trace(loss_curve)

    layout = dict(template='simple_white',
                  xaxis_title='epoch',
                  yaxis_title='loss')
    fig.update_layout(**layout)
    fig.show()

## Adagrad

In [None]:
def adagrad_2d(x1, x2, s1, s2):
    """Take one Adagrad step on f(x1, x2) = 0.1 * x1**2 + 2 * x2**2.

    Reads the step size from the module-level name `eta`.

    Args:
        x1, x2: current coordinates.
        s1, s2: accumulated squared gradients for each coordinate.

    Returns:
        The updated `(x1, x2, s1, s2)`.
    """
    eps = 1e-6
    # Analytic partial derivatives of the objective.
    grad1 = 0.2 * x1
    grad2 = 4 * x2
    # Accumulate squared gradients.
    s1 += grad1 ** 2
    s2 += grad2 ** 2
    # Per-coordinate step size shrinks as the accumulator grows.
    rate1 = eta / math.sqrt(s1 + eps)
    x1 -= rate1 * grad1
    rate2 = eta / math.sqrt(s2 + eps)
    x2 -= rate2 * grad2
    return x1, x2, s1, s2

def f_2d(x1, x2):
    """Return the quadratic objective 0.1 * x1**2 + 2 * x2**2."""
    x1_term = 0.1 * x1 ** 2
    x2_term = 2 * x2 ** 2
    return x1_term + x2_term

def init_adagrad_states(feature_dim):
    """Create zero-initialised Adagrad accumulators.

    Returns:
        A pair of `tf.Variable`s: one of shape `(feature_dim, 1)` and one
        of shape `(1,)`, in that order.
    """
    weight_accumulator = tf.Variable(tf.zeros((feature_dim, 1)))
    bias_accumulator = tf.Variable(tf.zeros(1))
    return weight_accumulator, bias_accumulator

def adagrad(params, grads, states, hyperparams):
    """Apply one Adagrad update to every parameter in place.

    Args:
        params: variables to update.
        grads: gradients, aligned with `params`.
        states: squared-gradient accumulators, aligned with `params`;
            updated in place.
        hyperparams: dict providing the learning rate under `'lr'`.
    """
    eps = 1e-6
    for param, state, grad in zip(params, states, grads):
        # Accumulate the squared gradient first; the step below uses the
        # updated accumulator.
        accumulated = state + tf.math.square(grad)
        state[:].assign(accumulated)
        step = hyperparams['lr'] * grad / tf.math.sqrt(state + eps)
        param[:].assign(param - step)

# Step size read as a module-level name by `adagrad_2d`.
eta = 0.2
# Run the default 20 steps of `adagrad_2d` and draw the trajectory over
# the contour lines of `f_2d`.
show_trace_2d(f_2d, train_2d(adagrad_2d))

epoch 20, x1: -3.578139, x2: -0.753152


In [None]:
# Fetch the data iterator and feature count from d2l with minibatches of 10,
# then train with the from-scratch `adagrad` at lr=0.1.
# The trailing semicolon keeps the notebook from echoing the return value.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adagrad, init_adagrad_states(feature_dim),
               {'lr': 0.1}, data_iter, feature_dim);

loss: 0.242, 0.192 sec/epoch


In [None]:
# Repeat the 2D Adagrad trace with a larger step size (`eta` is read as a
# module-level name by `adagrad_2d`).
eta = 0.4
show_trace_2d(f_2d, train_2d(adagrad_2d))

epoch 20, x1: -2.382563, x2: -0.158591


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `adagrad`.
# NOTE(review): lr is still 0.1 here; `eta` is only read by `adagrad_2d`,
# not by `adagrad` -- confirm whether a different lr was intended.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adagrad, init_adagrad_states(feature_dim),
               {'lr': 0.1}, data_iter, feature_dim);

loss: 0.243, 0.305 sec/epoch


In [None]:
# Repeat the 2D Adagrad trace with eta = 0.6 (`eta` is read as a
# module-level name by `adagrad_2d`).
eta = 0.6
show_trace_2d(f_2d, train_2d(adagrad_2d))

epoch 20, x1: -1.452966, x2: -0.016979


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `adagrad`.
# NOTE(review): lr is still 0.1 here; `eta` is only read by `adagrad_2d`,
# not by `adagrad` -- confirm whether a different lr was intended.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adagrad, init_adagrad_states(feature_dim),
               {'lr': 0.1}, data_iter, feature_dim);

loss: 0.243, 0.302 sec/epoch


In [None]:
# Repeat the 2D Adagrad trace with eta = 0.8 (`eta` is read as a
# module-level name by `adagrad_2d`).
eta = 0.8
show_trace_2d(f_2d, train_2d(adagrad_2d))

epoch 20, x1: -0.801064, x2: -0.000918


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `adagrad`.
# NOTE(review): lr is still 0.1 here; `eta` is only read by `adagrad_2d`,
# not by `adagrad` -- confirm whether a different lr was intended.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adagrad, init_adagrad_states(feature_dim),
               {'lr': 0.1}, data_iter, feature_dim);

loss: 0.242, 0.206 sec/epoch


In [None]:
# Repeat the 2D Adagrad trace with a much larger step size, eta = 2
# (`eta` is read as a module-level name by `adagrad_2d`).
eta = 2
show_trace_2d(f_2d, train_2d(adagrad_2d))

epoch 20, x1: -0.002295, x2: -0.000000


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `adagrad`.
# NOTE(review): lr is still 0.1 here; `eta` is only read by `adagrad_2d`,
# not by `adagrad` -- confirm whether a different lr was intended.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adagrad, init_adagrad_states(feature_dim),
               {'lr': 0.1}, data_iter, feature_dim);

loss: 0.242, 0.094 sec/epoch


In [None]:
# Train with the Keras Adagrad optimizer at learning_rate=0.4.
# `data_iter` is the module-level iterator left over from the previous
# data-loading cell.
trainer = tf.keras.optimizers.Adagrad
train_concise_ch11(trainer, {'learning_rate' : 0.4}, data_iter)

## RMSprop

In [None]:
# Plot the weight (1 - gamma) * gamma**t that each decay factor gives to an
# observation t steps in the past, for t = 0..39.
gammas = [0.95, 0.9, 0.8, 0.7]
fig = go.Figure()
# The time axis is the same for every curve, so build it once.
x = list(tf.range(40).numpy())
# NOTE: `gamma` stays bound at module level after this loop; `rmsprop_2d`
# reads a module-level `gamma`, which later cells reassign before use.
for gamma in gammas:
    fig.add_trace(go.Scatter(x=x, y=[(1-gamma) * gamma ** p for p in x], name=f'gamma = {gamma:.2f}'))
fig.update_layout(
    xaxis_title='time',
    template='simple_white')
fig.show()

In [None]:
def rmsprop_2d(x1, x2, s1, s2):
    """Take one RMSProp step on f(x1, x2) = 0.1 * x1**2 + 2 * x2**2.

    Reads the step size `eta` and decay factor `gamma` from module-level
    names.

    Args:
        x1, x2: current coordinates.
        s1, s2: leaky averages of the squared gradients.

    Returns:
        The updated `(x1, x2, s1, s2)`.
    """
    eps = 1e-6
    # Analytic partial derivatives of the objective.
    grad1 = 0.2 * x1
    grad2 = 4 * x2
    # Exponentially weighted average of the squared gradients.
    s1 = gamma * s1 + (1 - gamma) * grad1 ** 2
    s2 = gamma * s2 + (1 - gamma) * grad2 ** 2
    rate1 = eta / math.sqrt(s1 + eps)
    x1 -= rate1 * grad1
    rate2 = eta / math.sqrt(s2 + eps)
    x2 -= rate2 * grad2
    return x1, x2, s1, s2

def f_2d(x1, x2):
    """Return the quadratic objective 0.1 * x1**2 + 2 * x2**2."""
    first_term = 0.1 * x1 ** 2
    second_term = 2 * x2 ** 2
    return first_term + second_term

# Step size and decay factor, read as module-level names by `rmsprop_2d`.
eta, gamma = 0.4, 0.9
# Run the default 20 steps of `rmsprop_2d` and draw the trajectory over
# the contour lines of `f_2d`.
show_trace_2d(f_2d, train_2d(rmsprop_2d))

epoch 20, x1: -0.010599, x2: 0.000000


In [None]:
def init_rmsprop_states(feature_dim):
    """Create zero-initialised RMSProp accumulators.

    Returns:
        A pair of `tf.Variable`s: one of shape `(feature_dim, 1)` and one
        of shape `(1,)`, in that order.
    """
    weight_average = tf.Variable(tf.zeros((feature_dim, 1)))
    bias_average = tf.Variable(tf.zeros(1))
    return weight_average, bias_average
  
def rmsprop(params, grads, states, hyperparams):
    """Apply one RMSProp update to every parameter in place.

    Args:
        params: variables to update.
        grads: gradients, aligned with `params`.
        states: leaky averages of squared gradients, aligned with `params`;
            updated in place.
        hyperparams: dict providing `'gamma'` (decay factor) and `'lr'`
            (learning rate).
    """
    gamma = hyperparams['gamma']
    eps = 1e-6
    for param, state, grad in zip(params, states, grads):
        # Refresh the leaky average first; the step below uses the new value.
        averaged = gamma * state + (1 - gamma) * tf.math.square(grad)
        state[:].assign(averaged)
        step = hyperparams['lr'] * grad / tf.math.sqrt(state + eps)
        param[:].assign(param - step)

In [None]:
# Reload the data (batch size 10) and train with the from-scratch `rmsprop`
# at lr=0.01, gamma=0.9.
# The trailing semicolon keeps the notebook from echoing the return value.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(rmsprop, init_rmsprop_states(feature_dim),
               {'lr': 0.01, 'gamma': 0.9}, data_iter, feature_dim);

loss: 0.243, 0.138 sec/epoch


In [None]:
# Repeat the 2D RMSProp trace with a smaller decay factor (both names are
# read at module level by `rmsprop_2d`).
eta, gamma = 0.4, 0.7
show_trace_2d(f_2d, train_2d(rmsprop_2d))

epoch 20, x1: -0.000145, x2: -0.007692


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `rmsprop`
# at lr=0.01, gamma=0.7.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(rmsprop, init_rmsprop_states(feature_dim),
               {'lr': 0.01, 'gamma': 0.7}, data_iter, feature_dim);

loss: 0.243, 0.125 sec/epoch


In [None]:
# Repeat the 2D RMSProp trace with gamma = 0.5 (both names are read at
# module level by `rmsprop_2d`).
eta, gamma = 0.4, 0.5
show_trace_2d(f_2d, train_2d(rmsprop_2d))

epoch 20, x1: 0.007544, x2: 0.229752


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `rmsprop`
# at lr=0.01, gamma=0.5.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(rmsprop, init_rmsprop_states(feature_dim),
               {'lr': 0.01, 'gamma': 0.5}, data_iter, feature_dim);

loss: 0.243, 0.124 sec/epoch


In [None]:
# Repeat the 2D RMSProp trace with a doubled step size, eta = 0.8, and
# gamma back at 0.9 (both names are read at module level by `rmsprop_2d`).
eta, gamma = 0.8, 0.9
show_trace_2d(f_2d, train_2d(rmsprop_2d))

epoch 20, x1: 0.000000, x2: -0.255285


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `rmsprop`.
# NOTE(review): these hyperparameters (lr=0.01, gamma=0.9) are identical to
# the first rmsprop run; `eta` is only read by `rmsprop_2d` -- confirm
# whether a different lr was intended here.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(rmsprop, init_rmsprop_states(feature_dim),
               {'lr': 0.01, 'gamma': 0.9}, data_iter, feature_dim);

loss: 0.243, 0.098 sec/epoch


In [None]:
# Train with the Keras RMSprop optimizer (learning_rate=0.01, rho=0.9).
# `data_iter` is the module-level iterator left over from the previous
# data-loading cell.
trainer = tf.keras.optimizers.RMSprop
train_concise_ch11(trainer, {'learning_rate': 0.01, 'rho': 0.9},
                       data_iter)

## Adadelta

In [None]:
def init_adadelta_states(feature_dim):
    """Create zero-initialised Adadelta state.

    Returns:
        `((s_w, delta_w), (s_b, delta_b))`: for the weight, two variables of
        shape `(feature_dim, 1)`; for the bias, two variables of shape
        `(1,)`.
    """
    def zeros_variable(shape):
        return tf.Variable(tf.zeros(shape))

    weight_shape = (feature_dim, 1)
    weight_state = (zeros_variable(weight_shape), zeros_variable(weight_shape))
    bias_state = (zeros_variable(1), zeros_variable(1))
    return (weight_state, bias_state)

def adadelta(params, grads, states, hyperparams):
    """Apply one Adadelta update to every parameter in place.

    Args:
        params: variables to update.
        grads: gradients, aligned with `params`.
        states: `(s, delta)` pairs aligned with `params`; both variables in
            each pair are updated in place.
        hyperparams: dict providing the decay factor under `'rho'`.
    """
    rho = hyperparams['rho']
    eps = 1e-5
    for param, (sq_avg, delta), grad in zip(params, states, grads):
        # Leaky average of squared gradients.
        sq_avg[:].assign(rho * sq_avg + (1 - rho) * tf.math.square(grad))
        # Rescale the gradient by the ratio of the two running averages.
        ratio = tf.math.sqrt(delta + eps) / tf.math.sqrt(sq_avg + eps)
        rescaled = ratio * grad
        param[:].assign(param - rescaled)
        # Leaky average of squared rescaled gradients, used by the next step.
        delta[:].assign(rho * delta + (1 - rho) * rescaled * rescaled)

In [None]:
# Reload the data (batch size 10) and train with the from-scratch `adadelta`
# at rho=0.9.
# The trailing semicolon keeps the notebook from echoing the return value.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adadelta, init_adadelta_states(feature_dim),
               {'rho': 0.9}, data_iter, feature_dim);

loss: 0.244, 0.130 sec/epoch


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `adadelta`
# at rho=0.7.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adadelta, init_adadelta_states(feature_dim),
               {'rho': 0.7}, data_iter, feature_dim);

loss: 0.243, 0.126 sec/epoch


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `adadelta`
# at rho=0.3.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adadelta, init_adadelta_states(feature_dim),
               {'rho': 0.3}, data_iter, feature_dim);

loss: 0.244, 0.142 sec/epoch


In [None]:
# Reload the data (batch size 10) and train with the from-scratch `adadelta`
# at rho=0.1.
data_iter, feature_dim = d2l.get_data_ch11(batch_size=10)
train_ch11(adadelta, init_adadelta_states(feature_dim),
               {'rho': 0.1}, data_iter, feature_dim);

loss: 0.244, 0.151 sec/epoch


In [None]:
# Train with the Keras Adadelta optimizer (learning_rate=5.0, rho=0.9).
# `data_iter` is the module-level iterator left over from the previous
# data-loading cell.
trainer = tf.keras.optimizers.Adadelta
train_concise_ch11(trainer, {'learning_rate':5.0, 'rho': 0.9}, data_iter)