# Bài 1: Gradient Descent

In [5]:
import numpy as np

def df_w(W):
    """
    Evaluate the partial derivatives dw1 = 2 * w1 and dw2 = 2 * w2.
    These match the gradient of f(w1, w2) = w1**2 + w2**2.
    Arguments:
    W -- np.array [w1, w2], point at which the gradient is evaluated
    Returns:
    dw -- np.array [dw1, dw2], array containing the partial derivatives
    """
    # Unpacking requires W to hold exactly two components.
    first, second = W
    grad_first, grad_second = 2 * first, 2 * second
    return np.array([grad_first, grad_second])

In [6]:
def sgd(W, dw, lr):
    """
    Take one plain gradient descent step: W - lr * dw.
    Arguments:
    W -- np.array [w1, w2], current weights
    dw -- np.array [dw1, dw2], array of gradients at W
    lr -- float, learning rate
    Returns:
    W -- np.array [w1, w2] after the update (a new array; the input is not modified)
    """
    step = lr * dw
    return W - step

In [7]:
def train_p1(optimizer, lr, epochs):
    """
    Train to find the minimum using Gradient Descent.
    Arguments:
    optimizer -- function for optimization, called as optimizer(W, dw, lr) (e.g. sgd)
    lr -- float, learning rate
    epochs -- int, number of epochs
    Returns:
    results -- list of epochs + 1 arrays [w1, w2]: the initial point first,
               then the weights returned by the optimizer after each epoch
    """
    # Start in float64. With a float32 start point the working precision
    # depends on how the installed NumPy promotes the float32 scalars that
    # df_w multiplies by Python ints (this changed with NEP 50 / NumPy 2),
    # so the printed trajectory would differ between NumPy versions.
    W = np.array([-5.0, -2.0], dtype=np.float64)
    results = [W]

    for _ in range(epochs):
        # Gradient at the current point, then one optimizer step.
        dw = df_w(W)
        W = optimizer(W, dw, lr)
        results.append(W)

    return results

In [8]:
# Disabled 2-epoch sanity run, kept for reference; the next cell runs the full 30 epochs.
# # Parameters
# learning_rate = 0.1
# epochs = 2

# # Run training
# results = train_p1(sgd, learning_rate, epochs)

# # Print results
# for i, W in enumerate(results):
#     print(f"Epoch {i}: w1 = {W[0]}, w2 = {W[1]}")

In [9]:
# Run plain gradient descent (sgd) for 30 epochs with a learning rate of 0.1.
learning_rate = 0.1
epochs = 30
results = train_p1(sgd, learning_rate, epochs)

# results[0] is the starting point, so "Epoch 0" prints the initial weights.
# Note: the loop variable rebinds the notebook-level name W.
for i, W in enumerate(results):
    print(f"Epoch {i}: w1 = {W[0]}, w2 = {W[1]}")

Epoch 0: w1 = -5.0, w2 = -2.0
Epoch 1: w1 = -4.0, w2 = -1.6
Epoch 2: w1 = -3.2, w2 = -1.28
Epoch 3: w1 = -2.56, w2 = -1.024
Epoch 4: w1 = -2.048, w2 = -0.8192
Epoch 5: w1 = -1.6384, w2 = -0.65536
Epoch 6: w1 = -1.31072, w2 = -0.5242880000000001
Epoch 7: w1 = -1.0485760000000002, w2 = -0.4194304000000001
Epoch 8: w1 = -0.8388608000000002, w2 = -0.33554432000000006
Epoch 9: w1 = -0.6710886400000001, w2 = -0.26843545600000007
Epoch 10: w1 = -0.5368709120000001, w2 = -0.21474836480000006
Epoch 11: w1 = -0.4294967296000001, w2 = -0.17179869184000005
Epoch 12: w1 = -0.3435973836800001, w2 = -0.13743895347200002
Epoch 13: w1 = -0.27487790694400005, w2 = -0.10995116277760002
Epoch 14: w1 = -0.21990232555520003, w2 = -0.08796093022208001
Epoch 15: w1 = -0.17592186044416003, w2 = -0.070368744177664
Epoch 16: w1 = -0.140737488355328, w2 = -0.056294995342131206
Epoch 17: w1 = -0.11258999068426241, w2 = -0.04503599627370496
Epoch 18: w1 = -0.09007199254740993, w2 = -0.03602879701896397
Epoch 19: w1

# Bài 2: Gradient Descent + Momentum


In [10]:
def df_w(W):
    """
    Return the partial derivatives [2 * w1, 2 * w2] at the point W.
    These match the gradient of f(w1, w2) = w1**2 + w2**2.
    Arguments:
    W -- np.array [w1, w2]
    Returns:
    dw -- np.array [dw1, dw2]
    """
    # Unpacking requires W to hold exactly two components.
    w1, w2 = W
    partials = [2 * w1, 2 * w2]
    dw = np.array(partials)
    return dw

In [11]:
def sgd_momentum(W, V, dw, lr, beta):
    """
    Take one gradient descent step with momentum.
    The velocity is an exponential moving average of the gradients, and the
    weights move along the updated velocity.
    Arguments:
    W -- np.array [w1, w2], current weights
    V -- np.array [V1, V2], current velocity vector
    dw -- np.array [dw1, dw2], gradients at W
    lr -- float, learning rate
    beta -- float, momentum factor (weight given to the previous velocity)
    Returns:
    W -- np.array [w1, w2] after update
    V -- np.array [V1, V2], updated velocity
    """
    # Velocity (equation 2.1): blend the previous velocity with the new gradient.
    carried_over = beta * V
    fresh = (1 - beta) * dw
    V = carried_over + fresh
    # Weights (equation 2.2): step along the updated velocity.
    return W - lr * V, V

In [14]:
# Initial parameters for Gradient Descent + Momentum.
# float64 is used so the working precision does not depend on how the
# installed NumPy promotes float32 scalars multiplied by Python ints
# (this changed with NEP 50 / NumPy 2).
W = np.array([-5.0, -2.0], dtype=np.float64)  # Initial weights [w1, w2]
V = np.array([0.0, 0.0], dtype=np.float64)    # Initial velocity [v1, v2]
learning_rate = 0.1                           # Learning rate
beta = 0.9                                    # Momentum factor
epochs = 30                                   # Number of epochs

In [15]:
# Perform Gradient Descent + Momentum for `epochs` epochs, printing each step.
# NOTE: W and V are notebook-level variables that this loop rebinds, so
# re-running this cell continues from the last state; re-run the parameter
# cell first to restart from the initial point.
for epoch in range(1, epochs + 1):
    print(f"Epoch {epoch}:")

    # Step 1: Compute the gradients at the current weights
    dw = df_w(W)
    print(f"  Gradient at W: dw1 = {dw[0]}, dw2 = {dw[1]}")

    # Step 2 & 3: Update velocity and weights using Gradient Descent + Momentum
    W, V = sgd_momentum(W, V, dw, learning_rate, beta)
    print(f"  Updated W: w1 = {W[0]}, w2 = {W[1]}")
    print(f"  Updated Velocity: V1 = {V[0]}, V2 = {V[1]}")

Epoch 1:
  Gradient at W: dw1 = -10.0, dw2 = -4.0
  Updated W: w1 = -4.9, w2 = -1.96
  Updated Velocity: V1 = -0.9999999999999998, V2 = -0.3999999999999999
Epoch 2:
  Gradient at W: dw1 = -9.8, dw2 = -3.92
  Updated W: w1 = -4.712000000000001, w2 = -1.8848
  Updated Velocity: V1 = -1.8799999999999997, V2 = -0.7519999999999998
Epoch 3:
  Gradient at W: dw1 = -9.424000000000001, dw2 = -3.7696
  Updated W: w1 = -4.4485600000000005, w2 = -1.7794240000000001
  Updated Velocity: V1 = -2.6343999999999994, V2 = -1.0537599999999998
Epoch 4:
  Gradient at W: dw1 = -8.897120000000001, dw2 = -3.5588480000000002
  Updated W: w1 = -4.122492800000001, w2 = -1.6489971200000002
  Updated Velocity: V1 = -3.2606719999999996, V2 = -1.3042687999999998
Epoch 5:
  Gradient at W: dw1 = -8.244985600000001, dw2 = -3.2979942400000004
  Updated W: w1 = -3.7465824640000007, w2 = -1.4986329856000002
  Updated Velocity: V1 = -3.7591033599999997, V2 = -1.5036413439999998
Epoch 6:
  Gradient at W: dw1 = -7.49316492800

# Bài 3: RMSProp

In [16]:
def df_w(W):
    """
    Compute the partial derivatives dw1 = 2 * w1 and dw2 = 2 * w2.
    These match the gradient of f(w1, w2) = w1**2 + w2**2.
    Arguments:
    W -- np.array [w1, w2]
    Returns:
    dw -- np.array [dw1, dw2], array containing the partial derivatives
    """
    # Unpacking requires W to hold exactly two components.
    a, b = W
    return np.array((2 * a, 2 * b))

In [17]:
# RMSProp update rule
def rmsprop(W, S, dw, lr, gamma, epsilon):
    """
    Take one RMSProp step.
    Each gradient component is scaled by the root of a moving average of its
    squared values, with epsilon added to the denominator after the square root.
    Arguments:
    W -- np.array [w1, w2], current weights
    S -- np.array [s1, s2], the moving average of squared gradients
    dw -- np.array [dw1, dw2], gradients at W
    lr -- float, learning rate
    gamma -- float, decay rate (weight given to the previous average)
    epsilon -- float, a small constant to avoid division by zero
    Returns:
    W -- np.array [w1, w2] after the update
    S -- np.array [s1, s2], updated moving average of squared gradients
    """
    # Equation 3.1: refresh the moving average of squared gradients.
    squared_grad = dw ** 2
    S = gamma * S + (1 - gamma) * squared_grad
    # Equation 3.2: scale the step by the root of that average.
    scale = np.sqrt(S) + epsilon
    return W - (lr * dw) / scale, S

In [18]:
# Training loop for RMSProp
def train_p1_rmsprop(lr, gamma, epsilon, epochs):
    """
    Train to find the minimum using RMSProp.
    Arguments:
    lr -- float, learning rate
    gamma -- float, decay rate
    epsilon -- float, a small constant to avoid division by zero
    epochs -- int, number of epochs
    Returns:
    results -- list of epochs + 1 arrays [w1, w2]: the initial point first,
               then the weights after each epoch
    """
    # Start in float64. With a float32 start point the working precision
    # depends on how the installed NumPy promotes the float32 scalars that
    # df_w multiplies by Python ints (this changed with NEP 50 / NumPy 2),
    # so the printed trajectory would differ between NumPy versions.
    W = np.array([-5.0, -2.0], dtype=np.float64)  # Initial weights
    S = np.array([0.0, 0.0], dtype=np.float64)    # Initial moving average of squared gradients
    results = [W]

    for _ in range(epochs):
        # Step 1: gradient at the current point.
        dw = df_w(W)
        # Step 2 & 3: update the squared-gradient average and the weights.
        W, S = rmsprop(W, S, dw, lr, gamma, epsilon)
        results.append(W)

    return results

In [19]:
# Parameters for RMSProp (passed to train_p1_rmsprop in the next cell)
learning_rate = 0.1  # Learning rate
gamma = 0.9          # Decay rate of the squared-gradient moving average
epsilon = 1e-8       # Small constant to prevent division by zero
epochs = 30          # Number of epochs

In [20]:
# Perform RMSProp for `epochs` epochs
results = train_p1_rmsprop(learning_rate, gamma, epsilon, epochs)

# Display the results; results[0] is the starting point, printed as "Epoch 0".
# Note: the loop variable rebinds the notebook-level name W.
for i, W in enumerate(results):
    print(f"Epoch {i}: w1 = {W[0]}, w2 = {W[1]}")

Epoch 0: w1 = -5.0, w2 = -2.0
Epoch 1: w1 = -4.683772234983162, w2 = -1.683772236483162
Epoch 2: w1 = -4.461584607692758, w2 = -1.4738753171325136
Epoch 3: w1 = -4.27928872720579, w2 = -1.3087178712916374
Epoch 4: w1 = -4.120054383810811, w2 = -1.1698398157391754
Epoch 5: w1 = -3.9761536215788174, w2 = -1.0489273067370706
Epoch 6: w1 = -3.843310019236342, w2 = -0.9414504929904555
Epoch 7: w1 = -3.7188778308800394, w2 = -0.8446493453528385
Epoch 8: w1 = -3.601093559818063, w2 = -0.7567142198996779
Epoch 9: w1 = -3.488717515023634, w2 = -0.6763956470812718
Epoch 10: w1 = -3.380842895309756, w2 = -0.6027968561678372
Epoch 11: w1 = -3.276785787876371, w2 = -0.5352538875577688
Epoch 12: w1 = -3.1760178230016334, w2 = -0.47326154058825287
Epoch 13: w1 = -3.078122901858806, w2 = -0.4164249923493902
Epoch 14: w1 = -2.9827682737992003, w2 = -0.3644266043432175
Epoch 15: w1 = -2.8896845602323835, w2 = -0.31700215000026094
Epoch 16: w1 = -2.7986515704878028, w2 = -0.2739231571205742
Epoch 17: w1 

# Bài 4: Adam

In [21]:
def df_w(W):
    """
    Compute the partial derivatives dw1 = 2 * w1 and dw2 = 2 * w2.
    These match the gradient of f(w1, w2) = w1**2 + w2**2.
    Arguments:
    W -- np.array [w1, w2]
    Returns:
    dw -- np.array [dw1, dw2], array containing the partial derivatives
    """
    # Doubling each component while unpacking; the two-target unpack still
    # requires W to hold exactly two components.
    dw1, dw2 = (2 * component for component in W)
    return np.array([dw1, dw2])

In [22]:
def adam(W, V, S, dw, lr, beta1, beta2, epsilon, t):
    """
    Perform one Adam optimization update with bias correction.
    Arguments:
    W -- np.array [w1, w2], weights
    V -- np.array [v1, v2], first moment estimate
    S -- np.array [s1, s2], second moment estimate
    dw -- np.array [dw1, dw2], gradients
    lr -- float, learning rate
    beta1 -- float, exponential decay rate for first moment estimates
    beta2 -- float, exponential decay rate for second moment estimates
    epsilon -- float, small constant to prevent division by zero
    t -- int, timestep, counted from 1
    Returns:
    W -- np.array [w1, w2] after update
    V -- np.array [v1, v2], updated (biased, uncorrected) first moment estimate
    S -- np.array [s1, s2], updated (biased, uncorrected) second moment estimate
    Raises:
    ValueError -- if t < 1; at t = 0 the bias-correction denominators
                  1 - beta ** t are zero and the update would produce inf/NaN
    """
    if t < 1:
        raise ValueError(f"t must be >= 1 for bias correction, got {t}")

    # Update biased first moment estimate (v_t)
    V = beta1 * V + (1 - beta1) * dw
    # Update biased second raw moment estimate (s_t)
    S = beta2 * S + (1 - beta2) * (dw ** 2)
    # Bias-corrected estimates are used only for this step; the biased
    # V and S are what get returned and carried to the next timestep.
    V_corr = V / (1 - beta1 ** t)
    S_corr = S / (1 - beta2 ** t)
    # Update weights
    W = W - (lr * V_corr) / (np.sqrt(S_corr) + epsilon)
    return W, V, S

In [23]:
def train_p1_adam(lr, beta1, beta2, epsilon, epochs):
    """
    Train to find the minimum using Adam optimization.
    Arguments:
    lr -- float, learning rate
    beta1 -- float, exponential decay rate for first moment estimates
    beta2 -- float, exponential decay rate for second moment estimates
    epsilon -- float, small constant to prevent division by zero
    epochs -- int, number of epochs
    Returns:
    results -- list of epochs + 1 arrays [w1, w2]: the initial point first,
               then the weights after each epoch
    """
    # Start in float64. With a float32 start point the working precision
    # depends on how the installed NumPy promotes the float32 scalars that
    # df_w multiplies by Python ints (this changed with NEP 50 / NumPy 2),
    # so the printed trajectory would differ between NumPy versions.
    W = np.array([-5.0, -2.0], dtype=np.float64)  # Initial weights
    V = np.array([0.0, 0.0], dtype=np.float64)    # Initial first moment
    S = np.array([0.0, 0.0], dtype=np.float64)    # Initial second moment
    results = [W]

    # Timesteps are counted from 1 because adam's bias correction divides
    # by 1 - beta ** t.
    for t in range(1, epochs + 1):
        dw = df_w(W)
        W, V, S = adam(W, V, S, dw, lr, beta1, beta2, epsilon, t)
        results.append(W)

    return results

In [24]:
# Parameters for Adam (passed to train_p1_adam in the next cell)
learning_rate = 0.1  # Learning rate
beta1 = 0.9          # Exponential decay rate for first moment estimates
beta2 = 0.999        # Exponential decay rate for second moment estimates
epsilon = 1e-8       # Small constant to prevent division by zero
epochs = 30          # Number of epochs

In [25]:
# Perform Adam optimization for `epochs` epochs
results = train_p1_adam(learning_rate, beta1, beta2, epsilon, epochs)

# Display the results; results[0] is the starting point, printed as "Epoch 0".
# Note: the loop variable rebinds the notebook-level name W.
for i, W in enumerate(results):
    print(f"Epoch {i}: w1 = {W[0]}, w2 = {W[1]}")

Epoch 0: w1 = -5.0, w2 = -2.0
Epoch 1: w1 = -4.9000000001, w2 = -1.90000000025
Epoch 2: w1 = -4.800057756868857, w2 = -1.8001664861157012
Epoch 3: w1 = -4.700213291333392, w2 = -1.7006233920464653
Epoch 4: w1 = -4.600507673963063, w2 = -1.601504895289421
Epoch 5: w1 = -4.500982947818709, w2 = -1.5029557812950123
Epoch 6: w1 = -4.401682046815696, w2 = -1.4051317301601145
Epoch 7: w1 = -4.302648709562401, w2 = -1.3081994956139178
Epoch 8: w1 = -4.203927389258771, w2 = -1.2123369431278836
Epoch 9: w1 = -4.105563160155665, w2 = -1.1177329089643084
Epoch 10: w1 = -4.007601621083933, w2 = -1.0245868378253595
Epoch 11: w1 = -3.9100887965630893, w2 = -0.9331081538332233
Epoch 12: w1 = -3.8130710359932927, w2 = -0.8435153186154638
Epoch 13: w1 = -3.7165949114219345, w2 = -0.7560345321554123
Epoch 14: w1 = -3.620707114358172, w2 = -0.6708980377073323
Epoch 15: w1 = -3.5254543520861086, w2 = -0.5883420023105994
Epoch 16: w1 = -3.4308832439010026, w2 = -0.508603959872017
Epoch 17: w1 = -3.33704021