# Optimizers implementation


In [395]:
# Import necessary libraries
import numpy as np

## Explaining the problem to myself

### 1. Function

$f(x) = \sin(2x) + a\sin(4x)$

The goal is to minimize this function (find where it outputs the smallest value). In machine learning, this usually means adjusting parameters to reduce the loss.

### 2. Gradient Descent

$\theta = \theta - \eta  \cdot \nabla f(\theta)$

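where $\theta$ denotes all trainable parameters (weights and biases), $\eta$ is the learning rate, and $\nabla f(\theta)$ is the gradient of $f$ evaluated at $\theta$.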

In ML, this corresponds to updating the weights against the gradient of the loss $L$: $w = w - \eta \cdot \nabla L(w)$


In [396]:
# Define function
def f(x, a=0.499):
    """Objective to minimise: sin(2x) + a * sin(4x).

    `x` may be a scalar or a NumPy array; `a` scales the sin(4x) term.
    """
    base_wave = np.sin(2 * x)
    overtone = np.sin(4 * x)
    return base_wave + a * overtone


def f_prime(x, a=0.499):
    """Analytic derivative of sin(2x) + a * sin(4x) with respect to x.

    Returns 2 * cos(2x) + 4a * cos(4x); `x` may be a scalar or a NumPy array.
    """
    slope_base = 2 * np.cos(2 * x)
    slope_overtone = (a * 4) * np.cos(4 * x)
    return slope_base + slope_overtone

### Implementation

Implementing an optimizer means writing a Python function (or class) that:

1. Takes an initial guess for the parameters.
2. Computes the gradient of the given function at each step.
3. Updates the parameters using the gradient descent rule.
4. Stops after a set number of steps or when the updates become small enough (converges).


## Vanilla optimizer


In [397]:
# Hand-rolled gradient descent: a short trial run before wrapping the loop in a function.
max_iters = 10
learning_rate = 0.01
initial_guess = 0.01
weight = initial_guess

for i in range(max_iters):
    grad = f_prime(weight)  # slope at the current position
    weight = weight - learning_rate * grad  # step against the slope
    print(weight)
    if not (abs(grad) < 1e-12):  # 1e-12 is the same value as the literal 10e-13
        continue
    print(f"Breaks on the {i+1}th iteration")
    break

-0.02994003426228467
-0.06972122169230724
-0.10871590025282594
-0.14634726718029928
-0.1821332686723779
-0.21571416593916687
-0.24686094115139826
-0.27546691378698396
-0.3015281748313631
-0.32511910395041105


### Learning rates to test

In [398]:
# Step sizes to compare; the per-optimizer result loops iterate over this list.
learning_rates = [0.1, 0.01, 0.001]

In [399]:
def vanilla_optimizer(f, f_prime, max_iters=5000, learning_rate=0.001, initial_guess = None, tol=1e-13):
    """Minimise `f` with plain (vanilla) gradient descent.

    Each step applies ``weight <- weight - learning_rate * f_prime(weight)``.

    Parameters
    ----------
    f : callable
        Objective, called as ``f(weight)``.
    f_prime : callable
        Derivative of the objective, called as ``f_prime(weight)``.
    max_iters : int
        Maximum number of update steps.
    learning_rate : float
        Step size multiplying the gradient.
    initial_guess : float or array-like, optional
        Starting point; 0.75 is used when omitted. The object passed in is
        never modified.
    tol : float
        The loop stops once the objective changes by less than `tol`
        between two consecutive steps.

    Returns
    -------
    tuple
        ``(weight, f(weight), f_prime(weight))`` at the final point.

    Notes
    -----
    A "Converged ..." line is printed only when the tolerance test fires;
    if `max_iters` is exhausted first, only the final summary is printed.
    """
    if initial_guess is None:
        initial_guess = 0.75
    weight = initial_guess
    prev_value = f(weight)

    for i in range(max_iters):
        grad = f_prime(weight) # Compute gradient
        # Rebind instead of using `-=`: an in-place update would silently
        # overwrite the caller's array when initial_guess is an ndarray.
        weight = weight - learning_rate * grad
        curr_value = f(weight)
        if abs(curr_value - prev_value) < tol: # Convergence tolerance
            print(f"Converged on {i+1}th iteration")
            break
        prev_value = curr_value

    # Evaluate the final point once and reuse it for printing and returning.
    final_value = f(weight)
    final_grad = f_prime(weight)

    print("Min x (weight):",weight)
    print("At x (weight), y =",final_value)
    print("Gradient at x (weight):",final_grad)

    return weight, final_value, final_grad

In [400]:
# Compare plain gradient descent across the candidate step sizes.
for lr in learning_rates:
    banner = f"---Results for learning rate = {lr}---\n"
    print(banner)
    vanilla_optimizer(f, f_prime, learning_rate=lr)
    print()

---Results for learning rate = 0.1---

Converged on 151th iteration
Min x (weight): 2.617801150048571
At x (weight), y = -1.29817227299419
Gradient at x (weight): 5.4001912941359365e-09

---Results for learning rate = 0.01---

Converged on 1574th iteration
Min x (weight): 2.6178008915128057
At x (weight), y = -1.2981722729938445
Gradient at x (weight): -2.6772025890631213e-06

---Results for learning rate = 0.001---

Min x (weight): 1.5610820325228227
At x (weight), y = 4.251390869903712e-05
Gradient at x (weight): -0.005129212682828843



## Vanilla Momentum optimizer


In [401]:
def vanilla_momentum_optimizer(f, f_prime, max_iters=5000, learning_rate=0.001, initial_guess = None, beta=0.9, tol=1e-13):
    """Minimise `f` with gradient descent plus classical momentum.

    Each step applies::

        momentum <- beta * momentum - learning_rate * f_prime(weight)
        weight   <- weight + momentum

    Parameters
    ----------
    f : callable
        Objective, called as ``f(weight)``.
    f_prime : callable
        Derivative of the objective, called as ``f_prime(weight)``.
    max_iters : int
        Maximum number of update steps.
    learning_rate : float
        Step size multiplying the gradient.
    initial_guess : float or array-like, optional
        Starting point; 0.75 is used when omitted. The object passed in is
        never modified.
    beta : float
        Friction coefficient applied to the previous velocity.
    tol : float
        The loop stops once the objective changes by less than `tol`
        between two consecutive steps.

    Returns
    -------
    tuple
        ``(weight, f(weight), f_prime(weight))`` at the final point.

    Notes
    -----
    A "Converged ..." line is printed only when the tolerance test fires;
    if `max_iters` is exhausted first, only the final summary is printed.
    """
    if initial_guess is None:
        initial_guess = 0.75
    weight = initial_guess
    prev_value = f(weight)
    momentum = 0 # initialize velocity

    for i in range(max_iters):
        grad = f_prime(weight) # Compute gradient
        momentum = beta * momentum - learning_rate * grad
        # Rebind instead of using `+=`: an in-place update would silently
        # overwrite the caller's array when initial_guess is an ndarray.
        weight = weight + momentum
        curr_value = f(weight)
        if abs(curr_value - prev_value) < tol: # Convergence tolerance
            print(f"Converged on {i+1}th iteration")
            break
        prev_value = curr_value

    # Evaluate the final point once and reuse it for printing and returning.
    final_value = f(weight)
    final_grad = f_prime(weight)

    print("Min x (weight):",weight)
    print("At x (weight), y =",final_value)
    print("Gradient at x (weight):",final_grad)

    return weight, final_value, final_grad

In [402]:
# Compare momentum gradient descent across the candidate step sizes.
for lr in learning_rates:
    banner = f"---Results for learning rate = {lr}---\n"
    print(banner)
    vanilla_momentum_optimizer(f, f_prime, learning_rate=lr)
    print()

---Results for learning rate = 0.1---

Converged on 272th iteration
Min x (weight): 2.617801423786491
At x (weight), y = -1.2981722729938
Gradient at x (weight): 2.845743499957365e-06

---Results for learning rate = 0.01---

Converged on 248th iteration
Min x (weight): 2.6177986291129005
At x (weight), y = -1.2981722729612328
Gradient at x (weight): -2.6152142039492787e-05

---Results for learning rate = 0.001---

Converged on 1621th iteration
Min x (weight): 2.6178012818908893
At x (weight), y = -1.2981722729940992
Gradient at x (weight): 1.3734145909438666e-06



## Nesterov accelerated gradient

In [403]:
def nesterov_accelerated_optimizer(f, f_prime, max_iters=5000, learning_rate=0.001, initial_guess = None, beta=0.9, tol=1e-13):
    """Minimise `f` with Nesterov accelerated gradient.

    The gradient is measured at the look-ahead point rather than at the
    current weight::

        grad     <- f_prime(weight + beta * momentum)
        momentum <- beta * momentum - learning_rate * grad
        weight   <- weight + momentum

    Parameters
    ----------
    f : callable
        Objective, called as ``f(weight)``.
    f_prime : callable
        Derivative of the objective, called at the look-ahead point.
    max_iters : int
        Maximum number of update steps.
    learning_rate : float
        Step size multiplying the gradient.
    initial_guess : float or array-like, optional
        Starting point; 0.75 is used when omitted. The object passed in is
        never modified.
    beta : float
        Friction coefficient applied to the previous velocity.
    tol : float
        The loop stops once the objective changes by less than `tol`
        between two consecutive steps.

    Returns
    -------
    tuple
        ``(weight, f(weight), f_prime(weight))`` at the final point.

    Notes
    -----
    A "Converged ..." line is printed only when the tolerance test fires;
    if `max_iters` is exhausted first, only the final summary is printed.
    """
    if initial_guess is None:
        initial_guess = 0.75
    weight = initial_guess
    prev_value = f(weight)
    momentum = 0 # initialize velocity

    for i in range(max_iters):
        lookahead = weight + beta * momentum
        grad = f_prime(lookahead) # Compute gradient at the look-ahead point
        momentum = beta * momentum - learning_rate * grad
        # Rebind instead of using `+=`: an in-place update would silently
        # overwrite the caller's array when initial_guess is an ndarray.
        weight = weight + momentum
        curr_value = f(weight)
        if abs(curr_value - prev_value) < tol: # Convergence tolerance
            print(f"Converged on {i+1}th iteration")
            break
        prev_value = curr_value

    # Evaluate the final point once and reuse it for printing and returning.
    final_value = f(weight)
    final_grad = f_prime(weight)

    print("Min x (weight):",weight)
    print("At x (weight), y =",final_value)
    print("Gradient at x (weight):",final_grad)

    return weight, final_value, final_grad

In [404]:
# Compare Nesterov accelerated gradient across the candidate step sizes.
for lr in learning_rates:
    banner = f"---Results for learning rate = {lr}---\n"
    print(banner)
    nesterov_accelerated_optimizer(f, f_prime, learning_rate=lr)
    print()

---Results for learning rate = 0.1---

Converged on 16th iteration
Min x (weight): 2.6178011575928775
At x (weight), y = -1.2981722729941898
Gradient at x (weight): 8.368096893196508e-08

---Results for learning rate = 0.01---

Converged on 150th iteration
Min x (weight): 2.6178011268972963
At x (weight), y = -1.2981722729941874
Gradient at x (weight): -2.3482066258129208e-07

---Results for learning rate = 0.001---

Converged on 1606th iteration
Min x (weight): 2.6178010096864353
At x (weight), y = -1.2981722729940885
Gradient at x (weight): -1.4510167992698442e-06



## Adaptive Gradient Algorithm (AdaGrad)

In [405]:
def adagrad_optimizer(f, f_prime, max_iters=5000, learning_rate=0.001, initial_guess = None, epsilon=1e-8, tol=1e-13):
    """Minimise `f` with AdaGrad.

    Squared gradients are accumulated and used to scale each step::

        s      <- s + grad * grad
        weight <- weight - learning_rate * grad / sqrt(s + epsilon)

    Because `s` only grows, the effective step size shrinks over time.

    Parameters
    ----------
    f : callable
        Objective, called as ``f(weight)``.
    f_prime : callable
        Derivative of the objective, called as ``f_prime(weight)``.
    max_iters : int
        Maximum number of update steps.
    learning_rate : float
        Base step size before scaling.
    initial_guess : float or array-like, optional
        Starting point; 0.75 is used when omitted.
    epsilon : float
        Added to `s` inside the square root to avoid division by zero.
    tol : float
        The loop stops once the objective changes by less than `tol`
        between two consecutive steps.

    Returns
    -------
    tuple
        ``(weight, f(weight), f_prime(weight))`` at the final point.

    Notes
    -----
    A "Converged ..." line is printed only when the tolerance test fires;
    if `max_iters` is exhausted first, only the final summary is printed.
    """
    if initial_guess is None:
        initial_guess = 0.75
    weight = initial_guess
    prev_value = f(weight)
    s = 0  # running sum of squared gradients

    for i in range(max_iters):
        grad = f_prime(weight) # Compute gradient
        s += grad * grad
        weight = weight - learning_rate * (grad/np.sqrt(s + epsilon)) # update weights/parameters
        curr_value = f(weight)
        if abs(curr_value - prev_value) < tol: # Convergence tolerance
            print(f"Converged on {i+1}th iteration")
            break
        prev_value = curr_value

    # Evaluate the final point once and reuse it for printing and returning.
    final_value = f(weight)
    final_grad = f_prime(weight)

    print("Min x (weight):",weight)
    print("At x (weight), y =",final_value)
    print("Gradient at x (weight):",final_grad)

    return weight, final_value, final_grad

In [406]:
# Compare AdaGrad across the candidate step sizes.
for lr in learning_rates:
    banner = f"---Results for learning rate = {lr}---\n"
    print(banner)
    adagrad_optimizer(f, f_prime, learning_rate=lr)
    print()

---Results for learning rate = 0.1---

Converged on 1125th iteration
Min x (weight): 2.6178008423934918
At x (weight), y = -1.2981722729937006
Gradient at x (weight): -3.1868712790927844e-06

---Results for learning rate = 0.01---

Min x (weight): 1.4891060712514816
At x (weight), y = 0.0024870379181773927
Gradient at x (weight): -0.08298058593144297

---Results for learning rate = 0.001---

Min x (weight): 0.8946785539161752
At x (weight), y = 0.7649671358421188
Gradient at x (weight): -2.2419737465025835



AdaGrad only converged with a learning rate of 0.1. With the smaller learning rates (0.01 and the default 0.001), the accumulated squared gradients shrink the effective step size so quickly that the updates become very small, and the optimizer does not reach the minimum within the 5000-iteration limit.

## Root Mean Square Propagation (RMSProp)

In [407]:
def rmsprop_optimizer(f, f_prime, max_iters=5000, learning_rate=0.001, initial_guess = None, beta=0.9, epsilon=1e-8, tol=1e-13):
    """Minimise `f` with RMSProp.

    An exponentially decaying average of squared gradients scales each step::

        s      <- beta * s + (1 - beta) * grad * grad
        weight <- weight - learning_rate * grad / sqrt(s + epsilon)

    Parameters
    ----------
    f : callable
        Objective, called as ``f(weight)``.
    f_prime : callable
        Derivative of the objective, called as ``f_prime(weight)``.
    max_iters : int
        Maximum number of update steps.
    learning_rate : float
        Base step size before scaling.
    initial_guess : float or array-like, optional
        Starting point; 0.75 is used when omitted.
    beta : float
        Decay rate of the squared-gradient average.
    epsilon : float
        Added to `s` inside the square root to avoid division by zero.
    tol : float
        The loop stops once the objective changes by less than `tol`
        between two consecutive steps.

    Returns
    -------
    tuple
        ``(weight, f(weight), f_prime(weight))`` at the final point.

    Notes
    -----
    A "Converged ..." line is printed only when the tolerance test fires;
    if `max_iters` is exhausted first, only the final summary is printed.
    """
    if initial_guess is None:
        initial_guess = 0.75

    weight = initial_guess # initialize initial guess
    prev_value = f(weight)
    s = 0  # decaying average of squared gradients

    for i in range(max_iters):
        grad = f_prime(weight) # Compute gradient
        s = s * beta + (1 - beta) * grad * grad
        weight = weight - learning_rate * (grad/np.sqrt(s + epsilon)) # update weights/parameters
        curr_value = f(weight)
        if abs(curr_value - prev_value) < tol: # Convergence tolerance
            print(f"Converged on {i+1}th iteration")
            break
        prev_value = curr_value

    # Evaluate the final point once and reuse it for printing and returning.
    final_value = f(weight)
    final_grad = f_prime(weight)

    print("Min x (weight):", weight)
    print("At x (weight), y =", final_value)
    print("Gradient at x (weight):", final_grad)

    return weight, final_value, final_grad

In [408]:
# Compare RMSProp across the candidate step sizes.
for lr in learning_rates:
    banner = f"---Results for learning rate = {lr}---\n"
    print(banner)
    rmsprop_optimizer(f, f_prime, learning_rate=lr)
    print()

---Results for learning rate = 0.1---

Min x (weight): 2.6663394448293567
At x (weight), y = -1.2857500845016225
Gradient at x (weight): 0.5153434719103737

---Results for learning rate = 0.01---

Converged on 262th iteration
Min x (weight): 2.617801129097367
At x (weight), y = -1.298172272994188
Gradient at x (weight): -2.1199242150604647e-07

---Results for learning rate = 0.001---

Converged on 1933th iteration
Min x (weight): 2.6178010995757988
At x (weight), y = -1.2981722729941771
Gradient at x (weight): -5.183123216179197e-07



## Adaptive Moment Estimation (Adam)