Define Adam implementation base on mathematic formula

In [37]:
import numpy as np
from EMA_class import EMA

class Adam:
    
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

        # State variables
        self.v = EMA(1- self.beta1, bias_correction=True)  # First moment estimate (momentum)
        self.s = EMA(1 -self.beta2, bias_correction=True)  # Second moment estimate (variance)


    def momentum_vector(self, params, gradients):
        if isinstance(params, EMA):
            return params.calculate(gradients)
    

        
    def calculate(self, params, gradients):
        # Initialize state on first call
        v_hat = self.momentum_vector(self.v, gradients)
        s_hat = self.momentum_vector(self.s, gradients**2)
        rescaled_gradient = self.learning_rate * v_hat / (np.sqrt(s_hat) + self.epsilon)
        new_params = params - rescaled_gradient
        
        return new_params

Additional improvement within Yogi

In [38]:
class Adam_Yogi:
    
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        
        # State variables
        self.v = None # First moment estimate (momentum)
        self.s = None  # Second moment estimate (variance)
        self.t = 0     # Time step counter

    

    def calculate(self, params, gradients):
        # Initialize state on first call
        if self.v is None:
            self.v = np.zeros_like(params)
            self.s = np.zeros_like(params)
        
        # Increment time step
        self.t += 1
        self.v = self.v = self.beta1 * self.v + (1 - self.beta1) * gradients
        self.s = self.s + (1 - self.beta2) * np.sign(gradients**2 - self.s) * gradients**2 # Yogi improvement
        v_hat = self.v / (1 - self.beta1**self.t)
        s_hat = self.s / (1 - self.beta2**self.t)
        rescaled_gradient = self.learning_rate * v_hat / (np.sqrt(s_hat) + self.epsilon)
        new_params = params - rescaled_gradient
        
        return new_params
    

Provide some function for testing

In [39]:
def calculate_gradient(optimize_algorithm, params):
    for step in range(25):
        x, y = params[0], params[1]
        
        # Compute loss and gradients for f(x,y) = (x-1)^2 + (y-2)^2
        loss = (x - 1)**2 + (y - 2)**2
        gradients = np.array([2*(x - 1), 2*(y - 2)])
        
        if optimize_algorithm.v is not None:
            if isinstance(optimize_algorithm, Adam):
                v_hat = optimize_algorithm.momentum_vector(optimize_algorithm.v, gradients)
                s_hat = optimize_algorithm.momentum_vector(optimize_algorithm.s, gradients**2)
            else:
                v_hat = optimize_algorithm.v / (1 - optimize_algorithm.beta1**optimize_algorithm.t)
                s_hat = optimize_algorithm.s / (1 - optimize_algorithm.beta2**optimize_algorithm.t)
            v_hat_str = f"({v_hat[0]:6.3f},{v_hat[1]:6.3f})"
            s_hat_str = f"({s_hat[0]:6.3f},{s_hat[1]:6.3f})"
        else:
            v_hat_str = "(  init,  init)"
            s_hat_str = "(  init,  init)"
        
        if step % 5 == 0:
            print(f"{step:4d} | {x:6.3f} | {y:6.3f} | {loss:6.4f} | {v_hat_str} | {s_hat_str}")
        
        params = optimize_algorithm.calculate(params, gradients)
        
        if loss < 1e-8:
            break

    print(f"\nFinal: x={params[0]:.6f}, y={params[1]:.6f}")
    print(f"Final loss: {((params[0]-1)**2 + (params[1]-2)**2):.10f}")

In [43]:
if __name__ == "__main__":
    params = np.array([0.0, 0.0])
    optimizer = Adam(learning_rate=0.1, beta1=0.9, beta2=0.999)
    optimizer_yogi = Adam_Yogi(learning_rate=0.1, beta1=0.9, beta2=0.999)

    print("Adam Optimizer Demo")
    print("=" * 60)

    print("Optimizing f(x,y) = (x-1)^2 + (y-2)^2 with Adam")
    print("\nStep | x      | y      | Loss   | first-moment estimate  | second-moment estimate")
    print("-" * 65)

    calculate_gradient(optimizer, params)
    print("\n")
    print("Adam Yogi Optimizer Demo")
    print("=" * 60)

    print("Optimizing f(x,y) = (x-1)^2 + (y-2)^2 with Adam and Yogi improvement")
    print("\nStep | x      | y      | Loss   | first-moment estimate  | second-moment estimate")
    print("-" * 65)

    calculate_gradient(optimizer_yogi, params)
    
    
    
    
    

Adam Optimizer Demo
Optimizing f(x,y) = (x-1)^2 + (y-2)^2 with Adam

Step | x      | y      | Loss   | first-moment estimate  | second-moment estimate
-----------------------------------------------------------------
   0 |  0.000 |  0.000 | 5.0000 | (-20.000,-40.000) | (4000.000,16000.000)
   5 |  0.088 |  0.089 | 4.4834 | (-2.809,-5.723) | (365.118,1461.139)
  10 |  0.161 |  0.164 | 4.0745 | (-2.026,-4.269) | (191.763,768.254)
  15 |  0.236 |  0.244 | 3.6675 | (-1.731,-3.801) | (130.092,522.030)
  20 |  0.312 |  0.330 | 3.2604 | (-1.538,-3.544) | (98.391,395.633)

Final: x=0.390039, y=0.421850
Final loss: 2.8626104041


Adam Yogi Optimizer Demo
Optimizing f(x,y) = (x-1)^2 + (y-2)^2 with Adam and Yogi improvement

Step | x      | y      | Loss   | first-moment estimate  | second-moment estimate
-----------------------------------------------------------------
   0 |  0.000 |  0.000 | 5.0000 | (  init,  init) | (  init,  init)
   5 |  0.492 |  0.497 | 2.5180 | (-1.561,-3.559) | ( 2.652