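Define an Adam optimizer implementation based on its mathematical formulas. For gradient $g_t$ at step $t$:

$$v_t = \beta_1 v_{t-1} + (1-\beta_1)\,g_t, \qquad s_t = \beta_2 s_{t-1} + (1-\beta_2)\,g_t^2$$

$$\hat{v}_t = \frac{v_t}{1-\beta_1^t}, \qquad \hat{s}_t = \frac{s_t}{1-\beta_2^t}, \qquad \theta_t = \theta_{t-1} - \frac{\eta\,\hat{v}_t}{\sqrt{\hat{s}_t}+\epsilon}$$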

In [1]:
import numpy as np

class Adam:
    """Adam optimizer (adaptive moment estimation) implemented with NumPy.

    Keeps exponential moving averages of the gradient (``v``) and of the
    squared gradient (``s``), corrects both for their zero initialisation,
    and uses them to scale each parameter update.

    Parameters
    ----------
    learning_rate : float
        Step size multiplying every update.
    beta1 : float
        Decay rate of the first-moment average; must lie in [0, 1).
    beta2 : float
        Decay rate of the second-moment average; must lie in [0, 1).
    epsilon : float
        Constant added to the denominator to avoid division by zero.

    Raises
    ------
    ValueError
        If ``beta1`` or ``beta2`` is outside [0, 1).
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        # A decay rate of exactly 1 makes the bias-correction denominator
        # (1 - beta**t) zero on every step, so reject it up front instead of
        # silently producing inf/NaN parameters later.
        for name, beta in (("beta1", beta1), ("beta2", beta2)):
            if not 0.0 <= beta < 1.0:
                raise ValueError(f"{name} must be in [0, 1), got {beta!r}")

        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon

        # State variables
        self.v = None  # First moment estimate (momentum)
        self.s = None  # Second moment estimate (variance)
        self.t = 0     # Time step counter

    def update(self, params, gradients):
        """Perform one Adam step and return the updated parameters.

        Parameters
        ----------
        params : array_like
            Current parameter values.
        gradients : array_like
            Gradient of the loss with respect to ``params``.

        Returns
        -------
        numpy.ndarray
            ``params`` minus the rescaled gradient. The input is not
            modified; a new array is returned.

        Notes
        -----
        The moment buffers are allocated on the first call with the shape of
        ``params`` and are reused on every later call, and ``self.t`` is
        incremented each time this method runs.
        """
        # Accept lists, tuples and scalars as well as ndarrays; arrays that
        # are already ndarrays pass through unchanged.
        params = np.asarray(params)
        gradients = np.asarray(gradients)

        # Initialize state on first call
        if self.v is None:
            self.v = np.zeros_like(params)
            self.s = np.zeros_like(params)

        # Increment time step
        self.t += 1

        # Exponential moving averages of the gradient and squared gradient
        self.v = self.beta1 * self.v + (1 - self.beta1) * gradients
        self.s = self.beta2 * self.s + (1 - self.beta2) * gradients**2

        # Bias correction: both averages start at zero, so early values are
        # divided by (1 - beta**t) to undo the shrinkage.
        v_hat = self.v / (1 - self.beta1**self.t)
        s_hat = self.s / (1 - self.beta2**self.t)

        rescaled_gradient = self.learning_rate * v_hat / (np.sqrt(s_hat) + self.epsilon)
        new_params = params - rescaled_gradient

        return new_params

Demonstrate and test the optimizer on a simple 2D quadratic function

In [2]:
# Demo: run Adam on a small convex problem and print a progress table
if __name__ == "__main__":
    print("Adam Optimizer Demo")
    print("=" * 60)

    # Example 1: minimise a 2D quadratic bowl whose minimum is at (1, 2)
    print("\nExample 1: Optimizing f(x,y) = (x-1)^2 + (y-2)^2")
    print("Target: x=1, y=2")

    # Start at the origin with a step size larger than the class default
    optimizer = Adam(learning_rate=0.1, beta1=0.9, beta2=0.999)
    params = np.array([0.0, 0.0])

    print("\nStep | x      | y      | Loss   | v̂ (x,y)     | ŝ (x,y)")
    print("-" * 65)

    for step in range(25):
        x, y = params

        # Loss f(x,y) = (x-1)^2 + (y-2)^2 and its gradient (2(x-1), 2(y-2))
        loss = (x - 1)**2 + (y - 2)**2
        gradients = np.array([2*(x - 1), 2*(y - 2)])

        # Bias-corrected moments left by the previous update; before the
        # first update the optimizer holds no state, so show a placeholder.
        if optimizer.v is None or optimizer.t <= 0:
            v_hat_str = "(  init,  init)"
            s_hat_str = "(  init,  init)"
        else:
            v_hat = optimizer.v / (1 - optimizer.beta1**optimizer.t)
            s_hat = optimizer.s / (1 - optimizer.beta2**optimizer.t)
            v_hat_str = f"({v_hat[0]:6.3f},{v_hat[1]:6.3f})"
            s_hat_str = f"({s_hat[0]:6.3f},{s_hat[1]:6.3f})"

        # Report every fifth step only, to keep the table short
        if not step % 5:
            print(f"{step:4d} | {x:6.3f} | {y:6.3f} | {loss:6.4f} | {v_hat_str} | {s_hat_str}")

        # Take one Adam step
        params = optimizer.update(params, gradients)

        # Stop early once the loss measured before this step was negligible
        if loss < 1e-8:
            break

    print(f"\nFinal: x={params[0]:.6f}, y={params[1]:.6f}")
    print(f"Final loss: {((params[0]-1)**2 + (params[1]-2)**2):.10f}")

    print("\nAdam combines the benefits of:")
    print("- Momentum (helps navigate ravines and accelerates convergence)")
    print("- Adaptive learning rates (handles different parameter scales)")
    print("- Bias correction (prevents initial steps from being too small)")

Adam Optimizer Demo

Example 1: Optimizing f(x,y) = (x-1)^2 + (y-2)^2
Target: x=1, y=2

Step | x      | y      | Loss   | v̂ (x,y)     | ŝ (x,y)
-----------------------------------------------------------------
   0 |  0.000 |  0.000 | 5.0000 | (  init,  init) | (  init,  init)
   5 |  0.492 |  0.497 | 2.5169 | (-1.561,-3.559) | ( 2.645,13.043)
  10 |  0.924 |  0.975 | 1.0556 | (-0.967,-2.942) | ( 1.565, 9.978)
  15 |  1.202 |  1.412 | 0.3871 | (-0.403,-2.278) | ( 1.054, 7.614)
  20 |  1.271 |  1.778 | 0.1230 | ( 0.018,-1.607) | ( 0.850, 5.899)

Final: x=1.180554, y=2.046737
Final loss: 0.0347842548

Adam combines the benefits of:
- Momentum (helps navigate ravines and accelerates convergence)
- Adaptive learning rates (handles different parameter scales)
- Bias correction (prevents initial steps from being too small)
