Define the AdaDelta class based on its mathematical formulation.

In [2]:
import numpy as np

class AdaDelta:
    """AdaDelta optimizer operating on NumPy arrays.

    Two exponentially decaying running averages are maintained per parameter:
    ``s`` (average of squared gradients) and ``delta_x`` (average of squared
    parameter updates). Each step scales the gradient by
    ``sqrt(delta_x + constant) / sqrt(s + constant)``, so no explicit learning
    rate is required.

    Attributes:
        rho: Decay factor applied to both running averages.
        constant: Small value added inside both square roots for numerical
            stability (it also makes the very first step non-zero).
        s: Running average of squared gradients; ``None`` until the first
            call to :meth:`update`.
        delta_x: Running average of squared updates; ``None`` until the first
            call to :meth:`update`.
    """

    def __init__(self, rho=0.95, constant=1e-6):
        """Store the hyper-parameters; state is allocated lazily in ``update``.

        Args:
            rho: Decay factor for the running averages.
            constant: Stabilising term added under the square roots.
        """
        self.rho = rho
        self.constant = constant
        # Both accumulators are created on the first update, once the
        # parameter shape is known.
        self.s = None
        self.delta_x = None

    def update(self, params, gradients):
        """Perform one AdaDelta step and return the updated parameters.

        Args:
            params: Current parameter values (ndarray or any array-like such
                as a list or scalar).
            gradients: Gradient of the loss with respect to ``params``
                (ndarray or array-like, broadcastable against ``params``).

        Returns:
            The new parameter values, ``params - rescaled_gradient``. The
            input ``params`` is not modified.
        """
        # Accept lists/tuples/scalars as well as ndarrays; ndarrays (and
        # subclasses) pass through unchanged.
        params = np.asanyarray(params)
        gradients = np.asanyarray(gradients)

        if self.s is None:
            self.s = np.zeros_like(params)
            self.delta_x = np.zeros_like(params)

        # Accumulate the squared gradient first: the step below uses the
        # *updated* gradient average together with the *previous* update
        # average.
        self.s = self.rho * self.s + (1 - self.rho) * gradients**2
        rescaled_gradient = (np.sqrt(self.delta_x + self.constant) /
                             np.sqrt(self.s + self.constant)) * gradients

        new_params = params - rescaled_gradient

        # The applied update is -rescaled_gradient; its sign vanishes when
        # squared, so the rescaled gradient can be accumulated directly.
        self.delta_x = (self.rho * self.delta_x
                        + (1 - self.rho) * rescaled_gradient**2)

        return new_params

Run a small demo that tests the optimizer by minimizing f(x, y) = (x-2)^2 + (y-3)^2 from the starting point (0, 0).

In [None]:
if __name__ == "__main__":
    # Banner: title, blank line, objective description, then a rule.
    print(
        "AdaDelta Optimizer Demo",
        "\nOptimizing f(x,y) = (x-2)^2 + (y-3)^2",
        "=" * 55,
        sep="\n",
    )

    # Start from the origin; the minimum of the objective is at (2, 3).
    params = np.array([0.0, 0.0])
    optimizer = AdaDelta(rho=0.95, constant=1e-6)

    # Table header followed by a separator line.
    print(
        "\nStep | x      | y      | Loss   | RMS[g] (x,y) | RMS[Δx] (x,y)",
        "-" * 70,
        sep="\n",
    )

    for step in range(15):
        x, y = params

        # Loss and analytic gradient of f(x,y) = (x-2)^2 + (y-3)^2.
        loss = (x - 2)**2 + (y - 3)**2
        gradients = np.array([2*(x - 2), 2*(y - 3)])

        # The optimizer has no accumulated state before its first update,
        # so the RMS columns show a placeholder on the first row.
        if optimizer.s is None:
            rms_g_str = "(init,init)"
            rms_dx_str = "(init,init)"
        else:
            rms_g = np.sqrt(optimizer.s + optimizer.constant)
            rms_dx = np.sqrt(optimizer.delta_x + optimizer.constant)
            rms_g_str = f"({rms_g[0]:.3f},{rms_g[1]:.3f})"
            rms_dx_str = f"({rms_dx[0]:.3f},{rms_dx[1]:.3f})"

        print(f"{step:4d} | {x:6.3f} | {y:6.3f} | {loss:6.3f} | {rms_g_str:12} | {rms_dx_str}")

        # Take one AdaDelta step.
        params = optimizer.update(params, gradients)

        # Stop early once the loss measured before this step is negligible.
        if loss < 1e-8:
            break

    # Summary of where the optimizer ended up.
    print(
        f"\nFinal: x={params[0]:.6f}, y={params[1]:.6f}",
        f"Final loss: {((params[0]-2)**2 + (params[1]-3)**2):.10f}",
        sep="\n",
    )
    

AdaDelta Optimizer Demo

Optimizing f(x,y) = (x-2)^2 + (y-3)^2

Step | x      | y      | Loss   | RMS[g] (x,y) | RMS[Δx] (x,y)
----------------------------------------------------------------------
   0 |  0.000 |  0.000 | 13.000 | (init,init)  | (init,init)
   1 |  0.004 |  0.004 | 12.955 | (0.894,1.342) | (0.001,0.001)
   2 |  0.009 |  0.009 | 12.910 | (1.248,1.872) | (0.002,0.002)
   3 |  0.014 |  0.014 | 12.865 | (1.507,2.262) | (0.002,0.002)
   4 |  0.018 |  0.018 | 12.819 | (1.717,2.578) | (0.002,0.002)
   5 |  0.023 |  0.023 | 12.774 | (1.893,2.845) | (0.002,0.002)
   6 |  0.027 |  0.027 | 12.728 | (2.046,3.076) | (0.003,0.003)
   7 |  0.032 |  0.032 | 12.682 | (2.181,3.279) | (0.003,0.003)
   8 |  0.037 |  0.037 | 12.637 | (2.301,3.461) | (0.003,0.003)
   9 |  0.041 |  0.041 | 12.591 | (2.408,3.624) | (0.003,0.003)
  10 |  0.046 |  0.046 | 12.545 | (2.505,3.772) | (0.003,0.003)
  11 |  0.050 |  0.051 | 12.500 | (2.594,3.907) | (0.003,0.003)
  12 |  0.055 |  0.055 | 12.454 | (2.