Import neccessary library

In [2]:
import numpy as np

Define a AdaGrad class which construct by mathematic formula

In [3]:
class Adagrad:
    
    def __init__(self, learning_rate: float = 0.01, constant: float = 1e-8):
        self.learning_rate = learning_rate
        self.constant = constant

        # s(i,t)
        self.accumulated_gradients = None

        
    def calculate_param(self, params: np.ndarray, gradients: np.ndarray):
        if self.accumulated_gradients is None:
            self.accumulated_gradients = np.zeros_like(params)
        
        self.accumulated_gradients += gradients ** 2
        
        # Compute adaptive learning rate
        adaptive_lr = self.learning_rate / (np.sqrt(self.accumulated_gradients) + self.constant)
        
        # Update parameters
        updated_params = params - adaptive_lr * gradients
        
        return updated_params
    
    def get_accumulated_gradients(self):
        if self.accumulated_gradients is not None:
            return self.accumulated_gradients
        return None

Provide a function for testing and visualize it

In [13]:
if __name__ == "__main__":
    
    # Initial parameters
    params = np.array([0.0, 0.0])
    optimizer = Adagrad(learning_rate=0.1)
    print("Function: ")
    print("f(x,y) = 10*(x)^2 + (y + 10)^2")
    print("=" * 55)
    print("\nStep | x      | y      | Loss   | Adaptive LR (x,y)")
    print("-" * 55)
    
    for step in range(50):
        x, y = params[0], params[1]
        
        loss = 5*(x + 50)**3 + (y + 10)**2
        gradients = np.array([15*(x + 5), 2*(y + 10)])
        
        # Show adaptive learning rates
        if optimizer.accumulated_gradients is not None:
            # Calculate the learning rate
            adaptive_lr = optimizer.learning_rate / (np.sqrt(optimizer.accumulated_gradients) + optimizer.constant)
            lr_str = f"({adaptive_lr[0]:.3f},{adaptive_lr[1]:.3f})"
        else:
            lr_str = "(init,init)"
        
        if step % 4 == 0:
            print(f"{step:4d} | {x:6.3f} | {y:6.3f} | {loss:6.3f} | {lr_str}")
        
        # Update parameters
        params = optimizer.calculate_param(params, gradients)
        
        if loss < 1e-6:
            break
    
    print("We could see how learning rate adapt through different gradient level")


Function: 
f(x,y) = 10*(x)^2 + (y + 10)^2

Step | x      | y      | Loss   | Adaptive LR (x,y)
-------------------------------------------------------
   0 |  0.000 |  0.000 | 625100.000 | (init,init)
   4 | -0.276 | -0.277 | 614811.867 | (0.001,0.003)
   8 | -0.430 | -0.433 | 609116.662 | (0.000,0.002)
  12 | -0.549 | -0.555 | 604743.723 | (0.000,0.001)
  16 | -0.649 | -0.658 | 601081.553 | (0.000,0.001)
  20 | -0.736 | -0.748 | 597881.238 | (0.000,0.001)
  24 | -0.815 | -0.830 | 595012.175 | (0.000,0.001)
  28 | -0.887 | -0.905 | 592395.729 | (0.000,0.001)
  32 | -0.954 | -0.975 | 589980.203 | (0.000,0.001)
  36 | -1.017 | -1.040 | 587729.456 | (0.000,0.001)
  40 | -1.075 | -1.102 | 585617.046 | (0.000,0.001)
  44 | -1.131 | -1.160 | 583622.934 | (0.000,0.001)
  48 | -1.184 | -1.216 | 581731.504 | (0.000,0.001)
We could see how learning rate adapt through different gradient level
