adagrad_optimizer.py
# Copyright (C) 2020-2023 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
"""Adagrad optimizer module."""
from typing import Dict
from typing import Optional

import numpy as np

from .base_optimizer import Optimizer


class NumPyAdagrad(Optimizer):
    """Adagrad optimizer implementation.

    Original paper: http://jmlr.org/papers/v12/duchi11a.html
    """
    def __init__(
            self,
            *,
            params: Optional[Dict[str, np.ndarray]] = None,
            model_interface=None,
            learning_rate: float = 0.01,
            initial_accumulator_value: float = 0.1,
            epsilon: float = 1e-10,
    ) -> None:
        """Initialize.

        Args:
            params: Parameters to be stored for optimization.
            model_interface: Model interface instance to provide parameters.
            learning_rate: Tuning parameter that determines
                the step size at each iteration.
            initial_accumulator_value: Initial value for squared gradients.
            epsilon: Value for computational stability.
        """
        super().__init__()

        if model_interface is None and params is None:
            raise ValueError('Should provide one of the params or model_interface')
        if learning_rate < 0:
            raise ValueError(
                f'Invalid learning rate: {learning_rate}. Learning rate must be >= 0.')
        if initial_accumulator_value < 0:
            raise ValueError(
                f'Invalid initial_accumulator_value value: {initial_accumulator_value}. '
                'Initial accumulator value must be >= 0.')
        if epsilon <= 0:
            raise ValueError(
                f'Invalid epsilon value: {epsilon}. Epsilon value must be > 0.')

        self.params = params
        if params is None and model_interface is not None:
            self._set_params_from_model(model_interface)

        self.learning_rate = learning_rate
        self.initial_accumulator_value = initial_accumulator_value
        self.epsilon = epsilon

        # One squared-gradient accumulator per parameter, seeded with the initial value.
        self.grads_squared = {}
        for param_name in self.params:
            self.grads_squared[param_name] = np.full_like(self.params[param_name],
                                                          self.initial_accumulator_value)

    def _update_param(self, grad_name: str, grad: np.ndarray) -> None:
        """Update params by given gradients."""
        self.params[grad_name] -= (self.learning_rate * grad
                                   / (np.sqrt(self.grads_squared[grad_name]) + self.epsilon))

    def step(self, gradients: Dict[str, np.ndarray]) -> None:
        """Perform a single step for parameter update.

        Implement the Adagrad optimizer weights update rule.

        Args:
            gradients: Partial derivatives with respect to optimized parameters.
        """
        for grad_name in gradients:
            if grad_name not in self.grads_squared:
                raise KeyError(f"Key {grad_name} doesn't exist in optimized parameters")
            grad = gradients[grad_name]
            # Accumulate squared gradients, then apply the scaled update.
            self.grads_squared[grad_name] = self.grads_squared[grad_name] + grad**2
            self._update_param(grad_name, grad)
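

# Minimal usage sketch (illustrative only; the parameter name 'weights' and the
# flat import are assumptions, not part of this module):
#
#   import numpy as np
#   # Import path depends on the package layout; shown flat here for brevity.
#   from adagrad_optimizer import NumPyAdagrad
#
#   params = {'weights': np.array([1.0, -2.0, 3.0])}
#   optimizer = NumPyAdagrad(params=params, learning_rate=0.1)
#   for _ in range(5):
#       # Gradient of f(w) = 0.5 * ||w||^2 is w itself; params is updated in place.
#       optimizer.step({'weights': params['weights'].copy()})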