Add Max-Boltzmann policy, combination of eps-greedy and Boltzmann (#122)

* Add Max-Boltzmann policy, a combination of eps-greedy and Boltzmann. Reference: Wiering, M.: Explorations in Efficient Reinforcement Learning. PhD thesis, University of Amsterdam, Amsterdam (1999). * Add reference
keras-rl · Nov 30, 2017 · bb79f78 · bb79f78
1 parent 09ca76a
commit bb79f78
Showing 1 changed file with 36 additions and 0 deletions.
diff --git a/rl/policy.py b/rl/policy.py
@@ -121,6 +121,42 @@ def get_config(self):
         return config
 
 
+class MaxBoltzmannQPolicy(Policy):
+    """
+    A combination of the eps-greedy and Boltzmann q-policy.
+
+    Wiering, M.: Explorations in Efficient Reinforcement Learning.
+    PhD thesis, University of Amsterdam, Amsterdam (1999)
+
+    https://pure.uva.nl/ws/files/3153478/8461_UBA003000033.pdf
+    """
+    def __init__(self, eps=.1, tau=1., clip=(-500., 500.)):
+        super(MaxBoltzmannQPolicy, self).__init__()
+        self.eps = eps  # probability of taking the Boltzmann-sampling branch
+        self.tau = tau  # temperature: q_values are divided by it before exp
+        self.clip = clip  # (min, max) bounds applied to q_values / tau before exp
+
+    def select_action(self, q_values):
+        assert q_values.ndim == 1  # requires a 1-D array; its length is the action count
+        q_values = q_values.astype('float64')
+        nb_actions = q_values.shape[0]
+
+        if np.random.uniform() < self.eps:  # with probability eps: Boltzmann sampling
+            exp_values = np.exp(np.clip(q_values / self.tau, self.clip[0], self.clip[1]))
+            probs = exp_values / np.sum(exp_values)  # normalize so the weights sum to 1
+            action = np.random.choice(range(nb_actions), p=probs)
+        else:  # otherwise act greedily on the raw q_values
+            action = np.argmax(q_values)
+        return action
+
+    def get_config(self):
+        config = super(MaxBoltzmannQPolicy, self).get_config()
+        config['eps'] = self.eps
+        config['tau'] = self.tau
+        config['clip'] = self.clip
+        return config
+
+
 class BoltzmannGumbelQPolicy(Policy):
     """Implements Boltzmann-Gumbel exploration (BGE) adapted for Q learning
     based on the paper Boltzmann Exploration Done Right