Commit

Merge 290b4cc into e6ef6d7
ishikota committed Nov 26, 2016
2 parents e6ef6d7 + 290b4cc commit bb684c5
Showing 24 changed files with 582 additions and 32 deletions.
4 changes: 2 additions & 2 deletions kyoka/algorithm/deep_q_learning.py
@@ -78,7 +78,7 @@ def _gen_backup_data(self, task, greedy_policy, value_function, experience):
def _gen_replay_memory_save_path(self, dir_path):
return os.path.join(dir_path, self.SAVE_FILE_NAME)

class BaseDeepQLearningApproxActionValueFunction(BaseApproxActionValueFunction):
class DeepQLearningApproxActionValueFunction(BaseApproxActionValueFunction):

def initialize_network(self):
err_msg = build_not_implemented_msg(self, "initialize_network")
@@ -170,6 +170,6 @@ def predict_value(value_function, next_state, next_action):

def validate_value_function(value_function):
value_function_check("DeepQLearning",\
[BaseDeepQLearningApproxActionValueFunction],\
[DeepQLearningApproxActionValueFunction],\
value_function)

12 changes: 9 additions & 3 deletions kyoka/algorithm/montecarlo.py
@@ -6,6 +6,9 @@

class MonteCarlo(BaseRLAlgorithm):

def __init__(self, gamma=1):
self.gamma = gamma

def setup(self, task, policy, value_function):
validate_value_function(value_function)
super(MonteCarlo, self).setup(task, policy, value_function)
@@ -20,7 +23,10 @@ def run_gpi_for_an_episode(self, task, policy, value_function):
def _calculate_following_state_reward(self, current_turn, episode):
following_turn_info = episode[current_turn:]
following_reward = [reward for _, _, _, reward in following_turn_info]
return sum(following_reward)
return sum([self.__discount(step, reward) for step, reward in enumerate(following_reward)])

def __discount(self, step, reward):
return self.gamma ** step * reward

class MonteCarloTabularActionValueFunction(BaseTabularActionValueFunction):

@@ -56,11 +62,11 @@ def _calc_average_in_incremental_way(self, k, r, Q):
def _gen_update_counter_file_path(self, dir_path):
return os.path.join(dir_path, self.SAVE_FILE_NAME)

class BaseMonteCarloApproxActionValueFunction(BaseApproxActionValueFunction):
class MonteCarloApproxActionValueFunction(BaseApproxActionValueFunction):
pass

def validate_value_function(value_function):
value_function_check("MonteCarlo",\
[MonteCarloTabularActionValueFunction, BaseMonteCarloApproxActionValueFunction],\
[MonteCarloTabularActionValueFunction, MonteCarloApproxActionValueFunction],\
value_function)
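
The `gamma` parameter added above turns the plain sum of following rewards into a discounted return. A minimal standalone sketch of what `__discount` computes (plain Python, not tied to the library):

```python
# Discounted return of the rewards collected after the current turn:
# G = r_0 + gamma * r_1 + gamma**2 * r_2 + ...
def discounted_return(rewards, gamma=1.0):
    return sum(gamma ** step * reward for step, reward in enumerate(rewards))

# With the default gamma=1 this reduces to the old behaviour (a plain sum);
# with a small gamma (e.g. 0.1, as the blocking-maze sample below uses),
# rewards far in the future contribute almost nothing.
print(discounted_return([0, 0, 1]))             # 1.0
print(discounted_return([0, 0, 1], gamma=0.1))  # ~0.01
```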

4 changes: 2 additions & 2 deletions kyoka/algorithm/q_learning.py
@@ -39,7 +39,7 @@ def backup(self, state, action, backup_target, alpha):
new_Q_value = Q_value + alpha * (backup_target - Q_value)
self.insert_value_into_table(self.table, state, action, new_Q_value)

class BaseQLearningApproxActionValueFunction(BaseApproxActionValueFunction):
class QLearningApproxActionValueFunction(BaseApproxActionValueFunction):
pass

ACTION_ON_TERMINAL_FLG = "action_on_terminal"
@@ -58,6 +58,6 @@ def predict_value(value_function, next_state, next_action):

def validate_value_function(value_function):
value_function_check("QLearning",\
[QLearningTabularActionValueFunction, BaseQLearningApproxActionValueFunction],\
[QLearningTabularActionValueFunction, QLearningApproxActionValueFunction],\
value_function)

4 changes: 2 additions & 2 deletions kyoka/algorithm/sarsa.py
@@ -36,7 +36,7 @@ def backup(self, state, action, backup_target, alpha):
new_Q_value = Q_value + alpha * (backup_target - Q_value)
self.insert_value_into_table(self.table, state, action, new_Q_value)

class BaseSarsaApproxActionValueFunction(BaseApproxActionValueFunction):
class SarsaApproxActionValueFunction(BaseApproxActionValueFunction):
pass

ACTION_ON_TERMINAL_FLG = "action_on_terminal"
@@ -55,6 +55,6 @@ def predict_value(value_function, next_state, next_action):

def validate_value_function(value_function):
value_function_check("Sarsa",\
[SarsaTabularActionValueFunction, BaseSarsaApproxActionValueFunction],\
[SarsaTabularActionValueFunction, SarsaApproxActionValueFunction],\
value_function)

66 changes: 66 additions & 0 deletions sample/maze/README.md
@@ -0,0 +1,66 @@
# Maze Task example
A simple testbed for reinforcement learning algorithms.
The goal of the agent is to escape from the maze in as few steps as possible.

## Sample mazes
We prepared 3 kinds of mazes, which are used in the book [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/the-book-2nd.html).
### 1. Dyna Maze
The simplest testbed.
`S` indicates the start point of the maze, `G` is the goal, and `X` is a block (the agent cannot move onto a block cell).
```
-------XG
--X----X-
S-X----X-
--X------
-----X---
---------
```
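
As a rough sketch of how a maze text like this maps onto a grid (illustrative only; `MazeTask.read_maze` in this repository may differ in detail, and the names below are made up):

```python
# Illustrative only: parse the maze text into a 2-D grid and locate S and G.
MAZE = """\
-------XG
--X----X-
S-X----X-
--X------
-----X---
---------"""

grid = [list(line) for line in MAZE.splitlines()]
start = next((r, c) for r, row in enumerate(grid)
             for c, cell in enumerate(row) if cell == "S")
goal = next((r, c) for r, row in enumerate(grid)
            for c, cell in enumerate(row) if cell == "G")
print(start)  # (2, 0)
print(goal)   # (0, 8)
```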

### 2. Blocking Maze
After some steps the structure of the maze changes, and the previous best path is blocked.
So the agent has to notice that the maze has changed and learn another path.

```
Before After
--------G --------G
--------- ---------
--------- => ---------
XXXXXXXX- -XXXXXXXX
--------- ---------
---S----- ---S-----
```

### 3. Shortcut Maze
After some steps the structure of the maze changes, and a new best path (a shortcut) appears.
So the agent has to discover the better path by continuing to explore.
```
Before After
--------G --------G
--------- ---------
--------- => ---------
-XXXXXXXX -XXXXXXX-
--------- ---------
---S----- ---S-----
```

## Sample code
We prepared sample code that runs RL algorithms on these mazes.
You can find the sample code under the [script](./script) directory.

If you want to try the `Dyna Maze` with the `montecarlo` method, run `python sample/maze/script/dyna_maze/montecarlo.py`.
After training finishes, the policy the agent learned is visualized on the console like below.
```
>>> python sample/maze/script/dyna_maze/montecarlo.py
[Progress] Start GPI iteration for 100 times
(some logs are output...)
[Progress] Completed GPI iteration for 100 times. (total time: 0s)
[MazePerformanceWatcher] Policy which agent learned is like this.
v>>>vvv-G
v^-^>vv-^
>v->>>v-^
vv->^^>>^
>v<^<-^>^
^>>>>>^>^
```
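
The arrow grid is simply the greedy action in each cell (blocked cells are drawn as `-`). A toy standalone illustration of that kind of rendering; the icons and the `Q` table here are made up, and the library's actual `MazePerformanceWatcher`/`flg2icon` code may differ:

```python
# Toy example: render the greedy action of each cell as an arrow,
# similar in spirit to the console output above (not the library's code).
ACTION_ICON = {"UP": "^", "DOWN": "v", "LEFT": "<", "RIGHT": ">"}

# A made-up 2x3 action-value table: Q[row][col][action]
Q = [
    [{"RIGHT": 1.0, "DOWN": 0.2}, {"RIGHT": 1.2, "UP": 0.1}, {"DOWN": 2.0}],
    [{"UP": 0.5, "RIGHT": 0.9},   {"RIGHT": 1.5},            {"UP": 0.3}],
]

for row in Q:
    print("".join(ACTION_ICON[max(q, key=q.get)] for q in row))
# >>v
# >>^
```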

8 changes: 8 additions & 0 deletions sample/maze/helper.py
@@ -21,6 +21,14 @@ def visualize_policy(task, value_function):
visualized_actions = [[flg2icon(flg) for flg in line] for line in actions]
return visualize_maze(["".join(line) for line in visualized_actions])

def construct_features(task, state, action):
w, h = task.get_maze_shape()
next_state = task.transit_state(state, action)
onehot = [[1 if next_state == (row, col) else 0 for col in range(h)] for row in range(w)]
return _flatten(onehot)

def _flatten(table):
return [item for sublist in table for item in sublist]

def _find_best_actions_on_each_cell(task, value_function):
height, width = task.get_maze_shape()
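
The new `construct_features` helper above one-hot encodes the cell the chosen action would lead to (note that `get_maze_shape()` appears to return `(height, width)`, as the unpacking in `_find_best_actions_on_each_cell` shows, so the flattening is row-major). A standalone sketch of the same encoding, independent of `MazeTask`:

```python
# One-hot encoding of a cell position in a height x width maze, flattened
# row-major -- the same kind of feature vector construct_features builds.
def one_hot_cell(position, height, width):
    row, col = position
    features = [0] * (height * width)
    features[row * width + col] = 1
    return features

# For the 6x9 Dyna maze, cell (2, 0) maps to index 2 * 9 + 0 = 18.
print(one_hot_cell((2, 0), height=6, width=9).index(1))  # 18
```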
2 changes: 1 addition & 1 deletion sample/maze/script/blocking_maze/montecarlo.py
@@ -54,7 +54,7 @@ def insert_value_into_table(self, table, state, action, new_value):
transfomer.set_transformation(50, TRANSFORMATION_FILE_PATH)
callbacks.append(transfomer)

algorithm = MonteCarlo()
algorithm = MonteCarlo(gamma=0.1)
algorithm.setup(task, policy, value_func)
algorithm.run_gpi(TEST_LENGTH, callbacks=callbacks)

72 changes: 72 additions & 0 deletions sample/maze/script/dyna_maze/deep_q_learning.py
@@ -0,0 +1,72 @@
#!/usr/local/bin/python

# Resolve path configuration
import os
import sys
import argparse

root = os.path.join(os.path.dirname(__file__), "../"*4)
src_path = os.path.join(root, "kyoka")
sample_path = os.path.join(root, "sample")
sys.path.append(root)
sys.path.append(src_path)
sys.path.append(sample_path)

import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense

import sample.maze.helper as Helper
from sample.maze.task import MazeTask
from sample.maze.callback import MazePerformanceWatcher

from kyoka.algorithm.deep_q_learning import DeepQLearning, DeepQLearningApproxActionValueFunction
from kyoka.policy import EpsilonGreedyPolicy

class MazeApproxActionValueFunction(DeepQLearningApproxActionValueFunction):

def __init__(self, task):
super(MazeApproxActionValueFunction, self).__init__()
self.task = task

def initialize_network(self):
maze_shape = self.task.get_maze_shape()
input_dim = maze_shape[0] * maze_shape[1]
model = Sequential()
model.add(Dense(1, input_dim=input_dim))
model.compile(loss="mse", optimizer="adam")
return model

def deepcopy_network(self, q_network):
q_hat_network = self.initialize_network()
for original_layer, copy_layer in zip(q_network.layers, q_hat_network.layers):
copy_layer.set_weights(original_layer.get_weights())
return q_hat_network

def predict_value_by_network(self, network, state, action):
features = self.construct_features(state, action)
return network.predict_on_batch(np.array([features]))[0][0]

def backup_on_minibatch(self, q_network, backup_minibatch):
minibatch = [(self.construct_features(state, action), target) for state, action, target in backup_minibatch]
X = np.array([x for x, _ in minibatch])
y = np.array([y for _, y in minibatch])
loss = q_network.train_on_batch(X, y)

def construct_features(self, state, action):
return Helper.construct_features(self.task, state, action)

MAZE_FILE_PATH = os.path.join(os.path.dirname(__file__), "dyna.txt")

task = MazeTask()
task.read_maze(MAZE_FILE_PATH)
value_func = MazeApproxActionValueFunction(task)

TEST_LENGTH = 100
policy = EpsilonGreedyPolicy(eps=0.1)
policy.set_eps_annealing(1.0, 0.1, 50)
callbacks = [MazePerformanceWatcher()]
algorithm = DeepQLearning(N=100, C=100, minibatch_size=32, replay_start_size=50)
algorithm.setup(task, policy, value_func)
algorithm.run_gpi(TEST_LENGTH, callbacks=callbacks)
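
This script, like the other samples, anneals epsilon via `policy.set_eps_annealing(1.0, 0.1, 50)`; I read the arguments as a decay from 1.0 down to 0.1 over the first 50 episodes, though the exact semantics are defined in `kyoka.policy`. A standalone sketch of linear epsilon annealing, not kyoka's implementation:

```python
# Standalone sketch of linear epsilon annealing (not kyoka's implementation):
# start fully exploratory and decay toward a floor over a fixed number of episodes.
def annealed_eps(episode, start=1.0, final=0.1, anneal_episodes=50):
    if episode >= anneal_episodes:
        return final
    return start + (final - start) * episode / float(anneal_episodes)

print([round(annealed_eps(e), 2) for e in (0, 25, 50, 100)])  # [1.0, 0.55, 0.1, 0.1]
```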

39 changes: 37 additions & 2 deletions sample/maze/script/dyna_maze/montecarlo.py
@@ -12,11 +12,19 @@
sys.path.append(src_path)
sys.path.append(sample_path)

try:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
except ImportError:
pass

import sample.maze.helper as Helper
from sample.maze.task import MazeTask
from sample.maze.callback import MazePerformanceWatcher

from kyoka.algorithm.montecarlo import MonteCarlo, MonteCarloTabularActionValueFunction
from kyoka.algorithm.montecarlo import MonteCarlo,\
MonteCarloTabularActionValueFunction, MonteCarloApproxActionValueFunction
from kyoka.policy import EpsilonGreedyPolicy

class MazeTabularActionValueFunction(MonteCarloTabularActionValueFunction):
@@ -38,19 +46,46 @@ def insert_value_into_table(self, table, state, action, new_value):
row, col = state
table[row][col][action] = new_value

class MazeApproxActionValueFunction(MonteCarloApproxActionValueFunction):

def __init__(self, task):
super(MazeApproxActionValueFunction, self).__init__()
self.task = task

def setup(self):
super(MazeApproxActionValueFunction, self).setup()
self.model = self._build_linear_model()
self.model.compile(loss="mse", optimizer="adam")

def _build_linear_model(self):
maze_shape = self.task.get_maze_shape()
input_dim = maze_shape[0] * maze_shape[1]
model = Sequential()
model.add(Dense(1, input_dim=input_dim))
return model

def construct_features(self, state, action):
return Helper.construct_features(self.task, state, action)

def approx_predict_value(self, features):
return self.model.predict_on_batch(np.array([features]))[0][0]

def approx_backup(self, features, backup_target, alpha):
loss = self.model.train_on_batch(np.array([features]), np.array([backup_target]))

MAZE_FILE_PATH = os.path.join(os.path.dirname(__file__), "dyna.txt")

task = MazeTask()
task.read_maze(MAZE_FILE_PATH)
value_func = MazeTabularActionValueFunction(task.get_maze_shape())
#value_func = MazeApproxActionValueFunction(task)

TEST_LENGTH = 100

policy = EpsilonGreedyPolicy(eps=0.1)
policy.set_eps_annealing(1.0, 0.1, 50)
callbacks = [MazePerformanceWatcher()]
algorithm = MonteCarlo()
algorithm = MonteCarlo(gamma=0.01)
algorithm.setup(task, policy, value_func)
algorithm.run_gpi(TEST_LENGTH, callbacks=callbacks)

36 changes: 35 additions & 1 deletion sample/maze/script/dyna_maze/q_learning.py
@@ -12,11 +12,18 @@
sys.path.append(src_path)
sys.path.append(sample_path)

try:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
except ImportError:
pass

import sample.maze.helper as Helper
from sample.maze.task import MazeTask
from sample.maze.callback import MazePerformanceWatcher

from kyoka.algorithm.q_learning import QLearning, QLearningTabularActionValueFunction
from kyoka.algorithm.q_learning import QLearning, QLearningTabularActionValueFunction, QLearningApproxActionValueFunction
from kyoka.policy import EpsilonGreedyPolicy

class MazeTabularActionValueFunction(QLearningTabularActionValueFunction):
@@ -38,12 +45,39 @@ def insert_value_into_table(self, table, state, action, new_value):
row, col = state
table[row][col][action] = new_value

class MazeApproxActionValueFunction(QLearningApproxActionValueFunction):

def __init__(self, task):
super(MazeApproxActionValueFunction, self).__init__()
self.task = task

def setup(self):
super(MazeApproxActionValueFunction, self).setup()
self.model = self._build_linear_model()
self.model.compile(loss="mse", optimizer="adam")

def _build_linear_model(self):
maze_shape = self.task.get_maze_shape()
input_dim = maze_shape[0] * maze_shape[1]
model = Sequential()
model.add(Dense(1, input_dim=input_dim))
return model

def construct_features(self, state, action):
return Helper.construct_features(self.task, state, action)

def approx_predict_value(self, features):
return self.model.predict_on_batch(np.array([features]))[0][0]

def approx_backup(self, features, backup_target, alpha):
loss = self.model.train_on_batch(np.array([features]), np.array([backup_target]))

MAZE_FILE_PATH = os.path.join(os.path.dirname(__file__), "dyna.txt")

task = MazeTask()
task.read_maze(MAZE_FILE_PATH)
value_func = MazeTabularActionValueFunction(task.get_maze_shape())
#value_func = MazeApproxActionValueFunction(task)

TEST_LENGTH = 100
policy = EpsilonGreedyPolicy(eps=0.1)
