Commit

Merge 290b4cc into e6ef6d7
ishikota committed Nov 26, 2016
2 parents e6ef6d7 + 290b4cc commit bb684c5
Showing 24 changed files with 582 additions and 32 deletions.
4 changes: 2 additions & 2 deletions kyoka/algorithm/deep_q_learning.py
@@ -78,7 +78,7 @@ def _gen_backup_data(self, task, greedy_policy, value_function, experience):
def _gen_replay_memory_save_path(self, dir_path):
return os.path.join(dir_path, self.SAVE_FILE_NAME)

class BaseDeepQLearningApproxActionValueFunction(BaseApproxActionValueFunction):
class DeepQLearningApproxActionValueFunction(BaseApproxActionValueFunction):

def initialize_network(self):
err_msg = build_not_implemented_msg(self, "initialize_network")
@@ -170,6 +170,6 @@ def predict_value(value_function, next_state, next_action):

def validate_value_function(value_function):
value_function_check("DeepQLearning",\
[BaseDeepQLearningApproxActionValueFunction],\
[DeepQLearningApproxActionValueFunction],\
value_function)

12 changes: 9 additions & 3 deletions kyoka/algorithm/montecarlo.py
@@ -6,6 +6,9 @@

class MonteCarlo(BaseRLAlgorithm):

def __init__(self, gamma=1):
self.gamma = gamma

def setup(self, task, policy, value_function):
validate_value_function(value_function)
super(MonteCarlo, self).setup(task, policy, value_function)
@@ -20,7 +23,10 @@ def run_gpi_for_an_episode(self, task, policy, value_function):
def _calculate_following_state_reward(self, current_turn, episode):
following_turn_info = episode[current_turn:]
following_reward = [reward for _, _, _, reward in following_turn_info]
return sum(following_reward)
return sum([self.__discount(step, reward) for step, reward in enumerate(following_reward)])

def __discount(self, step, reward):
return self.gamma ** step * reward

class MonteCarloTabularActionValueFunction(BaseTabularActionValueFunction):

@@ -56,11 +62,11 @@ def _calc_average_in_incremental_way(self, k, r, Q):
def _gen_update_counter_file_path(self, dir_path):
return os.path.join(dir_path, self.SAVE_FILE_NAME)

class BaseMonteCarloApproxActionValueFunction(BaseApproxActionValueFunction):
class MonteCarloApproxActionValueFunction(BaseApproxActionValueFunction):
pass

def validate_value_function(value_function):
value_function_check("MonteCarlo",\
[MonteCarloTabularActionValueFunction, BaseMonteCarloApproxActionValueFunction],\
[MonteCarloTabularActionValueFunction, MonteCarloApproxActionValueFunction],\
value_function)
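
The `gamma` parameter added above turns the plain sum of following rewards into a discounted return. A minimal standalone sketch of what `__discount` computes (plain Python, not tied to the library):

```python
# Discounted return of the rewards collected after the current turn:
# G = r_0 + gamma * r_1 + gamma**2 * r_2 + ...
def discounted_return(rewards, gamma=1.0):
    return sum(gamma ** step * reward for step, reward in enumerate(rewards))

# With the default gamma=1 this reduces to the old behaviour (a plain sum);
# with a small gamma (e.g. 0.1, as the blocking-maze sample below uses),
# rewards far in the future contribute almost nothing.
print(discounted_return([0, 0, 1]))             # 1.0
print(discounted_return([0, 0, 1], gamma=0.1))  # ~0.01
```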

4 changes: 2 additions & 2 deletions kyoka/algorithm/q_learning.py
@@ -39,7 +39,7 @@ def backup(self, state, action, backup_target, alpha):
new_Q_value = Q_value + alpha * (backup_target - Q_value)
self.insert_value_into_table(self.table, state, action, new_Q_value)

class BaseQLearningApproxActionValueFunction(BaseApproxActionValueFunction):
class QLearningApproxActionValueFunction(BaseApproxActionValueFunction):
pass

ACTION_ON_TERMINAL_FLG = "action_on_terminal"
@@ -58,6 +58,6 @@ def predict_value(value_function, next_state, next_action):

def validate_value_function(value_function):
value_function_check("QLearning",\
[QLearningTabularActionValueFunction, BaseQLearningApproxActionValueFunction],\
[QLearningTabularActionValueFunction, QLearningApproxActionValueFunction],\
value_function)

4 changes: 2 additions & 2 deletions kyoka/algorithm/sarsa.py
@@ -36,7 +36,7 @@ def backup(self, state, action, backup_target, alpha):
new_Q_value = Q_value + alpha * (backup_target - Q_value)
self.insert_value_into_table(self.table, state, action, new_Q_value)

class BaseSarsaApproxActionValueFunction(BaseApproxActionValueFunction):
class SarsaApproxActionValueFunction(BaseApproxActionValueFunction):
pass

ACTION_ON_TERMINAL_FLG = "action_on_terminal"
@@ -55,6 +55,6 @@ def predict_value(value_function, next_state, next_action):

def validate_value_function(value_function):
value_function_check("Sarsa",\
[SarsaTabularActionValueFunction, BaseSarsaApproxActionValueFunction],\
[SarsaTabularActionValueFunction, SarsaApproxActionValueFunction],\
value_function)

66 changes: 66 additions & 0 deletions sample/maze/README.md
@@ -0,0 +1,66 @@
# Maze Task example
A simple testbed for reinforcement learning algorithms.
The goal of the agent is to escape from the maze in as few steps as possible.

## Sample mazes
We prepared 3 kinds of mazes, which are used in the book [Reinforcement Learning: An Introduction](https://webdocs.cs.ualberta.ca/~sutton/book/the-book-2nd.html).
### 1. Dyna Maze
The simplest testbed.
`S` indicates the start point of the maze, `G` is the goal, and `X` is a block (the agent cannot move onto a block cell).
```
-------XG
--X----X-
S-X----X-
--X------
-----X---
---------
```
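
As a rough sketch of how a maze text like this maps onto a grid (illustrative only; `MazeTask.read_maze` in this repository may differ in detail, and the names below are made up):

```python
# Illustrative only: parse the maze text into a 2-D grid and locate S and G.
MAZE = """\
-------XG
--X----X-
S-X----X-
--X------
-----X---
---------"""

grid = [list(line) for line in MAZE.splitlines()]
start = next((r, c) for r, row in enumerate(grid)
             for c, cell in enumerate(row) if cell == "S")
goal = next((r, c) for r, row in enumerate(grid)
            for c, cell in enumerate(row) if cell == "G")
print(start)  # (2, 0)
print(goal)   # (0, 8)
```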

### 2. Blocking Maze
After some steps the structure of the maze changes, and the previous best path is blocked.
So the agent has to notice that the maze has changed and learn another path.

```
Before After
--------G --------G
--------- ---------
--------- => ---------
XXXXXXXX- -XXXXXXXX
--------- ---------
---S----- ---S-----
```

### 3. Shortcut Maze
After some steps the structure of the maze changes, and a new best path (a shortcut) appears.
So the agent has to discover the better path by continuing to explore.
```
Before After
--------G --------G
--------- ---------
--------- => ---------
-XXXXXXXX -XXXXXXX-
--------- ---------
---S----- ---S-----
```

## Sample code
We prepared sample code that runs RL algorithms on these mazes.
You can find the sample code under the [script](./script) directory.

If you want to try the `Dyna Maze` with the `montecarlo` method, run `python sample/maze/script/dyna_maze/montecarlo.py`.
After training finishes, the policy the agent learned is visualized on the console like below.
```
>>> python sample/maze/script/dyna_maze/montecarlo.py
[Progress] Start GPI iteration for 100 times
(some logs are output...)
[Progress] Completed GPI iteration for 100 times. (total time: 0s)
[MazePerformanceWatcher] Policy which agent learned is like this.
v>>>vvv-G
v^-^>vv-^
>v->>>v-^
vv->^^>>^
>v<^<-^>^
^>>>>>^>^
```
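
The arrow grid is simply the greedy action in each cell (blocked cells are drawn as `-`). A toy standalone illustration of that kind of rendering; the icons and the `Q` table here are made up, and the library's actual `MazePerformanceWatcher`/`flg2icon` code may differ:

```python
# Toy example: render the greedy action of each cell as an arrow,
# similar in spirit to the console output above (not the library's code).
ACTION_ICON = {"UP": "^", "DOWN": "v", "LEFT": "<", "RIGHT": ">"}

# A made-up 2x3 action-value table: Q[row][col][action]
Q = [
    [{"RIGHT": 1.0, "DOWN": 0.2}, {"RIGHT": 1.2, "UP": 0.1}, {"DOWN": 2.0}],
    [{"UP": 0.5, "RIGHT": 0.9},   {"RIGHT": 1.5},            {"UP": 0.3}],
]

for row in Q:
    print("".join(ACTION_ICON[max(q, key=q.get)] for q in row))
# >>v
# >>^
```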

8 changes: 8 additions & 0 deletions sample/maze/helper.py
@@ -21,6 +21,14 @@ def visualize_policy(task, value_function):
visualized_actions = [[flg2icon(flg) for flg in line] for line in actions]
return visualize_maze(["".join(line) for line in visualized_actions])

def construct_features(task, state, action):
w, h = task.get_maze_shape()
next_state = task.transit_state(state, action)
onehot = [[1 if next_state == (row, col) else 0 for col in range(h)] for row in range(w)]
return _flatten(onehot)

def _flatten(table):
return [item for sublist in table for item in sublist]

def _find_best_actions_on_each_cell(task, value_function):
height, width = task.get_maze_shape()
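
The new `construct_features` helper above one-hot encodes the cell the chosen action would lead to (note that `get_maze_shape()` appears to return `(height, width)`, as the unpacking in `_find_best_actions_on_each_cell` shows, so the flattening is row-major). A standalone sketch of the same encoding, independent of `MazeTask`:

```python
# One-hot encoding of a cell position in a height x width maze, flattened
# row-major -- the same kind of feature vector construct_features builds.
def one_hot_cell(position, height, width):
    row, col = position
    features = [0] * (height * width)
    features[row * width + col] = 1
    return features

# For the 6x9 Dyna maze, cell (2, 0) maps to index 2 * 9 + 0 = 18.
print(one_hot_cell((2, 0), height=6, width=9).index(1))  # 18
```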
2 changes: 1 addition & 1 deletion sample/maze/script/blocking_maze/montecarlo.py
@@ -54,7 +54,7 @@ def insert_value_into_table(self, table, state, action, new_value):
transfomer.set_transformation(50, TRANSFORMATION_FILE_PATH)
callbacks.append(transfomer)

algorithm = MonteCarlo()
algorithm = MonteCarlo(gamma=0.1)
algorithm.setup(task, policy, value_func)
algorithm.run_gpi(TEST_LENGTH, callbacks=callbacks)

72 changes: 72 additions & 0 deletions sample/maze/script/dyna_maze/deep_q_learning.py
@@ -0,0 +1,72 @@
#!/usr/local/bin/python

# Resolve path configuration
import os
import sys
import argparse

root = os.path.join(os.path.dirname(__file__), "../"*4)
src_path = os.path.join(root, "kyoka")
sample_path = os.path.join(root, "sample")
sys.path.append(root)
sys.path.append(src_path)
sys.path.append(sample_path)

import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense

import sample.maze.helper as Helper
from sample.maze.task import MazeTask
from sample.maze.callback import MazePerformanceWatcher

from kyoka.algorithm.deep_q_learning import DeepQLearning, DeepQLearningApproxActionValueFunction
from kyoka.policy import EpsilonGreedyPolicy

class MazeApproxActionValueFunction(DeepQLearningApproxActionValueFunction):

def __init__(self, task):
super(MazeApproxActionValueFunction, self).__init__()
self.task = task

def initialize_network(self):
maze_shape = self.task.get_maze_shape()
input_dim = maze_shape[0] * maze_shape[1]
model = Sequential()
model.add(Dense(1, input_dim=input_dim))
model.compile(loss="mse", optimizer="adam")
return model

def deepcopy_network(self, q_network):
q_hat_network = self.initialize_network()
for original_layer, copy_layer in zip(q_network.layers, q_hat_network.layers):
copy_layer.set_weights(original_layer.get_weights())
return q_hat_network

def predict_value_by_network(self, network, state, action):
features = self.construct_features(state, action)
return network.predict_on_batch(np.array([features]))[0][0]

def backup_on_minibatch(self, q_network, backup_minibatch):
minibatch = [(self.construct_features(state, action), target) for state, action, target in backup_minibatch]
X = np.array([x for x, _ in minibatch])
y = np.array([y for _, y in minibatch])
loss = q_network.train_on_batch(X, y)

def construct_features(self, state, action):
return Helper.construct_features(self.task, state, action)

MAZE_FILE_PATH = os.path.join(os.path.dirname(__file__), "dyna.txt")

task = MazeTask()
task.read_maze(MAZE_FILE_PATH)
value_func = MazeApproxActionValueFunction(task)

TEST_LENGTH = 100
policy = EpsilonGreedyPolicy(eps=0.1)
policy.set_eps_annealing(1.0, 0.1, 50)
callbacks = [MazePerformanceWatcher()]
algorithm = DeepQLearning(N=100, C=100, minibatch_size=32, replay_start_size=50)
algorithm.setup(task, policy, value_func)
algorithm.run_gpi(TEST_LENGTH, callbacks=callbacks)
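
This script, like the other samples, anneals epsilon via `policy.set_eps_annealing(1.0, 0.1, 50)`; I read the arguments as a decay from 1.0 down to 0.1 over the first 50 episodes, though the exact semantics are defined in `kyoka.policy`. A standalone sketch of linear epsilon annealing, not kyoka's implementation:

```python
# Standalone sketch of linear epsilon annealing (not kyoka's implementation):
# start fully exploratory and decay toward a floor over a fixed number of episodes.
def annealed_eps(episode, start=1.0, final=0.1, anneal_episodes=50):
    if episode >= anneal_episodes:
        return final
    return start + (final - start) * episode / float(anneal_episodes)

print([round(annealed_eps(e), 2) for e in (0, 25, 50, 100)])  # [1.0, 0.55, 0.1, 0.1]
```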

39 changes: 37 additions & 2 deletions sample/maze/script/dyna_maze/montecarlo.py
@@ -12,11 +12,19 @@
sys.path.append(src_path)
sys.path.append(sample_path)

try:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
except ImportError:
pass

import sample.maze.helper as Helper
from sample.maze.task import MazeTask
from sample.maze.callback import MazePerformanceWatcher

from kyoka.algorithm.montecarlo import MonteCarlo, MonteCarloTabularActionValueFunction
from kyoka.algorithm.montecarlo import MonteCarlo,\
MonteCarloTabularActionValueFunction, MonteCarloApproxActionValueFunction
from kyoka.policy import EpsilonGreedyPolicy

class MazeTabularActionValueFunction(MonteCarloTabularActionValueFunction):
@@ -38,19 +46,46 @@ def insert_value_into_table(self, table, state, action, new_value):
row, col = state
table[row][col][action] = new_value

class MazeApproxActionValueFunction(MonteCarloApproxActionValueFunction):

def __init__(self, task):
super(MazeApproxActionValueFunction, self).__init__()
self.task = task

def setup(self):
super(MazeApproxActionValueFunction, self).setup()
self.model = self._build_linear_model()
self.model.compile(loss="mse", optimizer="adam")

def _build_linear_model(self):
maze_shape = self.task.get_maze_shape()
input_dim = maze_shape[0] * maze_shape[1]
model = Sequential()
model.add(Dense(1, input_dim=input_dim))
return model

def construct_features(self, state, action):
return Helper.construct_features(self.task, state, action)

def approx_predict_value(self, features):
return self.model.predict_on_batch(np.array([features]))[0][0]

def approx_backup(self, features, backup_target, alpha):
loss = self.model.train_on_batch(np.array([features]), np.array([backup_target]))

MAZE_FILE_PATH = os.path.join(os.path.dirname(__file__), "dyna.txt")

task = MazeTask()
task.read_maze(MAZE_FILE_PATH)
value_func = MazeTabularActionValueFunction(task.get_maze_shape())
#value_func = MazeApproxActionValueFunction(task)

TEST_LENGTH = 100

policy = EpsilonGreedyPolicy(eps=0.1)
policy.set_eps_annealing(1.0, 0.1, 50)
callbacks = [MazePerformanceWatcher()]
algorithm = MonteCarlo()
algorithm = MonteCarlo(gamma=0.01)
algorithm.setup(task, policy, value_func)
algorithm.run_gpi(TEST_LENGTH, callbacks=callbacks)

36 changes: 35 additions & 1 deletion sample/maze/script/dyna_maze/q_learning.py
@@ -12,11 +12,18 @@
sys.path.append(src_path)
sys.path.append(sample_path)

try:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
except ImportError:
pass

import sample.maze.helper as Helper
from sample.maze.task import MazeTask
from sample.maze.callback import MazePerformanceWatcher

from kyoka.algorithm.q_learning import QLearning, QLearningTabularActionValueFunction
from kyoka.algorithm.q_learning import QLearning, QLearningTabularActionValueFunction, QLearningApproxActionValueFunction
from kyoka.policy import EpsilonGreedyPolicy

class MazeTabularActionValueFunction(QLearningTabularActionValueFunction):
@@ -38,12 +45,39 @@ def insert_value_into_table(self, table, state, action, new_value):
row, col = state
table[row][col][action] = new_value

class MazeApproxActionValueFunction(QLearningApproxActionValueFunction):

def __init__(self, task):
super(MazeApproxActionValueFunction, self).__init__()
self.task = task

def setup(self):
super(MazeApproxActionValueFunction, self).setup()
self.model = self._build_linear_model()
self.model.compile(loss="mse", optimizer="adam")

def _build_linear_model(self):
maze_shape = self.task.get_maze_shape()
input_dim = maze_shape[0] * maze_shape[1]
model = Sequential()
model.add(Dense(1, input_dim=input_dim))
return model

def construct_features(self, state, action):
return Helper.construct_features(self.task, state, action)

def approx_predict_value(self, features):
return self.model.predict_on_batch(np.array([features]))[0][0]

def approx_backup(self, features, backup_target, alpha):
loss = self.model.train_on_batch(np.array([features]), np.array([backup_target]))

MAZE_FILE_PATH = os.path.join(os.path.dirname(__file__), "dyna.txt")

task = MazeTask()
task.read_maze(MAZE_FILE_PATH)
value_func = MazeTabularActionValueFunction(task.get_maze_shape())
#value_func = MazeApproxActionValueFunction(task)

TEST_LENGTH = 100
policy = EpsilonGreedyPolicy(eps=0.1)
