Update to Revision2 #35

Merged · 21 commits · Sep 22, 2019
2 changes: 1 addition & 1 deletion DP/README.md
@@ -3,7 +3,7 @@
There are 3 programs are available.

* To understand MDP: `environment.py`
* `python run_random_agent.py`
* `python environment_demo.py`
* To understand Bellman Equation: `bellman_equation.py`
* `python bellman_equation.py`
* To understand Dynamic Programming: `planner.py`
4 changes: 2 additions & 2 deletions DP/application.py
@@ -2,7 +2,7 @@
import tornado.web
import tornado.escape
from DP.environment import Environment
from DP.planner import ValuteIterationPlanner, PolicyIterationPlanner
from DP.planner import ValueIterationPlanner, PolicyIterationPlanner


class IndexHandler(tornado.web.RequestHandler):
@@ -26,7 +26,7 @@ def post(self):

env = Environment(grid, move_prob=move_prob)
if plan_type == "value":
planner = ValuteIterationPlanner(env)
planner = ValueIterationPlanner(env)
elif plan_type == "policy":
planner = PolicyIterationPlanner(env)

2 changes: 1 addition & 1 deletion DP/planner.py
@@ -29,7 +29,7 @@ def dict_to_grid(self, state_reward_dict):
return grid


class ValuteIterationPlanner(Planner):
class ValueIterationPlanner(Planner):

def __init__(self, env):
super().__init__(env)
2 changes: 1 addition & 1 deletion DP/tests/test_environment.py
@@ -14,7 +14,7 @@ def test_run_environment(self):
self.assertEqual(state.column, 0)
goal = False
for t in range(10):
action = random.choice(env.action_space)
action = random.choice(env.actions)
state, reward, done = env.step(action)
self.assertTrue(0 <= state.row < len(env.grid))
self.assertTrue(0 <= state.column < len(env.grid[0]))
4 changes: 2 additions & 2 deletions DP/tests/test_planner.py
@@ -1,15 +1,15 @@
import random
import unittest
from DP.environment import Environment
from DP.planner import ValuteIterationPlanner, PolicyIterationPlanner
from DP.planner import ValueIterationPlanner, PolicyIterationPlanner


class TestPlanner(unittest.TestCase):

def test_value_iteration(self):
grid = self.get_sample_grid()
env = Environment(grid)
planner = ValuteIterationPlanner(env)
planner = ValueIterationPlanner(env)
result = planner.plan()
print("Value Iteration")
for r in result:
4 changes: 2 additions & 2 deletions EV/evolution.py
@@ -5,7 +5,6 @@
from PIL import Image
import matplotlib.pyplot as plt
import gym
import gym_ple

# Disable TensorFlow GPU for parallel execution
if os.name == "nt":
@@ -71,6 +70,7 @@ def play(self, env, episode_count=5, render=True):
class CatcherObserver():

def __init__(self, width, height, frame_count):
import gym_ple
self._env = gym.make("Catcher-v0")
self.width = width
self.height = height
@@ -87,7 +87,7 @@ def reset(self):
return self.transform(self._env.reset())

def render(self):
self._env.render()
self._env.render(mode="human")

def step(self, action):
n_state, reward, done, info = self._env.step(action)
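Note on the change above: `import gym_ple` moves from module level into `CatcherObserver.__init__`, so the dependency is only loaded when a Catcher observer is actually built (presumably to keep a plain import of the module light, e.g. for the parallel execution mentioned in the GPU-disabling comment). A generic sketch of this deferred-import pattern, with a hypothetical class name:

    class LazyCatcher:
        # Hypothetical illustration, not this repo's API: the optional dependency
        # is imported when the object is constructed, not when the module loads.
        def __init__(self):
            import gym
            import gym_ple  # importing gym_ple registers "Catcher-v0" with gym
            self._env = gym.make("Catcher-v0")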
182 changes: 92 additions & 90 deletions FN/a2c_agent.py
@@ -1,25 +1,27 @@
import random
import argparse
from collections import deque
import numpy as np
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.python import keras as K
from PIL import Image
import gym
import gym_ple
from fn_framework import FNAgent, Trainer, Observer, Experience
from fn_framework import FNAgent, Trainer, Observer
tf.compat.v1.disable_eager_execution()


class ActorCriticAgent(FNAgent):

def __init__(self, epsilon, actions):
super().__init__(epsilon, actions)
def __init__(self, actions):
# ActorCriticAgent uses self policy (doesn't use epsilon).
super().__init__(epsilon=0.0, actions=actions)
self._updater = None

@classmethod
def load(cls, env, model_path, epsilon=0.0001):
def load(cls, env, model_path):
actions = list(range(env.action_space.n))
agent = cls(epsilon, actions)
agent = cls(actions)
agent.model = K.models.load_model(model_path, custom_objects={
"SampleLayer": SampleLayer})
agent.initialized = True
@@ -30,6 +32,7 @@ def initialize(self, experiences, optimizer):
self.make_model(feature_shape)
self.set_updater(optimizer)
self.initialized = True
print("Done initialization. From now, begin training!")

def make_model(self, feature_shape):
normal = K.initializers.glorot_normal()
@@ -61,17 +64,18 @@ def make_model(self, feature_shape):

def set_updater(self, optimizer,
value_loss_weight=1.0, entropy_weight=0.1):
actions = tf.placeholder(shape=(None), dtype="int32")
rewards = tf.placeholder(shape=(None), dtype="float32")
actions = tf.compat.v1.placeholder(shape=(None), dtype="int32")
values = tf.compat.v1.placeholder(shape=(None), dtype="float32")

_, action_evals, values = self.model.output
_, action_evals, estimateds = self.model.output

neg_logs = tf.nn.sparse_softmax_cross_entropy_with_logits(
logits=action_evals, labels=actions)
advantages = rewards - values
# tf.stop_gradient: Prevent policy_loss influences critic_layer.
advantages = values - tf.stop_gradient(estimateds)

policy_loss = tf.reduce_mean(neg_logs * tf.nn.softplus(advantages))
value_loss = tf.losses.mean_squared_error(rewards, values)
policy_loss = tf.reduce_mean(neg_logs * advantages)
value_loss = tf.keras.losses.MeanSquaredError()(values, estimateds)
action_entropy = tf.reduce_mean(self.categorical_entropy(action_evals))

loss = policy_loss + value_loss_weight * value_loss
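The total objective assembled here follows the usual A2C recipe: a policy-gradient loss weighted by the advantage, plus a weighted critic (value) loss, with an entropy bonus typically subtracted to encourage exploration. A minimal numpy sketch of how these pieces combine (illustration only; `a2c_loss_terms` and its example inputs are hypothetical, not part of this PR):

    import numpy as np

    def a2c_loss_terms(action_logits, chosen_actions, target_values, estimated_values,
                       value_loss_weight=1.0, entropy_weight=0.1):
        # Softmax policy from the actor logits.
        z = action_logits - action_logits.max(axis=1, keepdims=True)
        probs = np.exp(z) / np.exp(z).sum(axis=1, keepdims=True)
        # Negative log-likelihood of the actions that were actually taken.
        neg_logs = -np.log(probs[np.arange(len(chosen_actions)), chosen_actions])
        # Advantage: value target minus the critic's estimate; the estimate is a
        # constant here, mirroring tf.stop_gradient in the diff.
        advantages = target_values - estimated_values
        policy_loss = np.mean(neg_logs * advantages)
        value_loss = np.mean((target_values - estimated_values) ** 2)
        entropy = np.mean(-(probs * np.log(probs)).sum(axis=1))
        return policy_loss + value_loss_weight * value_loss - entropy_weight * entropy

    # Example: 2 samples, 3 actions.
    logits = np.array([[1.0, 0.5, -0.2], [0.1, 0.3, 0.9]])
    print(a2c_loss_terms(logits, np.array([0, 2]),
                         np.array([1.2, -0.4]), np.array([1.0, 0.0])))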
@@ -82,28 +86,28 @@ def set_updater(self, optimizer,

self._updater = K.backend.function(
inputs=[self.model.input,
actions, rewards],
actions, values],
outputs=[loss,
policy_loss,
value_loss,
tf.reduce_mean(neg_logs),
tf.reduce_mean(advantages),
value_loss,
action_entropy],
updates=updates)

def categorical_entropy(self, logits):
"""
From OpenAI baseline implementation
From OpenAI baseline implementation.
https://github.com/openai/baselines/blob/master/baselines/common/distributions.py#L192
"""
a0 = logits - tf.reduce_max(logits, axis=-1, keepdims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)
return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=-1)

def policy(self, s):
if np.random.random() < self.epsilon or not self.initialized:
if not self.initialized:
return np.random.randint(len(self.actions))
else:
action, action_evals, values = self.model.predict(np.array([s]))
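A note on `categorical_entropy` above: it evaluates the softmax entropy directly from logits in a numerically stable way; since log p_i = a0_i - log z0, the sum of p0 * (log z0 - a0) equals the usual -sum(p * log p). A quick numerical check (illustration only):

    import numpy as np

    logits = np.array([2.0, 0.5, -1.0])
    a0 = logits - logits.max()
    ea0 = np.exp(a0)
    z0 = ea0.sum()
    p0 = ea0 / z0
    stable = np.sum(p0 * (np.log(z0) - a0))  # form used in the diff
    naive = -np.sum(p0 * np.log(p0))         # textbook -sum(p * log p)
    print(stable, naive)                     # both print the same entropy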
@@ -127,8 +131,8 @@ def build(self, input_shape):
super(SampleLayer, self).build(input_shape)

def call(self, x):
noise = tf.random_uniform(tf.shape(x))
return tf.argmax(x - tf.log(-tf.log(noise)), axis=1)
noise = tf.random.uniform(tf.shape(x))
return tf.argmax(x - tf.math.log(-tf.math.log(noise)), axis=1)

def compute_output_shape(self, input_shape):
return (input_shape[0], self.output_dim)
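`SampleLayer.call` above uses the Gumbel-max trick: adding -log(-log(U)) noise (U uniform in (0, 1)) to the logits and taking the argmax draws an action with the softmax probabilities, so the layer samples from the policy without an explicit softmax-and-multinomial step. A small numpy sketch (illustrative; `gumbel_max_sample` is not part of this PR):

    import numpy as np

    def gumbel_max_sample(logits, rng):
        # -log(-log(U)) is a standard Gumbel draw; argmax over (logits + noise)
        # is distributed as softmax(logits).
        noise = rng.uniform(size=logits.shape)
        return np.argmax(logits - np.log(-np.log(noise)))

    rng = np.random.default_rng(0)
    logits = np.array([1.0, 0.0, -1.0])
    draws = [gumbel_max_sample(logits, rng) for _ in range(10000)]
    print(np.bincount(draws) / 10000)  # close to softmax(logits) = [0.665, 0.245, 0.090]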
@@ -139,9 +143,9 @@ class ActorCriticAgentTest(ActorCriticAgent):
def make_model(self, feature_shape):
normal = K.initializers.glorot_normal()
model = K.Sequential()
model.add(K.layers.Dense(64, input_shape=feature_shape,
model.add(K.layers.Dense(10, input_shape=feature_shape,
kernel_initializer=normal, activation="relu"))
model.add(K.layers.Dense(64, kernel_initializer=normal,
model.add(K.layers.Dense(10, kernel_initializer=normal,
activation="relu"))

actor_layer = K.layers.Dense(len(self.actions),
@@ -177,35 +181,31 @@ def transform(self, state):
else:
self._frames.append(normalized)
feature = np.array(self._frames)
# Convert the feature shape (f, w, h) => (w, h, f).
# Convert the feature shape (f, w, h) => (h, w, f).
feature = np.transpose(feature, (1, 2, 0))
return feature
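A quick shape check for the transpose above (generic illustration; the actual width/height convention comes from how the observer resizes frames, not from this snippet):

    import numpy as np

    frames = np.zeros((4, 60, 80))             # 4 stacked frames
    feature = np.transpose(frames, (1, 2, 0))  # move the frame axis to the end
    print(frames.shape, "->", feature.shape)   # (4, 60, 80) -> (60, 80, 4)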


class ActorCriticTrainer(Trainer):

def __init__(self, buffer_size=50000, batch_size=32,
gamma=0.99, initial_epsilon=0.1, final_epsilon=1e-3,
learning_rate=1e-3, report_interval=10,
log_dir="", file_name=""):
def __init__(self, buffer_size=256, batch_size=32,
gamma=0.99, learning_rate=1e-3,
report_interval=10, log_dir="", file_name=""):
super().__init__(buffer_size, batch_size, gamma,
report_interval, log_dir)
self.file_name = file_name if file_name else "a2c_agent.h5"
self.initial_epsilon = initial_epsilon
self.final_epsilon = final_epsilon
self.learning_rate = learning_rate
self.d_experiences = deque(maxlen=self.buffer_size)
self.training_episode = 0
self.losses = {}
self.rewards = []
self._max_reward = -10

def train(self, env, episode_count=900, initial_count=10,
test_mode=False, render=False, observe_interval=100):
actions = list(range(env.action_space.n))
if not test_mode:
agent = ActorCriticAgent(1.0, actions)
agent = ActorCriticAgent(actions)
else:
agent = ActorCriticAgentTest(1.0, actions)
agent = ActorCriticAgentTest(actions)
observe_interval = 0
self.training_episode = episode_count

@@ -214,76 +214,78 @@ def train(self, env, episode_count=900, initial_count=10,
return agent

def episode_begin(self, episode, agent):
self.losses = {}
for key in ["loss", "loss_policy", "loss_action", "loss_advantage",
"loss_value", "entropy"]:
self.losses[key] = []
self.experiences = []
self.rewards = []

def step(self, episode, step_count, agent, experience):
if self.training:
loss, lp, ac, ad, vl, en = agent.update(*self.make_batch())
self.losses["loss"].append(loss)
self.losses["loss_policy"].append(lp)
self.losses["loss_action"].append(ac)
self.losses["loss_advantage"].append(ad)
self.losses["loss_value"].append(vl)
self.losses["entropy"].append(en)

def make_batch(self):
batch = random.sample(self.d_experiences, self.batch_size)
states = [e.s for e in batch]
actions = [e.a for e in batch]
rewards = [e.r for e in batch]
return states, actions, rewards

def begin_train(self, episode, agent):
self.logger.set_model(agent.model)
agent.epsilon = self.initial_epsilon
self.training_episode -= episode
print("Done initialization. From now, begin training!")

def episode_end(self, episode, step_count, agent):
rewards = [e.r for e in self.experiences]
self.reward_log.append(sum(rewards))

self.rewards.append(experience.r)
if not agent.initialized:
optimizer = K.optimizers.Adam(lr=self.learning_rate, clipnorm=5.0)
agent.initialize(self.experiences, optimizer)
if len(self.experiences) < self.buffer_size:
# Store experience until buffer_size (enough to initialize).
return False

discounteds = []
for t, r in enumerate(rewards):
future_r = [_r * (self.gamma ** i) for i, _r in
enumerate(rewards[t:])]
_r = sum(future_r)
discounteds.append(_r)

for i, e in enumerate(self.experiences):
s, a, r, n_s, d = e
d_r = discounteds[i]
d_e = Experience(s, a, d_r, n_s, d)
self.d_experiences.append(d_e)

if not self.training and len(self.d_experiences) == self.buffer_size:
self.begin_train(i, agent)
optimizer = K.optimizers.Adam(lr=self.learning_rate,
clipnorm=5.0)
agent.initialize(self.experiences, optimizer)
self.logger.set_model(agent.model)
self.training = True
self.experiences.clear()
else:
if len(self.experiences) < self.batch_size:
# Store experience until batch_size (enough to update).
return False

batch = self.make_batch(agent)
loss, lp, lv, p_ng, p_ad, p_en = agent.update(*batch)
# Record latest metrics.
self.losses["loss/total"] = loss
self.losses["loss/policy"] = lp
self.losses["loss/value"] = lv
self.losses["policy/neg_logs"] = p_ng
self.losses["policy/advantage"] = p_ad
self.losses["policy/entropy"] = p_en
self.experiences.clear()

def make_batch(self, agent):
states = []
actions = []
values = []
experiences = list(self.experiences)
states = np.array([e.s for e in experiences])
actions = np.array([e.a for e in experiences])

# Calculate values.
# If the last experience isn't terminal (done) then estimates value.
last = experiences[-1]
future = last.r if last.d else agent.estimate(last.n_s)
for e in reversed(experiences):
value = e.r
if not e.d:
value += self.gamma * future
values.append(value)
future = value
values = np.array(list(reversed(values)))

scaler = StandardScaler()
values = scaler.fit_transform(values.reshape((-1, 1))).flatten()

return states, actions, values
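The value targets above are computed backwards over the rollout: each step's target is its reward plus the discounted target of the next step, the recursion stops at terminal steps, and when the final experience is not terminal the tail is seeded with the critic's estimate of the next state. A compact sketch of that recursion (illustration only; the reward/done inputs are made up):

    def discounted_targets(rewards, dones, bootstrap_value, gamma=0.99):
        # Walk the rollout backwards: target_t = r_t + gamma * target_{t+1},
        # cutting the recursion at terminal steps.
        targets = []
        future = rewards[-1] if dones[-1] else bootstrap_value
        for r, d in zip(reversed(rewards), reversed(dones)):
            value = r if d else r + gamma * future
            targets.append(value)
            future = value
        return list(reversed(targets))

    print(discounted_targets([0.0, 0.0, 1.0], [False, False, True], bootstrap_value=0.0))
    # -> approximately [0.98, 0.99, 1.0]

The diff then standardizes these targets with sklearn's StandardScaler before handing them to the updater.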

def episode_end(self, episode, step_count, agent):
reward = sum(self.rewards)
self.reward_log.append(reward)

if self.training:
reward = sum(rewards)
if agent.initialized:
self.logger.write(self.training_count, "reward", reward)
self.logger.write(self.training_count, "reward_max", max(rewards))
self.logger.write(self.training_count, "epsilon", agent.epsilon)
self.logger.write(self.training_count, "reward_max",
max(self.rewards))

for k in self.losses:
loss = sum(self.losses[k]) / step_count
self.logger.write(self.training_count, "loss/" + k, loss)
self.logger.write(self.training_count, k, self.losses[k])

if reward > self._max_reward:
agent.save(self.logger.path_of(self.file_name))
self._max_reward = reward

diff = (self.initial_epsilon - self.final_epsilon)
decay = diff / self.training_episode
agent.epsilon = max(agent.epsilon - decay, self.final_epsilon)

if self.is_event(episode, self.report_interval):
recent_rewards = self.reward_log[-self.report_interval:]
self.logger.describe("reward", recent_rewards, episode=episode)