DQN first full agent integration with Lab (#24)

* Vanilla dqn implementation

* Indexing fix

* Adding comments, style fixes

* DQN change of init notes

* Fixing file structure and function naming

* Changing DQN init, and starting setup for more general DQN implementation

* Adding absolute timestep to train a batch

* update README

* fix data_space bool

* rename top level data space to info space

* rename AEBDataSpace to DataSpace

* add clock to aebspace

* fix comment and import error for dqn work

* update DQNBase init and variable names

* add missing action back to update()

* rename terminal to done, indices to idxs

* swap next_state and done ordering; reorder method, fix var not declared

* add proper memory reset state method

* rename memory to replay.Replay, propagate, update init method

* temporarily fragment memory class to pass tests

* tmp fix conftest, mute memory test for now

* introduce generic post_body_init to fix body-dependent init

* refactor memory methods to abstract class

* update spec setting in agent, env

* redesign spec, update spec_util and guard env spec defaulting

* pass unity env config, update spec

* remove timestep from meta spec, update dqn spec

* fix DQN agent init to get working

* make post_body_init abstract required method

* use new update method, call algo update

* add algo train method, propagate. random agent working

* fix memory numpy spread value, activate memory

* fix CC

* rename algorithm_util

* refactor test_memory

* fix algorithm, explore_anneal_epi

* fix spec for multi_env

* fix control and spec

* fix dqn, running now!

* add TODO
kengz committed Dec 3, 2017
1 parent cd37046 commit 425f3a1
Showing 26 changed files with 857 additions and 343 deletions.
13 changes: 7 additions & 6 deletions README.md
@@ -1,5 +1,5 @@
# SLM Lab [![CircleCI](https://circleci.com/gh/kengz/SLM-Lab.svg?style=shield)](https://circleci.com/gh/kengz/SLM-Lab) [![Maintainability](https://api.codeclimate.com/v1/badges/20c6a124c468b4d3e967/maintainability)](https://codeclimate.com/github/kengz/SLM-Lab/maintainability) [![Test Coverage](https://api.codeclimate.com/v1/badges/20c6a124c468b4d3e967/test_coverage)](https://codeclimate.com/github/kengz/SLM-Lab/test_coverage)
_(Work In Progress)_ An experimental framework for Reinforcement Learning using Unity and PyTorch.
_(Work In Progress)_ An experimentation framework for Reinforcement Learning using Unity and PyTorch.

## Installation

@@ -123,7 +123,6 @@ When controlling a session of experiment, execute the agent and environment logi
Hence, the experiment session loop generalizes directly from:
```
state = self.env.reset()
logger.debug(f'reset state {state}')
self.agent.reset()
# RL steps for SARS
for t in range(self.env.max_timestep):
@@ -144,9 +143,11 @@ self.agent_space.reset()
# RL steps for SARS
for t in range(self.env_space.max_timestep):
action_space = self.agent_space.act(state_space)
reward_space, state_space, done_space = self.env_space.step(action_space)
# fully observable SARS from env_space, memory and training internally
self.agent_space.update(reward_space, state_space)
if done_space.done():
logger.debug(f'action_space {action_space}')
(reward_space, state_space,
done_space) = self.env_space.step(action_space)
# completes cycle of full info for agent_space
self.agent_space.update(reward_space, state_space, done_space)
if bool(done_space):
break
```
6 changes: 4 additions & 2 deletions main.py
@@ -5,8 +5,10 @@


def main():
logger.set_level('DEBUG')
spec = spec_util.get('base.json', 'general_custom')
# logger.set_level('DEBUG')
# spec = spec_util.get('base.json', 'base_case')
# spec = spec_util.get('base.json', 'multi_env')
spec = spec_util.get('dqn.json', 'dqn_base_case')
trial = Trial(spec)
trial_data = trial.run()

57 changes: 32 additions & 25 deletions slm_lab/agent/__init__.py
@@ -15,13 +15,12 @@
- high level properties of thinking, e.g. creativity, planning.
Agent components:
- algorithm
- algorithm (with net, policy)
- memory
- net
- policy
'''
from slm_lab.agent import algorithm
from slm_lab.experiment.monitor import data_space
import pydash as _
from slm_lab.agent import algorithm, memory
from slm_lab.experiment.monitor import info_space
from slm_lab.lib import util


@@ -35,37 +34,38 @@ class Agent:

def __init__(self, spec, agent_space, a=0):
self.spec = spec
util.set_attr(self, self.spec)
self.name = self.spec['name']
self.agent_space = agent_space
self.index = a
self.eb_proj = self.agent_space.a_eb_proj[self.index]
self.bodies = None # consistent with ab_proj, set in aeb_space.init_body_space()

AlgoClass = getattr(algorithm, self.name)
# TODO repattern, redesign spec
AlgoClass = getattr(algorithm, _.get(self.spec, 'algorithm.name'))
self.algorithm = AlgoClass(self)
# TODO also resolve architecture and data input, output dims via some architecture spec
self.memory = None
self.net = None
MemoryClass = getattr(memory, _.get(self.spec, 'memory.name'))
self.memory = MemoryClass(self)

def reset(self):
def post_body_init(self):
'''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
self.memory.post_body_init()
self.algorithm.post_body_init()

def reset(self, state):
'''Do agent reset per episode, such as memory pointer'''
# TODO implement
return
self.memory.reset_last_state(state)

def act(self, state):
'''Standard act method from algorithm.'''
return self.algorithm.act(state)

def update(self, reward, state, done):
def update(self, action, reward, state, done):
'''
Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
'''
# TODO build and access timestep, episode, absolute number of timesteps from Dataspace
# TODO implement generic method, work on AEB
# self.memory.update()
# self.net.train()
self.algorithm.update(reward, state, done)
return
self.memory.update(action, reward, state, done)
self.algorithm.train()
self.algorithm.update()

def close(self):
'''Close agent at the end of a session, e.g. save model'''
@@ -88,12 +88,18 @@ def __init__(self, spec, aeb_space):
self.agents = [Agent(a_spec, self, a)
for a, a_spec in enumerate(spec['agent'])]

def post_body_init(self):
'''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
for agent in self.agents:
agent.post_body_init()

def get(self, a):
return self.agents[a]

def reset(self):
for agent in self.agents:
agent.reset()
def reset(self, state_space):
for a, agent in enumerate(self.agents):
state = state_space.get(a=a)
agent.reset(state)

def act(self, state_space):
action_proj = []
@@ -104,12 +110,13 @@ def act(self, state_space):
action_space = self.aeb_space.add('action', action_proj)
return action_space

def update(self, reward_space, state_space, done_space):
def update(self, action_space, reward_space, state_space, done_space):
for a, agent in enumerate(self.agents):
action = action_space.get(a=a)
reward = reward_space.get(a=a)
state = state_space.get(a=a)
done = done_space.get(a=a)
agent.update(reward, state, done)
agent.update(action, reward, state, done)

def close(self):
for agent in self.agents:
1 change: 1 addition & 0 deletions slm_lab/agent/algorithm/__init__.py
@@ -6,3 +6,4 @@

# expose all the classes
from .random import *
from .dqn import *
60 changes: 60 additions & 0 deletions slm_lab/agent/algorithm/algorithm_util.py
@@ -0,0 +1,60 @@
import numpy as np
import torch
from torch.autograd import Variable

'''Functions used by more than one algorithm'''


def act_with_epsilon_greedy(net, state, epsilon):
'''
With probability epsilon select a random action,
otherwise select the action associated with the
largest q value
'''
# TODO store one hot
# TODO discrete int
a_dim = net.out_dim
print(f'epsilon {epsilon}')
if epsilon > np.random.rand():
print('random action')
action = np.random.randint(a_dim)
else:
print('net action')
torch_state = Variable(torch.from_numpy(state).float())
out = net.wrap_eval(torch_state)
action = int(torch.max(out, dim=0)[1][0])
return action


def act_with_boltzmann(net, state, tau):
# TODO implement act_with_boltzmann
pass


def act_with_gaussian(net, state, stddev):
# TODO implement act_with_gaussian
pass


def update_epsilon_greedy(net, state, stddev):
# TODO implement act_with_gaussian
pass


def update_boltzmann(net, state, stddev):
# TODO implement act_with_gaussian
pass


def update_gaussian(net, state, stddev):
# TODO implement act_with_gaussian
pass


act_fns = {'epsilon_greedy': act_with_epsilon_greedy,
'boltzmann': act_with_boltzmann,
'gaussian': act_with_gaussian}

update_fns = {'epsilon_greedy': update_epsilon_greedy,
'boltzmann': update_boltzmann,
'gaussian': update_gaussian}
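
As a side note, `act_with_boltzmann` and its `update_*` companions are still stubs above. One possible shape for the Boltzmann case, matching the `(net, state, tau)` signature that `act_fns` expects and reusing `net.wrap_eval` the same way `act_with_epsilon_greedy` does, is a softmax over Q-values with temperature `tau`. This is only an illustrative sketch, not the committed implementation, and the helper name is hypothetical:

```
import numpy as np
import torch
from torch.autograd import Variable


def act_with_boltzmann_sketch(net, state, tau):
    '''Hypothetical Boltzmann selection: sample an action with probability
    proportional to exp(Q(s, a) / tau); higher tau gives a flatter distribution.'''
    torch_state = Variable(torch.from_numpy(state).float())
    out = net.wrap_eval(torch_state)  # Q-values per action, as in act_with_epsilon_greedy
    q = out.data.numpy() / tau
    probs = np.exp(q - np.max(q))  # subtract the max for numerical stability
    probs = probs / probs.sum()
    return int(np.random.choice(len(probs), p=probs))
```

A matching `update_boltzmann` could then decay `tau` over episodes, much like the epsilon anneal in `dqn.py` below.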
12 changes: 10 additions & 2 deletions slm_lab/agent/algorithm/base.py
@@ -12,12 +12,15 @@ def __init__(self, agent):
self.agent = agent

@abstractmethod
def post_body_init(self):
'''Initializes the part of algorithm needing a body to exist first.'''
raise NotImplementedError

def body_act_discrete(self, body, body_state):
'''Implement atomic discrete body_action, or throw NotImplementedError. E.g. fetch body_action from net given body info.'''
raise NotImplementedError
return body_action

@abstractmethod
def body_act_continuous(self, body, body_state):
'''Implement atomic continuous body_action, or throw NotImplementedError. E.g. fetch body_action from net given body info.'''
raise NotImplementedError
@@ -40,6 +43,11 @@ def act(self, state):
return action

@abstractmethod
def update(self, reward, state, done):
def train(self):
'''Implement algorithm train, or throw NotImplementedError'''
raise NotImplementedError

@abstractmethod
def update(self):
'''Implement algorithm update, or throw NotImplementedError'''
raise NotImplementedError
115 changes: 115 additions & 0 deletions slm_lab/agent/algorithm/dqn.py
@@ -0,0 +1,115 @@
import numpy as np
import torch
from torch.autograd import Variable
from slm_lab.agent.algorithm.base import Algorithm
from slm_lab.agent.algorithm.algorithm_util import act_fns, update_fns
from slm_lab.agent.net import nets
from slm_lab.agent.memory import Replay


class DQNBase(Algorithm):
'''
Implementation of the base DQN algorithm.
See Sergey Levine's lecture xxx for more details
TODO add link
more detailed comments
net: instance of an slm_lab/agent/net
memory: instance of an slm_lab/agent/memory
batch_size: how many examples from memory to sample at each training step
action_selection: function (from algorithm_util.py) that determines how to select actions
gamma: Real number in range [0, 1]. Determines how much to discount the future
state_dim: dimension of the state space
action_dim: dimensions of the action space
'''

def __init__(self, agent):
super(DQNBase, self).__init__(agent)

def post_body_init(self):
'''Initializes the part of algorithm needing a body to exist first.'''
# TODO generalize
default_body = self.agent.bodies[0]
state_dim = default_body.state_dim
action_dim = default_body.action_dim
net_spec = self.agent.spec['net']
net_spec['net_layer_params'][0] = state_dim
net_spec['net_layer_params'][-1] = action_dim
self.net = nets[net_spec['net_type']](
*net_spec['net_layer_params'],
*net_spec['net_other_params'])
# TODO three nets for different part of Q function
# In base algorithm should all be pointer to the same net - then update compute q target values and action functions
self.batch_size = net_spec['batch_size']
self.gamma = net_spec['gamma']

algorithm_spec = self.agent.spec['algorithm']
self.action_selection = act_fns[algorithm_spec['action_selection']]

# explore_var is epsilon, tau or etc.
self.explore_var_start = algorithm_spec['explore_var_start']
self.explore_var_end = algorithm_spec['explore_var_end']
self.explore_var = self.explore_var_start
self.explore_anneal_epi = algorithm_spec['explore_anneal_epi']
self.training_iters_per_batch = 1
self.training_frequency = 1

def compute_q_target_values(self, batch):
# Make future reward 0 if the current state is done
float_data_list = [
'states', 'actions', 'rewards', 'dones', 'next_states']
for k in float_data_list:
batch[k] = Variable(torch.from_numpy(batch[k]).float())
# print('batch')
# print(batch['states'])
# print(batch['actions'])
# print(batch['rewards'])
# print(batch['dones'])
# print(1 - batch['dones'])
q_vals = self.net.wrap_eval(batch['states'])
# print(f'q_vals {q_vals}')
q_targets_all = batch['rewards'].data + self.gamma * \
torch.mul((1 - batch['dones'].data),
self.net.wrap_eval(batch['next_states']))
# print(f'q_targets_all {q_targets_all}')
q_targets_max, _ = torch.max(q_targets_all, dim=1)
# print(f'q_targets_max {q_targets_max}')
# print(f'q_targets_all size {q_targets_all.size()}')

# We only want to train the network for the action selected
# For all other actions we set the q_target = q_vals
# So that the loss for these actions is 0
q_targets_max.unsqueeze_(1)
# print(f'q_targets_max {q_targets_max}')
q_targets = torch.mul(q_targets_max, batch['actions'].data) + \
torch.mul(q_vals, (1 - batch['actions'].data))
# print(f'q_targets {q_targets}')
return q_targets

def train(self):
# TODO Fix for training iters, docstring
t = self.agent.agent_space.aeb_space.clock['t']
if t % self.training_frequency == 0:
batch = self.agent.memory.get_batch(self.batch_size)
for i in range(self.training_iters_per_batch):
q_targets = self.compute_q_target_values(batch)
y = Variable(q_targets)
loss = self.net.training_step(batch['states'], y)
print(f'loss {loss.data[0]}\n')
return loss.data[0]
else:
return None

def body_act_discrete(self, body, body_state):
return self.action_selection(
self.net,
body_state,
self.explore_var)

def update(self):
'''Update epsilon or boltzmann for policy after net training'''
epi = self.agent.agent_space.aeb_space.clock['e']
rise = self.explore_var_end - self.explore_var_start
slope = rise / float(self.explore_anneal_epi)
self.explore_var = max(
slope * epi + self.explore_var_start, self.explore_var_end)
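
Two notes on the DQN code above. First, the masking in `compute_q_target_values` treats `batch['actions']` as one-hot vectors (see the "store one hot" TODO in `algorithm_util.py`): the slot of the action actually taken receives the bootstrapped target `r + gamma * max_a' Q(s', a')`, and every other slot keeps the net's own prediction, so those actions contribute zero loss. A standalone numeric sketch of that arithmetic, not part of the diff:

```
import torch

gamma = 0.99
# one transition with 3 actions; action 1 was taken (one-hot), reward 1, not done
q_vals = torch.FloatTensor([[0.2, 0.5, 0.1]])  # current net output Q(s, .)
q_next = torch.FloatTensor([[0.3, 0.9, 0.4]])  # net output Q(s', .)
actions = torch.FloatTensor([[0., 1., 0.]])    # one-hot action taken
rewards = torch.FloatTensor([1.0])
dones = torch.FloatTensor([0.0])

q_targets_all = rewards + gamma * (1 - dones) * q_next  # r + gamma * Q(s', .) when not done
q_targets_max, _ = torch.max(q_targets_all, dim=1)      # max over next actions: 1.891
q_targets_max = q_targets_max.unsqueeze(1)              # shape (1, 1) for broadcasting

# taken action slot <- bootstrapped target; other slots <- current predictions
q_targets = q_targets_max * actions + q_vals * (1 - actions)
print(q_targets)  # [[0.2, 1.891, 0.1]]; only action 1 contributes to the loss
```

Second, the `update()` anneal is linear in the episode count: with hypothetical spec values `explore_var_start = 1.0`, `explore_var_end = 0.1` and `explore_anneal_epi = 30`, epsilon drops by 0.03 per episode and the `max(...)` clamps it at 0.1 from episode 30 onward.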
9 changes: 8 additions & 1 deletion slm_lab/agent/algorithm/random.py
@@ -11,6 +11,10 @@ class Random(Algorithm):
Example Random agent that works in both discrete and continuous envs
'''

def post_body_init(self):
'''Initializes the part of algorithm needing a body to exist first.'''
pass

def body_act_discrete(self, body, body_state):
'''Random discrete action'''
body_action = np.random.randint(body.action_dim)
@@ -21,5 +25,8 @@ def body_act_continuous(self, body, body_state):
body_action = np.random.randn(body.action_dim)
return body_action

def update(self, reward, state, done):
def train(self):
return

def update(self):
return
3 changes: 3 additions & 0 deletions slm_lab/agent/memory/__init__.py
@@ -2,3 +2,6 @@
The memory module
Contains different ways of storing an agent's experiences and sampling from them
'''

# expose all the classes
from .replay import *
