DQN first full agent integration with Lab (#24)

* Vanilla dqn implementation

* Indexing fix

* Adding comments, style fixes

* DQN change of init notes

* Fixing file structure and function naming

* Changing DQN init, and starting setup for more general DQN implementation

* Adding absolute timestep to train a batch

* update README

* fix data_space bool

* rename top level data space to info space

* rename AEBDataSpace to DataSpace

* add clock to aebspace

* fix comment and import error for dqn work

* update DQNBase init and variable names

* add missing action back to update()

* rename terminal to done, indices to idxs

* swap next_state and done ordering; reorder method, fix var not declared

* add proper memory reset state method

* rename memory to replay.Replay, propagate, update init method

* temporarily fragment memory class to pass tests

* tmp fix conftest, mute memory test for now

* introduce generic post_body_init to fix body-dependent init

* refactor memory methods to abstract class

* update spec setting in agent, env

* redesign spec, update spec_util and guard env spec defaulting

* pass unity env config, update spec

* remove timestep from meta spec, update dqn spec

* fix DQN agent init to get working

* make post_body_init abstract required method

* use new update method, call algo update

* add algo train method, propagate. random agent working

* fix memory numpy spread value, activate memory

* fix CC

* rename algorithm_util

* refactor test_memory

* fix algorithm, explore_anneal_epi

* fix spec for multi_env

* fix control and spec

* fix dqn, running now!

* add TODO
kengz committed Dec 3, 2017
1 parent cd37046 commit 425f3a1
Showing 26 changed files with 857 additions and 343 deletions.
13 changes: 7 additions & 6 deletions README.md
@@ -1,5 +1,5 @@
# SLM Lab [![CircleCI](https://circleci.com/gh/kengz/SLM-Lab.svg?style=shield)](https://circleci.com/gh/kengz/SLM-Lab) [![Maintainability](https://api.codeclimate.com/v1/badges/20c6a124c468b4d3e967/maintainability)](https://codeclimate.com/github/kengz/SLM-Lab/maintainability) [![Test Coverage](https://api.codeclimate.com/v1/badges/20c6a124c468b4d3e967/test_coverage)](https://codeclimate.com/github/kengz/SLM-Lab/test_coverage)
_(Work In Progress)_ An experimental framework for Reinforcement Learning using Unity and PyTorch.
_(Work In Progress)_ An experimentation framework for Reinforcement Learning using Unity and PyTorch.

## Installation

@@ -123,7 +123,6 @@ When controlling a session of experiment, execute the agent and environment logi
Hence, the experiment session loop generalizes directly from:
```
state = self.env.reset()
logger.debug(f'reset state {state}')
self.agent.reset()
# RL steps for SARS
for t in range(self.env.max_timestep):
@@ -144,9 +143,11 @@ self.agent_space.reset()
# RL steps for SARS
for t in range(self.env_space.max_timestep):
action_space = self.agent_space.act(state_space)
reward_space, state_space, done_space = self.env_space.step(action_space)
# fully observable SARS from env_space, memory and training internally
self.agent_space.update(reward_space, state_space)
if done_space.done():
logger.debug(f'action_space {action_space}')
(reward_space, state_space,
done_space) = self.env_space.step(action_space)
# completes cycle of full info for agent_space
self.agent_space.update(reward_space, state_space, done_space)
if bool(done_space):
break
```
6 changes: 4 additions & 2 deletions main.py
@@ -5,8 +5,10 @@


def main():
logger.set_level('DEBUG')
spec = spec_util.get('base.json', 'general_custom')
# logger.set_level('DEBUG')
# spec = spec_util.get('base.json', 'base_case')
# spec = spec_util.get('base.json', 'multi_env')
spec = spec_util.get('dqn.json', 'dqn_base_case')
trial = Trial(spec)
trial_data = trial.run()

57 changes: 32 additions & 25 deletions slm_lab/agent/__init__.py
@@ -15,13 +15,12 @@
- high level properties of thinking, e.g. creativity, planning.
Agent components:
- algorithm
- algorithm (with net, policy)
- memory
- net
- policy
'''
from slm_lab.agent import algorithm
from slm_lab.experiment.monitor import data_space
import pydash as _
from slm_lab.agent import algorithm, memory
from slm_lab.experiment.monitor import info_space
from slm_lab.lib import util


@@ -35,37 +34,38 @@ class Agent:

def __init__(self, spec, agent_space, a=0):
self.spec = spec
util.set_attr(self, self.spec)
self.name = self.spec['name']
self.agent_space = agent_space
self.index = a
self.eb_proj = self.agent_space.a_eb_proj[self.index]
self.bodies = None # consistent with ab_proj, set in aeb_space.init_body_space()

AlgoClass = getattr(algorithm, self.name)
# TODO repattern, redesign spec
AlgoClass = getattr(algorithm, _.get(self.spec, 'algorithm.name'))
self.algorithm = AlgoClass(self)
# TODO also resolve architecture and data input, output dims via some architecture spec
self.memory = None
self.net = None
MemoryClass = getattr(memory, _.get(self.spec, 'memory.name'))
self.memory = MemoryClass(self)

def reset(self):
def post_body_init(self):
'''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
self.memory.post_body_init()
self.algorithm.post_body_init()

def reset(self, state):
'''Do agent reset per episode, such as memory pointer'''
# TODO implement
return
self.memory.reset_last_state(state)

def act(self, state):
'''Standard act method from algorithm.'''
return self.algorithm.act(state)

def update(self, reward, state, done):
def update(self, action, reward, state, done):
'''
Update per timestep after env transitions, e.g. memory, algorithm, update agent params, train net
'''
# TODO build and access timestep, episode, absolute number of timesteps from Dataspace
# TODO implement generic method, work on AEB
# self.memory.update()
# self.net.train()
self.algorithm.update(reward, state, done)
return
self.memory.update(action, reward, state, done)
self.algorithm.train()
self.algorithm.update()

def close(self):
'''Close agent at the end of a session, e.g. save model'''
@@ -88,12 +88,18 @@ def __init__(self, spec, aeb_space):
self.agents = [Agent(a_spec, self, a)
for a, a_spec in enumerate(spec['agent'])]

def post_body_init(self):
'''Run init for components that need bodies to exist first, e.g. memory or architecture.'''
for agent in self.agents:
agent.post_body_init()

def get(self, a):
return self.agents[a]

def reset(self):
for agent in self.agents:
agent.reset()
def reset(self, state_space):
for a, agent in enumerate(self.agents):
state = state_space.get(a=a)
agent.reset(state)

def act(self, state_space):
action_proj = []
@@ -104,12 +110,13 @@ def act(self, state_space):
action_space = self.aeb_space.add('action', action_proj)
return action_space

def update(self, reward_space, state_space, done_space):
def update(self, action_space, reward_space, state_space, done_space):
for a, agent in enumerate(self.agents):
action = action_space.get(a=a)
reward = reward_space.get(a=a)
state = state_space.get(a=a)
done = done_space.get(a=a)
agent.update(reward, state, done)
agent.update(action, reward, state, done)

def close(self):
for agent in self.agents:
1 change: 1 addition & 0 deletions slm_lab/agent/algorithm/__init__.py
@@ -6,3 +6,4 @@

# expose all the classes
from .random import *
from .dqn import *
60 changes: 60 additions & 0 deletions slm_lab/agent/algorithm/algorithm_util.py
@@ -0,0 +1,60 @@
import numpy as np
import torch
from torch.autograd import Variable

'''Functions used by more than one algorithm'''


def act_with_epsilon_greedy(net, state, epsilon):
'''
With probability epsilon select a random action,
otherwise select the action associated with the
largest q value
'''
# TODO store one hot
# TODO discrete int
a_dim = net.out_dim
print(f'epsilon {epsilon}')
if epsilon > np.random.rand():
print('random action')
action = np.random.randint(a_dim)
else:
print('net action')
torch_state = Variable(torch.from_numpy(state).float())
out = net.wrap_eval(torch_state)
action = int(torch.max(out, dim=0)[1][0])
return action


def act_with_boltzmann(net, state, tau):
# TODO implement act_with_boltzmann
pass


def act_with_gaussian(net, state, stddev):
# TODO implement act_with_gaussian
pass


def update_epsilon_greedy(net, state, stddev):
# TODO implement act_with_gaussian
pass


def update_boltzmann(net, state, stddev):
# TODO implement act_with_gaussian
pass


def update_gaussian(net, state, stddev):
# TODO implement act_with_gaussian
pass


act_fns = {'epsilon_greedy': act_with_epsilon_greedy,
'boltzmann': act_with_boltzmann,
'gaussian': act_with_gaussian}

update_fns = {'epsilon_greedy': update_epsilon_greedy,
'boltzmann': update_boltzmann,
'gaussian': update_gaussian}
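
As a side note, `act_with_boltzmann` and its `update_*` companions are still stubs above. One possible shape for the Boltzmann case, matching the `(net, state, tau)` signature that `act_fns` expects and reusing `net.wrap_eval` the same way `act_with_epsilon_greedy` does, is a softmax over Q-values with temperature `tau`. This is only an illustrative sketch, not the committed implementation, and the helper name is hypothetical:

```
import numpy as np
import torch
from torch.autograd import Variable


def act_with_boltzmann_sketch(net, state, tau):
    '''Hypothetical Boltzmann selection: sample an action with probability
    proportional to exp(Q(s, a) / tau); higher tau gives a flatter distribution.'''
    torch_state = Variable(torch.from_numpy(state).float())
    out = net.wrap_eval(torch_state)  # Q-values per action, as in act_with_epsilon_greedy
    q = out.data.numpy() / tau
    probs = np.exp(q - np.max(q))  # subtract the max for numerical stability
    probs = probs / probs.sum()
    return int(np.random.choice(len(probs), p=probs))
```

A matching `update_boltzmann` could then decay `tau` over episodes, much like the epsilon anneal in `dqn.py` below.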
12 changes: 10 additions & 2 deletions slm_lab/agent/algorithm/base.py
@@ -12,12 +12,15 @@ def __init__(self, agent):
self.agent = agent

@abstractmethod
def post_body_init(self):
'''Initializes the part of algorithm needing a body to exist first.'''
raise NotImplementedError

def body_act_discrete(self, body, body_state):
'''Implement atomic discrete body_action, or throw NotImplementedError. E.g. fetch body_action from net given body info.'''
raise NotImplementedError
return body_action

@abstractmethod
def body_act_continuous(self, body, body_state):
'''Implement atomic continuous body_action, or throw NotImplementedError. E.g. fetch body_action from net given body info.'''
raise NotImplementedError
@@ -40,6 +43,11 @@ def act(self, state):
return action

@abstractmethod
def update(self, reward, state, done):
def train(self):
'''Implement algorithm train, or throw NotImplementedError'''
raise NotImplementedError

@abstractmethod
def update(self):
'''Implement algorithm update, or throw NotImplementedError'''
raise NotImplementedError
115 changes: 115 additions & 0 deletions slm_lab/agent/algorithm/dqn.py
@@ -0,0 +1,115 @@
import numpy as np
import torch
from torch.autograd import Variable
from slm_lab.agent.algorithm.base import Algorithm
from slm_lab.agent.algorithm.algorithm_util import act_fns, update_fns
from slm_lab.agent.net import nets
from slm_lab.agent.memory import Replay


class DQNBase(Algorithm):
'''
Implementation of the base DQN algorithm.
See Sergey Levine's lecture xxx for more details
TODO add link
more detailed comments
net: instance of an slm_lab/agent/net
memory: instance of an slm_lab/agent/memory
batch_size: how many examples from memory to sample at each training step
action_selection: function (from algorithm_util.py) that determines how to select actions
gamma: Real number in range [0, 1]. Determines how much to discount the future
state_dim: dimension of the state space
action_dim: dimensions of the action space
'''

def __init__(self, agent):
super(DQNBase, self).__init__(agent)

def post_body_init(self):
'''Initializes the part of algorithm needing a body to exist first.'''
# TODO generalize
default_body = self.agent.bodies[0]
state_dim = default_body.state_dim
action_dim = default_body.action_dim
net_spec = self.agent.spec['net']
net_spec['net_layer_params'][0] = state_dim
net_spec['net_layer_params'][-1] = action_dim
self.net = nets[net_spec['net_type']](
*net_spec['net_layer_params'],
*net_spec['net_other_params'])
# TODO three nets for different part of Q function
# In base algorithm should all be pointer to the same net - then update compute q target values and action functions
self.batch_size = net_spec['batch_size']
self.gamma = net_spec['gamma']

algorithm_spec = self.agent.spec['algorithm']
self.action_selection = act_fns[algorithm_spec['action_selection']]

# explore_var is epsilon, tau or etc.
self.explore_var_start = algorithm_spec['explore_var_start']
self.explore_var_end = algorithm_spec['explore_var_end']
self.explore_var = self.explore_var_start
self.explore_anneal_epi = algorithm_spec['explore_anneal_epi']
self.training_iters_per_batch = 1
self.training_frequency = 1

def compute_q_target_values(self, batch):
# Make future reward 0 if the current state is done
float_data_list = [
'states', 'actions', 'rewards', 'dones', 'next_states']
for k in float_data_list:
batch[k] = Variable(torch.from_numpy(batch[k]).float())
# print('batch')
# print(batch['states'])
# print(batch['actions'])
# print(batch['rewards'])
# print(batch['dones'])
# print(1 - batch['dones'])
q_vals = self.net.wrap_eval(batch['states'])
# print(f'q_vals {q_vals}')
q_targets_all = batch['rewards'].data + self.gamma * \
torch.mul((1 - batch['dones'].data),
self.net.wrap_eval(batch['next_states']))
# print(f'q_targets_all {q_targets_all}')
q_targets_max, _ = torch.max(q_targets_all, dim=1)
# print(f'q_targets_max {q_targets_max}')
# print(f'q_targets_all size {q_targets_all.size()}')

# We only want to train the network for the action selected
# For all other actions we set the q_target = q_vals
# So that the loss for these actions is 0
q_targets_max.unsqueeze_(1)
# print(f'q_targets_max {q_targets_max}')
q_targets = torch.mul(q_targets_max, batch['actions'].data) + \
torch.mul(q_vals, (1 - batch['actions'].data))
# print(f'q_targets {q_targets}')
return q_targets

def train(self):
# TODO Fix for training iters, docstring
t = self.agent.agent_space.aeb_space.clock['t']
if t % self.training_frequency == 0:
batch = self.agent.memory.get_batch(self.batch_size)
for i in range(self.training_iters_per_batch):
q_targets = self.compute_q_target_values(batch)
y = Variable(q_targets)
loss = self.net.training_step(batch['states'], y)
print(f'loss {loss.data[0]}\n')
return loss.data[0]
else:
return None

def body_act_discrete(self, body, body_state):
return self.action_selection(
self.net,
body_state,
self.explore_var)

def update(self):
'''Update epsilon or boltzmann for policy after net training'''
epi = self.agent.agent_space.aeb_space.clock['e']
rise = self.explore_var_end - self.explore_var_start
slope = rise / float(self.explore_anneal_epi)
self.explore_var = max(
slope * epi + self.explore_var_start, self.explore_var_end)
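
Two notes on the DQN code above. First, the masking in `compute_q_target_values` treats `batch['actions']` as one-hot vectors (see the "store one hot" TODO in `algorithm_util.py`): the slot of the action actually taken receives the bootstrapped target `r + gamma * max_a' Q(s', a')`, and every other slot keeps the net's own prediction, so those actions contribute zero loss. A standalone numeric sketch of that arithmetic, not part of the diff:

```
import torch

gamma = 0.99
# one transition with 3 actions; action 1 was taken (one-hot), reward 1, not done
q_vals = torch.FloatTensor([[0.2, 0.5, 0.1]])  # current net output Q(s, .)
q_next = torch.FloatTensor([[0.3, 0.9, 0.4]])  # net output Q(s', .)
actions = torch.FloatTensor([[0., 1., 0.]])    # one-hot action taken
rewards = torch.FloatTensor([1.0])
dones = torch.FloatTensor([0.0])

q_targets_all = rewards + gamma * (1 - dones) * q_next  # r + gamma * Q(s', .) when not done
q_targets_max, _ = torch.max(q_targets_all, dim=1)      # max over next actions: 1.891
q_targets_max = q_targets_max.unsqueeze(1)              # shape (1, 1) for broadcasting

# taken action slot <- bootstrapped target; other slots <- current predictions
q_targets = q_targets_max * actions + q_vals * (1 - actions)
print(q_targets)  # [[0.2, 1.891, 0.1]]; only action 1 contributes to the loss
```

Second, the `update()` anneal is linear in the episode count: with hypothetical spec values `explore_var_start = 1.0`, `explore_var_end = 0.1` and `explore_anneal_epi = 30`, epsilon drops by 0.03 per episode and the `max(...)` clamps it at 0.1 from episode 30 onward.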
9 changes: 8 additions & 1 deletion slm_lab/agent/algorithm/random.py
@@ -11,6 +11,10 @@ class Random(Algorithm):
Example Random agent that works in both discrete and continuous envs
'''

def post_body_init(self):
'''Initializes the part of algorithm needing a body to exist first.'''
pass

def body_act_discrete(self, body, body_state):
'''Random discrete action'''
body_action = np.random.randint(body.action_dim)
@@ -21,5 +25,8 @@ def body_act_continuous(self, body, body_state):
body_action = np.random.randn(body.action_dim)
return body_action

def update(self, reward, state, done):
def train(self):
return

def update(self):
return
3 changes: 3 additions & 0 deletions slm_lab/agent/memory/__init__.py
@@ -2,3 +2,6 @@
The memory module
Contains different ways of storing an agent's experiences and sampling from them
'''

# expose all the classes
from .replay import *
