Merge c1951f4 into 2d4a793

liquidcarrot · Nov 6, 2019 · ffcabe6 · ffcabe6
2 parents 2d4a793 + c1951f4
commit ffcabe6
Show file tree

Hide file tree

Showing 7 changed files with 187 additions and 156 deletions.
diff --git a/src/architecture/rl/dqn.js b/src/architecture/rl/dqn.js
@@ -1,74 +1,74 @@
 const architect = require('../architect');
 const Network = require('../network');
-const Window = require("../../util/window");
+const ReplayBuffer = require('./replay-buffer');
+const Experience = require('./experience');
 const Rate = require("../../methods/rate");
 
 
 /**
  * This function will get the value from the fieldName, if Present, otherwise returns the defaultValue
  * @param {Object} opt
  * @param {String} fieldName
- * @param {number|boolean} defaultValue
- * @return {Number | number[]} the value of the fileName if Present, otherwise the defaultValue
+ * @param {number | boolean | number[]} defaultValue
+ * @return {Number | number[] | boolean} the value of the fieldName if present, otherwise the defaultValue
  */
-function getopt(opt, fieldName, defaultValue) {
+function getOption(opt, fieldName, defaultValue) {
   if (typeof opt === 'undefined') {
     return defaultValue;
   }
   return (typeof opt[fieldName] !== 'undefined') ? opt[fieldName] : defaultValue;
 }
 
 /**
-* Creates a DQN network
-*
-* Used to do reinforcement learning
-*
-* @alpha
-*
-* @constructs DQN
-*
-* @param {int} numActions Maximum number of actions the agent can do,
-* @param {int} numStates Length of the state array
-* @param {Object} options Options object
-* 
-* @todo Allow underlying Network to have arbitrary layer structure
-* @todo Add test & custom network input / output size validation
+ * Creates a DQN network
+ *
+ * Used to do reinforcement learning
+ *
+ * @alpha
+ *
+ * @constructs DQN
+ *
+ * @param {int} numStates Length of the state array
+ * @param {int} numActions Maximum number of actions the agent can do
+ * @param {Object} options Options object
+ *
+ * @todo Allow underlying Network to have arbitrary layer structure
+ * @todo Add test & custom network input / output size validation
+ * @todo Maybe automatically suggest default values for the num of states and actions
 */
-function DQN(numActions, numStates, opt) {
+function DQN(numStates, numActions, options) {
   // Network Sizing
-  this.numStates = numStates;
   this.numActions = numActions;
-  this.hiddenNeurons = getopt(opt, 'hiddenNeurons', [10]);
-  this.network = getopt(opt, 'network', new architect.Perceptron(numStates, ...this.hiddenNeurons, numActions));
+  this.hiddenNeurons = getOption(options, 'hiddenNeurons', [10]);
+  this.network = getOption(options, 'network', new architect.Perceptron(numStates, ...this.hiddenNeurons, numActions));
 
   // Network & state memory
   this.reward = null;
   this.state = null;
   this.nextState = null;
   this.action = null;
-  this.nextAction = null;
 
   // Learning and update
-  this.learningRate = getopt(opt, 'learningRate', 0.1); // AKA alpha value function learning rate
-  this.learningRateDecay = getopt(opt, 'learningRateDecay', 0.99); // AKA alpha value function learning rate
-  this.learningRateMin = getopt(opt, 'learningRateMin', 0.01); // AKA alpha value function learning rate
+  this.learningRate = getOption(options, 'learningRate', 0.1); // AKA alpha value function learning rate
+  this.learningRateDecay = getOption(options, 'learningRateDecay', 0.99); // AKA alpha value function learning rate
+  this.learningRateMin = getOption(options, 'learningRateMin', 0.01); // AKA alpha value function learning rate
   this.loss = 0;
-  this.tderrorClamp = getopt(opt, 'tderrorClamp', 1);
-  this.isTraining = getopt(opt, 'isTraining', true);
+  this.tderrorClamp = getOption(options, 'tderrorClamp', 1);
+  this.isTraining = getOption(options, 'isTraining', true);
 
   // Experience Replay
-  let experienceSize = getopt(opt, 'experience_size', 50000); // size of experience replay
-  this.experience = new Window(experienceSize, true); // experience
-  this.learningStepsPerIteration = getopt(opt, 'learning_steps_per_iteration', 20); // number of time steps before we add another experience to replay memory
+  let experienceSize = getOption(options, 'experience_size', 50000); // size of experience replay
+  this.experience = new ReplayBuffer(experienceSize); // experience
+  this.learningStepsPerIteration = getOption(options, 'learning_steps_per_iteration', 20); // number of time steps before we add another experience to replay memory
   this.timeStep = 0;
 
   // Exploration / Exploitation management
-  this.explore = getopt(opt, 'explore', 0.05); // AKA epsilon for epsilon-greedy policy
-  this.exploreDecay = getopt(opt, 'exploreDecay', 0.99); // AKA epsilon for epsilon-greedy policy
-  this.exploreMin = getopt(opt, 'exploreMin', 0); // AKA epsilon for epsilon-greedy policy
+  this.explore = getOption(options, 'explore', 0.3); // AKA epsilon for epsilon-greedy policy
+  this.exploreDecay = getOption(options, 'exploreDecay', 0.9999); // AKA epsilon for epsilon-greedy policy
+  this.exploreMin = getOption(options, 'exploreMin', 0.01); // AKA epsilon for epsilon-greedy policy
 
   // Reward calculation
-  this.gamma = getopt(opt, 'gamma', 0.1); // future reward discount factor
+  this.gamma = getOption(options, 'gamma', 0.7); // future reward discount factor
 }
 
 DQN.prototype = {
@@ -78,7 +78,7 @@ DQN.prototype = {
    * @function toJSON
    * @memberof DQN
    *
-   * @return {JSON} JSON String which represents the current DQN agent
+   * @return {{net:{input:{number},output:{number},dropout:{number},nodes:Array<object>,connections:Array<object>},gamma:{number},explore:{number},exploreDecay:{number},exploreMin:{number},learningRate:{number},learningRateDecay:{number},learningRateMin:{number},isTraining:{boolean},experience:{ReplayBuffer}}} JSON object which represents this DQN agent
    */
   toJSON: function () {
     let json = {};
@@ -109,16 +109,16 @@ DQN.prototype = {
    * @memberof DQN
    *
    * @param {number[]} state current state (float arr with values between 0 and 1)
-   * @returns {number} The action which the DQN would take at this state (represented by an index)
+   * @return {number} The action which the DQN would take at this state (represented by an index)
    *
    * @todo Add ability to select strategies
    * @todo Add Thompson Sampling strategy
    */
   act: function (state) {
-    // epsilon greedy strategy | explore > random = explore; else exploit
+    // epsilon greedy strategy | explore > random ? explore : otherwise exploit
     const action = (Math.max(this.exploreMin, Rate.EXP(this.explore, this.timeStep, {gamma: this.exploreDecay})) > Math.random())
       ? Math.floor(Math.random() * this.numActions) // random "explore" action
-      : this.getMaxValueIndex(this.network.activate(state)) // deliberate "exploit" action
+      : this.getMaxValueIndex(this.network.activate(state)); // deliberate "exploit" action
 
     // shift state memory
     this.state = this.nextState;
@@ -136,22 +136,27 @@ DQN.prototype = {
    * @memberof DQN
    *
    * @param {number} newReward the current reward, the agent receives from the environment
+   * @param {boolean} [isFinalState=false] Does the game end at this state?
    * @returns {number} the loss value
    *
    * @todo Add prioritized experience replay
    * @todo Add hindsight experience replay
    */
-  learn: function (newReward) {
+  learn: function(newReward, isFinalState = false) {
     // Update Q function | temporal difference method currently hardcoded
     if (this.reward != null && this.isTraining) {
+      let experience = new Experience(this.state, this.action, this.reward, this.nextState, isFinalState);
       // Learn from current estimated reward to understand how wrong agent is
-      this.loss = this.study(this.state, this.action, this.reward, this.nextState);
+      let loss = this.study(experience);
+      experience.loss = loss;
+      this.loss = loss;
 
       // Too random, should pick experiences by their loss value
-      this.experience.add([this.state, this.action, this.reward, this.nextState, this.loss]);
+      this.experience.add(experience);
 
-      for (let i = 0; i < this.learningStepsPerIteration; i++) {
-        this.study(...this.experience.pickRandom());
+      let miniBatch = this.experience.getRandomMiniBatch(this.learningStepsPerIteration);
+      for (let i = 0; i < miniBatch.length; i++) {
+        this.study(miniBatch[i]);
       }
     }
     this.timeStep++;
@@ -165,35 +170,36 @@ DQN.prototype = {
    * @function study
    * @memberof DQN
    *
-   * @param {number[]} state current state
-   * @param {number} action action taken in current state
-   * @param {number} reward reward received for the action in the current state
-   * @param {number[]} nextState the state which follows the current state with the action taken
+   * @param {Experience} experience the experience to learn from
    * @returns {number} TDError Roughly, an experiential measure of surprise / insight for the network at this state-action.
    *
    * @todo Add dynamic loss functions & clamps, including Huber Loss
    * @todo Add target network to increase reliability
    * @todo Consider not using a target network: https://www.ijcai.org/proceedings/2019/0379.pdf
    */
-  study: function (state, action, reward, nextState) {
+  study: function(experience) {
     // Compute target Q value, called without traces so it won't affect backprop
-    const nextActions = this.network.activate(nextState, {no_trace: true});
+    const nextActions = this.network.activate(experience.nextState, {no_trace: true});
 
     // Q(s,a) = r + gamma * max_a' Q(s',a')
-    const targetQValue = (1 + reward) / 2 + this.gamma * nextActions[this.getMaxValueIndex(nextActions)];
+    let normalizedReward = (1 + experience.reward) / 2;
+    let targetQValue;
+    targetQValue = experience.isFinalState
+      ? normalizedReward
+      : normalizedReward + this.gamma * nextActions[this.getMaxValueIndex(nextActions)];
 
     // Predicted current reward | called with traces for backprop later
-    const predictedReward = this.network.activate(state);
+    const predictedReward = this.network.activate(experience.state);
 
-    let tdError = predictedReward[action] - targetQValue;
+    let tdError = predictedReward[experience.action] - targetQValue;
 
-    // Clamp error for robustness | ToDo: huber loss
+    // Clamp error for robustness
     if (Math.abs(tdError) > this.tderrorClamp) {
       tdError = tdError > this.tderrorClamp ? this.tderrorClamp : -this.tderrorClamp;
     }
 
     // Backpropagation using temporal difference error
-    predictedReward[action] = targetQValue;
+    predictedReward[experience.action] = targetQValue;
     this.network.propagate(Math.max(this.learningRateMin, Rate.EXP(this.learningRate, this.timeStep, {gamma: this.learningRateDecay})), 0, true, predictedReward);
     return tdError;
   },
@@ -219,19 +225,6 @@ DQN.prototype = {
       }
     }
     return index;
-  },
-
-  /**
-   * Setter for variable "isTraining"
-   *
-   * @function setTraining
-   * @memberof DQN
-   *
-   * @param val new value
-   * @todo Consider removing
-   */
-  setTraining: function (val) {
-    this.isTraining = val;
   }
 };
 
@@ -241,12 +234,12 @@ DQN.prototype = {
  * @function fromJSON
  * @memberof DQN
  *
- * @param {JSON} json  JSON String
+ * @param {{net:{input:{number},output:{number},dropout:{number},nodes:Array<object>,connections:Array<object>},gamma:{number},explore:{number},exploreDecay:{number},exploreMin:{number},learningRate:{number},learningRateDecay:{number},learningRateMin:{number},isTraining:{boolean},experience:{ReplayBuffer}}} json JSON object which represents a DQN agent
  * @return {DQN} Agent with the specs from the json
  */
 DQN.fromJSON = function (json) {
-  let network = Network.fromJSON(json);
-  let agent = new DQN(network.input_size, network.output_size, {});
+  let network = Network.fromJSON(json.net);
+  let agent = new DQN(network.input_size, network.output_size, {network: network});
 
   agent.gamma = json.gamma;
   agent.explore = json.explore;

diff --git a/src/architecture/rl/experience.js b/src/architecture/rl/experience.js
@@ -0,0 +1,20 @@
+/**
+ * Creates an experience object: one stored transition, with its `loss` initialised to 0.
+ *
+ * @param state the current state
+ * @param action the action taken in the current state
+ * @param reward the reward for the current action in the current state
+ * @param nextState the state that follows from taking the current action in the current state
+ * @param isFinalState Does the game end at this state?
+ * @constructor
+ */
+function Experience(state, action, reward, nextState, isFinalState) {
+  this.state = state;
+  this.action = action;
+  this.reward = reward;
+  this.nextState = nextState;
+  this.isFinalState = isFinalState;
+  this.loss = 0;
+}
+
+module.exports = Experience;
diff --git a/src/architecture/rl/replay-buffer.js b/src/architecture/rl/replay-buffer.js
@@ -0,0 +1,52 @@
+const Experience = require('./experience');
+
+/**
+ * Creates an initially empty replay buffer that remembers its maximum number of experience entries.
+ *
+ * @param {number} maxSize maximum number of experiences kept in the buffer
+ * @constructor
+ */
+function ReplayBuffer(maxSize) {
+  this.buffer = [];
+  this.maxSize = maxSize;
+}
+
+ReplayBuffer.prototype = {
+  /**
+   * Adds an experience entry to the end of the buffer; when the buffer is already at maxSize, the oldest entry is dropped first.
+   *
+   * @param {Experience} experience the experience to add
+   */
+  add: function(experience) {
+    if (this.buffer.length >= this.maxSize) {
+      this.buffer.shift();
+    }
+    this.buffer.push(experience);
+  },
+
+  /**
+   * Get a random mini batch of at most the given size, drawn without replacement from the buffer.
+   *
+   * @param {number} size the requested size of the minibatch (clamped to the current buffer length).
+   *
+   * @returns {Experience[]} a batch of Experiences to train from.
+   */
+  getRandomMiniBatch: function(size) {
+    // Clamp size to this.buffer.length. NOTE(review): when the whole buffer is requested, the internal array itself is returned (unshuffled, not a copy).
+    size = Math.min(size, this.buffer.length);
+    if (size === this.buffer.length) {
+      return this.buffer;
+    }
+
+    let bufferCopy = [...this.buffer];
+    let batch = [];
+
+    for (let i = 0; i < size; i++) {
+      // Move one randomly chosen experience from bufferCopy into the batch, so it cannot be picked twice
+      batch.push(...bufferCopy.splice(Math.floor(Math.random() * bufferCopy.length), 1));
+    }
+    return batch;
+  },
+};
+
+module.exports = ReplayBuffer;
diff --git a/src/util/window.js b/src/util/window.js