
Commit

Merge fef8921 into 0874fff
raimannma committed Mar 6, 2020
2 parents 0874fff + fef8921 commit 501893c
Showing 25 changed files with 22,714 additions and 16,779 deletions.
8,803 changes: 5,067 additions & 3,736 deletions dist/carrot.amd.js

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions dist/carrot.amd.min.js

Large diffs are not rendered by default.

8,803 changes: 5,067 additions & 3,736 deletions dist/carrot.commonjs2.js

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions dist/carrot.commonjs2.min.js

Large diffs are not rendered by default.

8,803 changes: 5,067 additions & 3,736 deletions dist/carrot.umd2.js

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions dist/carrot.umd2.min.js

Large diffs are not rendered by default.

8,803 changes: 5,067 additions & 3,736 deletions dist/carrot.window.js

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions dist/carrot.window.min.js

Large diffs are not rendered by default.

1,200 changes: 694 additions & 506 deletions package-lock.json

Large diffs are not rendered by default.

15 changes: 8 additions & 7 deletions package.json
@@ -67,23 +67,24 @@
},
"homepage": "https://liquidcarrot.io/",
"dependencies": {
"lodash": "^4.17.15"
"lodash": "^4.17.15",
"riteway": "^6.1.1"
},
"devDependencies": {
"chai": "^4.2.0",
"chai-as-promised": "^7.1.1",
"chai-each": "0.0.1",
"chalk": "^2.4.2",
"copy-webpack-plugin": "^5.0.4",
"coveralls": "^3.0.6",
"copy-webpack-plugin": "^5.1.1",
"coveralls": "^3.0.9",
"faker": "^4.1.0",
"jsdoc": "^3.6.3",
"mocha": "^6.2.0",
"nodemon": "^1.19.2",
"mocha": "^6.2.2",
"nodemon": "^1.19.4",
"nyc": "^13.3.0",
"parallel-webpack": "^2.4.0",
"webpack": "^4.41.0",
"webpack-cli": "^3.3.9"
"webpack": "^4.41.2",
"webpack-cli": "^3.3.10"
},
"nyc": {
"include": [
1 change: 1 addition & 0 deletions src/architecture/connection.js
@@ -49,6 +49,7 @@ function Connection (from, to, weight, options) {
delta_weights_total: 0,
delta_weights: [],
xtrace_nodes: [],
sharedIncoming: null,
xtrace_values: []
}, options, { from, to, weight});

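The only change here adds a `sharedIncoming` field that defaults to `null`. In the `node.js` changes further down, propagation only applies the accumulated weight delta to a connection whose `sharedIncoming` is `null`, so a non-null value effectively ties the connection's weight to another source. A minimal sketch of that idea, with plain objects standing in for `Connection` instances (the `shareWeight` helper is illustrative, not part of this commit):

```js
// Minimal sketch of tied weights via the new `sharedIncoming` field.
// node.js only runs `connection.weight += delta` when sharedIncoming is null;
// how the shared value is synced back is left to the caller (not shown in this diff).
function shareWeight(connection, source) {
  connection.sharedIncoming = source; // mark the connection as tied
  connection.weight = source.weight;  // start from the shared value
  return connection;
}

const source = { weight: 0.4, sharedIncoming: null };
const tied = shareWeight({ weight: 0, sharedIncoming: null }, source);
console.log(tied.weight); // 0.4, and weight updates now flow only through `source`
```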
989 changes: 417 additions & 572 deletions src/architecture/network.js

Large diffs are not rendered by default.

33 changes: 23 additions & 10 deletions src/architecture/node.js
@@ -62,11 +62,12 @@ function Node(options) {
incoming: [],
outgoing: [],
gated: [],
sharedIncoming: null,
connections_self: new Connection(self, self, 0),
error_responsibility: 0,
error_projected: 0,
error_gated: 0,
...options
...options,
})

/**
@@ -116,6 +117,9 @@ function Node(options) {
// DRY abstraction
const activate = function() {
// Activate (from self)
if (self.sharedIncoming !== null) {
self.bias = self.sharedIncoming.bias;
}
self.state = self.connections_self.gain * self.connections_self.weight * self.state + self.bias;

// Activate (from incoming connections)
@@ -124,7 +128,7 @@
self.state += conn.from.activation * conn.weight * conn.gain;
}

return self.state
return self.state;
}

if(options.trace) {
@@ -281,7 +285,6 @@ function Node(options) {
self.error_projected = 0;
for (let i = 0; i < self.outgoing.length; i++) {
const connection = self.outgoing[i];

self.error_projected += connection.to.error_responsibility * connection.weight * connection.gain;
}
self.error_projected *= self.derivative || 1;
@@ -293,7 +296,7 @@
const node = connection.to;
const influence = (node.connections_self.gater === self ? node.old : 0) + connection.weight * connection.from.activation;

self.error_gated += node.error_reponsibility * influence;
self.error_gated += node.error_responsibility * influence;
}
self.error_gated *= self.derivative || 1;

@@ -314,7 +317,9 @@
connection.delta_weights_total += options.rate * gradient * self.mask;
if (options.update) {
connection.delta_weights_total += options.momentum * connection.delta_weights_previous;
connection.weight += connection.delta_weights_total;
if (connection.sharedIncoming === null) {
connection.weight += connection.delta_weights_total;
}
connection.delta_weights_previous = connection.delta_weights_total;
connection.delta_weights_total = 0;
}
@@ -324,7 +329,9 @@
self.delta_bias_total += options.rate * self.error_responsibility;
if (options.update) {
self.delta_bias_total += options.momentum * self.delta_bias_previous;
self.bias += self.delta_bias_total;
if (self.sharedIncoming === null) {
self.bias += self.delta_bias_total;
}
self.delta_bias_previous = self.delta_bias_total;
self.delta_bias_total = 0;
}
@@ -699,11 +706,16 @@

switch(options.method) {
case methods.mutation.MOD_ACTIVATION:
if(options.allowed) self.squash = options.allowed[random_index(options.allowed.length, options.allowed.indexOf(self.squash))];
else self.squash = methods.activation[random_key(Object.keys(methods.activation), self.squash.name)]
if (options.allowed) {
self.squash = options.allowed[random_index(options.allowed.length, options.allowed.indexOf(self.squash))];
} else {
self.squash = methods.activation[random_key(Object.keys(methods.activation), self.squash.name)];
}
break;
case methods.mutation.MOD_BIAS:
self.bias += Math.random() * (options.method.max - options.method.min) + options.method.min;
if (self.sharedIncoming === null) {
self.bias += Math.random() * (options.method.max - options.method.min) + options.method.min;
}
break;
}
},
@@ -850,7 +862,8 @@
bias: self.bias,
type: self.type,
squash: self.squash.name,
mask: self.mask
mask: self.mask,
shared: self.shared,
};
}
}
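Taken together, these changes let a node delegate its bias to another node: `activate()` now copies the bias from `sharedIncoming` before computing the state, `propagate()` and the `MOD_BIAS` mutation skip the bias update for such nodes, and `toJSON()` now serializes a `shared` flag. A rough sketch of the resulting behaviour, with plain objects standing in for `Node` instances:

```js
// Sketch of the shared-bias behaviour added in this commit; `leader` and
// `follower` are plain stand-ins for Node instances, only the bias logic is shown.
const leader = { bias: 0.25, sharedIncoming: null };
const follower = { bias: 0, sharedIncoming: leader };

function activateBias(node) {
  // mirrors the new check at the top of activate()
  if (node.sharedIncoming !== null) node.bias = node.sharedIncoming.bias;
  return node.bias;
}

function applyBiasDelta(node, delta) {
  // mirrors propagate(): only nodes that own their bias are updated
  if (node.sharedIncoming === null) node.bias += delta;
}

applyBiasDelta(leader, 0.1);         // leader.bias becomes 0.35
applyBiasDelta(follower, 0.1);       // ignored, the bias is shared
console.log(activateBias(follower)); // 0.35, copied from leader
```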
104 changes: 55 additions & 49 deletions src/architecture/rl/ddpg.js
@@ -1,18 +1,16 @@
const architect = require('../architect');
const Network = require('../network');
const ReplayBuffer = require('./replay-buffer');
const Experience = require('./experience');
const Utils = require('../../util/utils');
const Rate = require('../../methods/rate');
const Loss = require('../../methods/cost');

/**
*
* Creates a DDPG-Agent
*
* Used to do reinforcement learning with an DDPG Agent
*
* @beta
* @alpha
*
* @constructs DDPG
*
@@ -47,39 +45,39 @@ function DDPG(numStates, numActions, options) {
let hiddenNeuronsActor = Utils.RL.getOption(options, 'hiddenNeuronsActor', [10]);
let hiddenNeuronsCritic = Utils.RL.getOption(options, 'hiddenNeuronsCritic', hiddenNeuronsActor);

this.actor = Utils.RL.getOption(options, 'actor', new architect.Perceptron(numStates, hiddenNeuronsActor, numActions));
this.critic = Utils.RL.getOption(options, 'critic', new architect.Perceptron(numStates + numActions, hiddenNeuronsCritic, numActions));
this.actor = Utils.RL.getOption(options, 'actor', new Network.architecture.Perceptron(numStates, ...hiddenNeuronsActor, numActions));
this.critic = Utils.RL.getOption(options, 'critic', new Network.architecture.Perceptron(numStates + numActions, ...hiddenNeuronsCritic, numActions));
this.actorTarget = Utils.RL.getOption(options, 'actorTarget', Network.fromJSON(this.actor.toJSON()));
this.criticTarget = Utils.RL.getOption(options, 'criticTarget', Network.fromJSON(this.critic.toJSON()));

// Experience ("Memory")
let experienceSize = Utils.RL.getOption(options, 'experienceSize', 50000);
let noisyPER = Utils.RL.getOption(options, 'noisyPER', null);
this.replayBuffer = Utils.RL.getOption(options, 'replayBuffer', noisyPER === null
? new ReplayBuffer(experienceSize)
? new ReplayBuffer(experienceSize, 0)
: new ReplayBuffer(experienceSize, noisyPER));
this.learningStepsPerIteration = Utils.RL.getOption(options, 'learningStepsPerIteration', 20);
this.learningStepsPerIteration = Utils.RL.getOption(options, 'learningStepsPerIteration', 100);
this.startLearningThreshold = Utils.RL.getOption(options, 'startLearningThreshold', 0);

// Training specific variables
this.isContinuousTask = Utils.RL.getOption(options, 'isContinuousTask', false);
this.gamma = Utils.RL.getOption(options, 'gamma', 0.7);
this.theta = Utils.RL.getOption(options, 'theta', 0.01); // soft target update
this.criticLoss = Utils.RL.getOption(options, 'criticLoss', Loss.MSE);
this.criticLossOptions = Utils.RL.getOption(options, 'criticLossOptions', {});
this.isTraining = Utils.RL.getOption(options, 'isTraining', true);
this.isUsingPER = Utils.RL.getOption(options, 'isUsingPER', true); // using prioritized experience replay

this.learningRateActor = Utils.RL.getOption(options, 'learningRateActor', 0.1); // AKA alpha value function learning rate
this.learningRateActorDecay = Utils.RL.getOption(options, 'learningRateActorDecay', 0.99); // AKA alpha value function learning rate
this.learningRateActorMin = Utils.RL.getOption(options, 'learningRateActorMin', 0.01); // AKA alpha value function learning rate
this.learningRateActorMin = Utils.RL.getOption(options, 'learningRateActorMin', 0.005); // AKA alpha value function learning rate

this.learningRateCritic = Utils.RL.getOption(options, 'learningRateCritic', this.learningRateActor); // AKA alpha value function learning rate
this.learningRateCriticDecay = Utils.RL.getOption(options, 'learningRateCriticDecay', this.learningRateActorDecay); // AKA alpha value function learning rate
this.learningRateCriticMin = Utils.RL.getOption(options, 'learningRateCriticMin', this.learningRateActorMin); // AKA alpha value function learning rate
this.learningRateCritic = Utils.RL.getOption(options, 'learningRateCritic', 0.1); // AKA alpha value function learning rate
this.learningRateCriticDecay = Utils.RL.getOption(options, 'learningRateCriticDecay', 0.99); // AKA alpha value function learning rate
this.learningRateCriticMin = Utils.RL.getOption(options, 'learningRateCriticMin', 0.05); // AKA alpha value function learning rate

// Exploration / Exploitation management
this.noiseStandardDeviation = Utils.RL.getOption(options, 'noiseStandardDeviation', 0.3); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviationDecay = Utils.RL.getOption(options, 'noiseStandardDeviationDecay', 0.9999); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviationMin = Utils.RL.getOption(options, 'noiseStandardDeviationMin', 0.01); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviation = Utils.RL.getOption(options, 'noiseStandardDeviation', 0.1); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviationDecay = Utils.RL.getOption(options, 'noiseStandardDeviationDecay', 0.99); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviationMin = Utils.RL.getOption(options, 'noiseStandardDeviationMin', 0.05); // AKA epsilon for epsilon-greedy policy

this.timeStep = 0;
this.actions = [];
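The new exploration defaults above (`noiseStandardDeviation` 0.1, decay 0.99, minimum 0.05) drive the schedule used by `act()` further down: the Gaussian noise added to the actor's output shrinks exponentially with the time step but never falls below the minimum. A small sketch of that schedule, assuming `Rate.EXP(base, t, {gamma})` reduces to base * gamma^t:

```js
// Sketch of the exploration-noise schedule as act() appears to use it
// (assumption: Rate.EXP implements plain exponential decay).
function noiseAt(timeStep, { start = 0.1, decay = 0.99, min = 0.05 } = {}) {
  return Math.max(min, start * Math.pow(decay, timeStep));
}

console.log(noiseAt(0));   // 0.1, full exploration at the start
console.log(noiseAt(50));  // ~0.0605
console.log(noiseAt(300)); // 0.05, clamped at noiseStandardDeviationMin
```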
@@ -191,16 +189,29 @@ DDPG.prototype = {
* @memberof DDPG
*
* @param {number[]} state current state (float arr with values from [0,1])
* @return {int} The action which the DQN would take at this state; action ∈ [0, this.numActions-1]
* @param {number[]} prohibitedActions all prohibited actions at this state
* @return {int|number[]} The action which the DQN would take at this state; action ∈ [0, this.numActions-1] or the complete action array with the QValues for continuous tasks
*/
act: function(state) {
act: function(state, prohibitedActions) {
if (prohibitedActions === undefined) {
prohibitedActions = [];
}
let noiseFactor = Math.max(this.noiseStandardDeviationMin, Rate.EXP(this.noiseStandardDeviation, this.timeStep, {gamma: this.noiseStandardDeviationDecay}));
let action = Utils.addGaussianNoiseToNetwork(this.actor, noiseFactor).activate(state);

if (this.startLearningThreshold > this.timeStep) {
for (let i = 0; i < action.length; i++) {
action[i] = Math.random();
}
}

for (let i = 0; i < prohibitedActions.length; i++) {
action[prohibitedActions[i]] = -1;
}
this.actions = action;
this.lastState = this.state;
this.state = state;

return Utils.getMaxValueIndex(action);
return this.isContinuousTask ? action : Utils.getMaxValueIndex(action);
},

/**
@@ -214,27 +225,23 @@
* @returns {number} the loss value; loss ∈ [-1,1]
*/
learn: function(newReward, isFinalState = false) {
// Normalizing newReward:
// newReward ∈ [-1,1] --> normalizedReward ∈ [0,1]
const normalizedReward = (1 + newReward) / 2;

this.timeStep++;
if (this.timeStep === 1 || !this.isTraining) {
if (this.timeStep === 1 || !this.isTraining || this.startLearningThreshold > this.timeStep) {
this.lastReward = newReward;
return 1;
}
let experience = new Experience(this.lastState, this.actions, normalizedReward, this.state, 0, isFinalState);
let experience = new Experience(this.lastState, this.actions, this.lastReward, this.state, 0, isFinalState);
experience.loss = this.study(experience);
this.replayBuffer.add(experience);

let loss = this.study(experience);

let miniBatch = this.isUsingPER
? this.replayBuffer.getMiniBatchWithPER(this.learningStepsPerIteration)
: this.replayBuffer.getRandomMiniBatch(this.learningStepsPerIteration);

for (let i = 0; i < miniBatch.length; i++) {
this.study(miniBatch[i]);
}
return loss;
this.lastReward = newReward;
return experience.loss;
},

/**
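For orientation, this is roughly how the patched `act()`/`learn()` pair would be driven from an environment loop. Everything about the environment is made up for illustration; the only assumptions carried over from this diff are the `DDPG(numStates, numActions, options)` constructor, the method signatures shown above, and that the module exports the constructor:

```js
// Hypothetical training loop for the patched DDPG agent. `env` is a toy stand-in
// for a real task; rewards are assumed to live in [-1, 1] as learn() expects.
const DDPG = require('./src/architecture/rl/ddpg');

const env = {
  reset() { return [Math.random(), Math.random()]; },
  step(action) {
    return {
      nextState: [Math.random(), Math.random()],
      reward: action === 0 ? 0.1 : -0.1,
      done: Math.random() < 0.05,
    };
  },
};

const agent = new DDPG(2, 2, { startLearningThreshold: 50 });

let state = env.reset();
for (let step = 0; step < 1000; step++) {
  const action = agent.act(state); // action index, or a QValue array for continuous tasks
  const { nextState, reward, done } = env.step(action);
  agent.learn(reward, done);
  state = done ? env.reset() : nextState;
}
```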
@@ -247,49 +254,48 @@
* @returns {number} Actor loss value; loss ∈ [-1,1]
*/
study: function(experience) {
if (experience.state === undefined || experience.state === null ||
experience.action === undefined || experience.action === null) {
return 0;
}
let stateActionArr = experience.state.concat(experience.action);
let criticActivation = this.critic.activate(stateActionArr);
this.critic.activate(stateActionArr);
let actorActivation = this.actor.activate(experience.state);
let actorTargetActivation = this.actorTarget.activate(experience.nextState, {trace: false});

let nextQ = this.criticTarget.activate(experience.nextState.concat(this.actorTarget.activate(experience.nextState, {no_trace: true})), {no_trace: true});
let nextQ = this.criticTarget.activate(experience.nextState.concat(actorTargetActivation), {trace: false});
let qPrime = [];
for (let i = 0; i < nextQ.length; i++) {
qPrime.push(experience.isFinalState
? experience.reward
: experience.reward + this.gamma * nextQ[i]);
}

// Learning the actor and critic networks
let criticGradients = criticActivation;
for (let i = 0; i < criticActivation.length; i++) {
criticGradients[i] += this.criticLoss(qPrime[i], criticGradients[i], this.criticLossOptions);
}

let criticLearningRate = Math.max(this.learningRateCriticMin, Rate.EXP(this.learningRateCritic, this.timeStep, {gamma: this.learningRateCriticDecay}));
this.critic.propagate(criticLearningRate, 0, true, criticGradients);
this.critic.propagate(criticLearningRate, 0, true, qPrime);

let policyLoss = Utils.mean(this.critic.activate(experience.state.concat(actorActivation), {no_trace: true}));
actorActivation[Utils.getMaxValueIndex(experience.action)] -= policyLoss;
let policyLoss = -Utils.mean(this.critic.activate(experience.state.concat(actorActivation), {trace: false}));
actorActivation[Utils.getMaxValueIndex(experience.action)] *= policyLoss;

let actorLearningRate = Math.max(this.learningRateActorMin, Rate.EXP(this.learningRateActor, this.timeStep, {gamma: this.learningRateActorDecay}));
this.actor.propagate(actorLearningRate, 0, true, actorActivation);

// Learning the actorTarget and criticTarget networks
let actorParameters = this.actor.activate(experience.state, {no_trace: true});
let actorParameters = this.actor.activate(experience.state, {trace: false});
let criticParameters = this.critic.activate(stateActionArr, {trace: false});
let actorTargetParameters = this.actorTarget.activate(experience.state);
let criticParameters = this.critic.activate(stateActionArr, {no_trace: true});
let criticTargetParameters = this.criticTarget.activate(stateActionArr);
for (let i = 0; i < actorParameters.length; i++) {
actorTargetParameters[i] *= this.theta * actorParameters[i] + (1 - this.theta);

for (let i = 0; i < actorTargetParameters.length; i++) {
actorTargetParameters[i] = this.theta * actorParameters[i] + (1 - this.theta) * actorTargetParameters[i];
}
for (let i = 0; i < criticParameters.length; i++) {
criticTargetParameters[i] *= this.theta * criticParameters[i] + (1 - this.theta);
for (let i = 0; i < criticTargetParameters.length; i++) {
criticTargetParameters[i] = this.theta * criticParameters[i] + (1 - this.theta) * criticTargetParameters[i];
}

//Learning rate of 1 --> copy parameters
//Learning rate of 1 --> copy parameter
this.actorTarget.propagate(1, 0, true, actorTargetParameters);
this.criticTarget.propagate(1, 0, true, criticTargetParameters);

return policyLoss;
},
};
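In `study()`, the critic is trained toward a one-step temporal-difference target: the stored reward alone for terminal transitions, otherwise the reward plus `gamma` times the target critic's estimate for the next state. A worked sketch of that `qPrime` computation:

```js
// Sketch of the TD target built in study(); the numbers are made up for illustration.
function criticTarget(reward, nextQ, gamma, isFinalState) {
  return nextQ.map(q => (isFinalState ? reward : reward + gamma * q));
}

console.log(criticTarget(0.75, [0.4, 0.6], 0.7, false)); // roughly [ 1.03, 1.17 ]
console.log(criticTarget(0.75, [0.4, 0.6], 0.7, true));  // [ 0.75, 0.75 ]
```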
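The target networks now track the online networks with a proper soft (Polyak) update: each target value becomes `theta * online + (1 - theta) * target`, replacing the earlier `*=` form, and the blended values are pushed into the target networks with a learning rate of 1. A small sketch of that blend:

```js
// Sketch of the corrected soft target update; theta is this.theta (0.01 by default)
// and plain arrays stand in for the networks' parameters.
function softUpdate(targetParams, onlineParams, theta) {
  return targetParams.map((t, i) => theta * onlineParams[i] + (1 - theta) * t);
}

console.log(softUpdate([0.5, -0.2], [1.0, 0.0], 0.01)); // roughly [ 0.505, -0.198 ]
```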