
Commit

Merge fef8921 into 0874fff
raimannma committed Mar 6, 2020
2 parents 0874fff + fef8921 commit 501893c
Showing 25 changed files with 22,714 additions and 16,779 deletions.
8,803 changes: 5,067 additions & 3,736 deletions dist/carrot.amd.js

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions dist/carrot.amd.min.js

Large diffs are not rendered by default.

8,803 changes: 5,067 additions & 3,736 deletions dist/carrot.commonjs2.js

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions dist/carrot.commonjs2.min.js

Large diffs are not rendered by default.

8,803 changes: 5,067 additions & 3,736 deletions dist/carrot.umd2.js

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions dist/carrot.umd2.min.js

Large diffs are not rendered by default.

8,803 changes: 5,067 additions & 3,736 deletions dist/carrot.window.js

Large diffs are not rendered by default.

12 changes: 7 additions & 5 deletions dist/carrot.window.min.js

Large diffs are not rendered by default.

1,200 changes: 694 additions & 506 deletions package-lock.json

Large diffs are not rendered by default.

15 changes: 8 additions & 7 deletions package.json
@@ -67,23 +67,24 @@
},
"homepage": "https://liquidcarrot.io/",
"dependencies": {
"lodash": "^4.17.15"
"lodash": "^4.17.15",
"riteway": "^6.1.1"
},
"devDependencies": {
"chai": "^4.2.0",
"chai-as-promised": "^7.1.1",
"chai-each": "0.0.1",
"chalk": "^2.4.2",
"copy-webpack-plugin": "^5.0.4",
"coveralls": "^3.0.6",
"copy-webpack-plugin": "^5.1.1",
"coveralls": "^3.0.9",
"faker": "^4.1.0",
"jsdoc": "^3.6.3",
"mocha": "^6.2.0",
"nodemon": "^1.19.2",
"mocha": "^6.2.2",
"nodemon": "^1.19.4",
"nyc": "^13.3.0",
"parallel-webpack": "^2.4.0",
"webpack": "^4.41.0",
"webpack-cli": "^3.3.9"
"webpack": "^4.41.2",
"webpack-cli": "^3.3.10"
},
"nyc": {
"include": [
1 change: 1 addition & 0 deletions src/architecture/connection.js
@@ -49,6 +49,7 @@ function Connection (from, to, weight, options) {
delta_weights_total: 0,
delta_weights: [],
xtrace_nodes: [],
sharedIncoming: null,
xtrace_values: []
}, options, { from, to, weight});

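The only change here adds a `sharedIncoming` field that defaults to `null`. In the `node.js` changes further down, propagation only applies the accumulated weight delta to a connection whose `sharedIncoming` is `null`, so a non-null value effectively ties the connection's weight to another source. A minimal sketch of that idea, with plain objects standing in for `Connection` instances (the `shareWeight` helper is illustrative, not part of this commit):

```js
// Minimal sketch of tied weights via the new `sharedIncoming` field.
// node.js only runs `connection.weight += delta` when sharedIncoming is null;
// how the shared value is synced back is left to the caller (not shown in this diff).
function shareWeight(connection, source) {
  connection.sharedIncoming = source; // mark the connection as tied
  connection.weight = source.weight;  // start from the shared value
  return connection;
}

const source = { weight: 0.4, sharedIncoming: null };
const tied = shareWeight({ weight: 0, sharedIncoming: null }, source);
console.log(tied.weight); // 0.4, and weight updates now flow only through `source`
```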
989 changes: 417 additions & 572 deletions src/architecture/network.js

Large diffs are not rendered by default.

33 changes: 23 additions & 10 deletions src/architecture/node.js
@@ -62,11 +62,12 @@ function Node(options) {
incoming: [],
outgoing: [],
gated: [],
sharedIncoming: null,
connections_self: new Connection(self, self, 0),
error_responsibility: 0,
error_projected: 0,
error_gated: 0,
...options
...options,
})

/**
@@ -116,6 +117,9 @@ function Node(options) {
// DRY abstraction
const activate = function() {
// Activate (from self)
if (self.sharedIncoming !== null) {
self.bias = self.sharedIncoming.bias;
}
self.state = self.connections_self.gain * self.connections_self.weight * self.state + self.bias;

// Activate (from incoming connections)
@@ -124,7 +128,7 @@
self.state += conn.from.activation * conn.weight * conn.gain;
}

return self.state
return self.state;
}

if(options.trace) {
@@ -281,7 +285,6 @@ function Node(options) {
self.error_projected = 0;
for (let i = 0; i < self.outgoing.length; i++) {
const connection = self.outgoing[i];

self.error_projected += connection.to.error_responsibility * connection.weight * connection.gain;
}
self.error_projected *= self.derivative || 1;
@@ -293,7 +296,7 @@
const node = connection.to;
const influence = (node.connections_self.gater === self ? node.old : 0) + connection.weight * connection.from.activation;

self.error_gated += node.error_reponsibility * influence;
self.error_gated += node.error_responsibility * influence;
}
self.error_gated *= self.derivative || 1;

@@ -314,7 +317,9 @@
connection.delta_weights_total += options.rate * gradient * self.mask;
if (options.update) {
connection.delta_weights_total += options.momentum * connection.delta_weights_previous;
connection.weight += connection.delta_weights_total;
if (connection.sharedIncoming === null) {
connection.weight += connection.delta_weights_total;
}
connection.delta_weights_previous = connection.delta_weights_total;
connection.delta_weights_total = 0;
}
@@ -324,7 +329,9 @@
self.delta_bias_total += options.rate * self.error_responsibility;
if (options.update) {
self.delta_bias_total += options.momentum * self.delta_bias_previous;
self.bias += self.delta_bias_total;
if (self.sharedIncoming === null) {
self.bias += self.delta_bias_total;
}
self.delta_bias_previous = self.delta_bias_total;
self.delta_bias_total = 0;
}
@@ -699,11 +706,16 @@

switch(options.method) {
case methods.mutation.MOD_ACTIVATION:
if(options.allowed) self.squash = options.allowed[random_index(options.allowed.length, options.allowed.indexOf(self.squash))];
else self.squash = methods.activation[random_key(Object.keys(methods.activation), self.squash.name)]
if (options.allowed) {
self.squash = options.allowed[random_index(options.allowed.length, options.allowed.indexOf(self.squash))];
} else {
self.squash = methods.activation[random_key(Object.keys(methods.activation), self.squash.name)];
}
break;
case methods.mutation.MOD_BIAS:
self.bias += Math.random() * (options.method.max - options.method.min) + options.method.min;
if (self.sharedIncoming === null) {
self.bias += Math.random() * (options.method.max - options.method.min) + options.method.min;
}
break;
}
},
@@ -850,7 +862,8 @@
bias: self.bias,
type: self.type,
squash: self.squash.name,
mask: self.mask
mask: self.mask,
shared: self.shared,
};
}
}
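Taken together, these changes let a node delegate its bias to another node: `activate()` now copies the bias from `sharedIncoming` before computing the state, `propagate()` and the `MOD_BIAS` mutation skip the bias update for such nodes, and `toJSON()` now serializes a `shared` flag. A rough sketch of the resulting behaviour, with plain objects standing in for `Node` instances:

```js
// Sketch of the shared-bias behaviour added in this commit; `leader` and
// `follower` are plain stand-ins for Node instances, only the bias logic is shown.
const leader = { bias: 0.25, sharedIncoming: null };
const follower = { bias: 0, sharedIncoming: leader };

function activateBias(node) {
  // mirrors the new check at the top of activate()
  if (node.sharedIncoming !== null) node.bias = node.sharedIncoming.bias;
  return node.bias;
}

function applyBiasDelta(node, delta) {
  // mirrors propagate(): only nodes that own their bias are updated
  if (node.sharedIncoming === null) node.bias += delta;
}

applyBiasDelta(leader, 0.1);         // leader.bias becomes 0.35
applyBiasDelta(follower, 0.1);       // ignored, the bias is shared
console.log(activateBias(follower)); // 0.35, copied from leader
```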
104 changes: 55 additions & 49 deletions src/architecture/rl/ddpg.js
@@ -1,18 +1,16 @@
const architect = require('../architect');
const Network = require('../network');
const ReplayBuffer = require('./replay-buffer');
const Experience = require('./experience');
const Utils = require('../../util/utils');
const Rate = require('../../methods/rate');
const Loss = require('../../methods/cost');

/**
*
* Creates a DDPG-Agent
*
* Used to do reinforcement learning with an DDPG Agent
*
* @beta
* @alpha
*
* @constructs DDPG
*
@@ -47,39 +45,39 @@ function DDPG(numStates, numActions, options) {
let hiddenNeuronsActor = Utils.RL.getOption(options, 'hiddenNeuronsActor', [10]);
let hiddenNeuronsCritic = Utils.RL.getOption(options, 'hiddenNeuronsCritic', hiddenNeuronsActor);

this.actor = Utils.RL.getOption(options, 'actor', new architect.Perceptron(numStates, hiddenNeuronsActor, numActions));
this.critic = Utils.RL.getOption(options, 'critic', new architect.Perceptron(numStates + numActions, hiddenNeuronsCritic, numActions));
this.actor = Utils.RL.getOption(options, 'actor', new Network.architecture.Perceptron(numStates, ...hiddenNeuronsActor, numActions));
this.critic = Utils.RL.getOption(options, 'critic', new Network.architecture.Perceptron(numStates + numActions, ...hiddenNeuronsCritic, numActions));
this.actorTarget = Utils.RL.getOption(options, 'actorTarget', Network.fromJSON(this.actor.toJSON()));
this.criticTarget = Utils.RL.getOption(options, 'criticTarget', Network.fromJSON(this.critic.toJSON()));

// Experience ("Memory")
let experienceSize = Utils.RL.getOption(options, 'experienceSize', 50000);
let noisyPER = Utils.RL.getOption(options, 'noisyPER', null);
this.replayBuffer = Utils.RL.getOption(options, 'replayBuffer', noisyPER === null
? new ReplayBuffer(experienceSize)
? new ReplayBuffer(experienceSize, 0)
: new ReplayBuffer(experienceSize, noisyPER));
this.learningStepsPerIteration = Utils.RL.getOption(options, 'learningStepsPerIteration', 20);
this.learningStepsPerIteration = Utils.RL.getOption(options, 'learningStepsPerIteration', 100);
this.startLearningThreshold = Utils.RL.getOption(options, 'startLearningThreshold', 0);

// Training specific variables
this.isContinuousTask = Utils.RL.getOption(options, 'isContinuousTask', false);
this.gamma = Utils.RL.getOption(options, 'gamma', 0.7);
this.theta = Utils.RL.getOption(options, 'theta', 0.01); // soft target update
this.criticLoss = Utils.RL.getOption(options, 'criticLoss', Loss.MSE);
this.criticLossOptions = Utils.RL.getOption(options, 'criticLossOptions', {});
this.isTraining = Utils.RL.getOption(options, 'isTraining', true);
this.isUsingPER = Utils.RL.getOption(options, 'isUsingPER', true); // using prioritized experience replay

this.learningRateActor = Utils.RL.getOption(options, 'learningRateActor', 0.1); // AKA alpha value function learning rate
this.learningRateActorDecay = Utils.RL.getOption(options, 'learningRateActorDecay', 0.99); // AKA alpha value function learning rate
this.learningRateActorMin = Utils.RL.getOption(options, 'learningRateActorMin', 0.01); // AKA alpha value function learning rate
this.learningRateActorMin = Utils.RL.getOption(options, 'learningRateActorMin', 0.005); // AKA alpha value function learning rate

this.learningRateCritic = Utils.RL.getOption(options, 'learningRateCritic', this.learningRateActor); // AKA alpha value function learning rate
this.learningRateCriticDecay = Utils.RL.getOption(options, 'learningRateCriticDecay', this.learningRateActorDecay); // AKA alpha value function learning rate
this.learningRateCriticMin = Utils.RL.getOption(options, 'learningRateCriticMin', this.learningRateActorMin); // AKA alpha value function learning rate
this.learningRateCritic = Utils.RL.getOption(options, 'learningRateCritic', 0.1); // AKA alpha value function learning rate
this.learningRateCriticDecay = Utils.RL.getOption(options, 'learningRateCriticDecay', 0.99); // AKA alpha value function learning rate
this.learningRateCriticMin = Utils.RL.getOption(options, 'learningRateCriticMin', 0.05); // AKA alpha value function learning rate

// Exploration / Exploitation management
this.noiseStandardDeviation = Utils.RL.getOption(options, 'noiseStandardDeviation', 0.3); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviationDecay = Utils.RL.getOption(options, 'noiseStandardDeviationDecay', 0.9999); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviationMin = Utils.RL.getOption(options, 'noiseStandardDeviationMin', 0.01); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviation = Utils.RL.getOption(options, 'noiseStandardDeviation', 0.1); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviationDecay = Utils.RL.getOption(options, 'noiseStandardDeviationDecay', 0.99); // AKA epsilon for epsilon-greedy policy
this.noiseStandardDeviationMin = Utils.RL.getOption(options, 'noiseStandardDeviationMin', 0.05); // AKA epsilon for epsilon-greedy policy

this.timeStep = 0;
this.actions = [];
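The new exploration defaults above (`noiseStandardDeviation` 0.1, decay 0.99, minimum 0.05) drive the schedule used by `act()` further down: the Gaussian noise added to the actor's output shrinks exponentially with the time step but never falls below the minimum. A small sketch of that schedule, assuming `Rate.EXP(base, t, {gamma})` reduces to base * gamma^t:

```js
// Sketch of the exploration-noise schedule as act() appears to use it
// (assumption: Rate.EXP implements plain exponential decay).
function noiseAt(timeStep, { start = 0.1, decay = 0.99, min = 0.05 } = {}) {
  return Math.max(min, start * Math.pow(decay, timeStep));
}

console.log(noiseAt(0));   // 0.1, full exploration at the start
console.log(noiseAt(50));  // ~0.0605
console.log(noiseAt(300)); // 0.05, clamped at noiseStandardDeviationMin
```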
@@ -191,16 +189,29 @@ DDPG.prototype = {
* @memberof DDPG
*
* @param {number[]} state current state (float arr with values from [0,1])
* @return {int} The action which the DQN would take at this state; action ∈ [0, this.numActions-1]
* @param {number[]} prohibitedActions all prohibited actions at this state
* @return {int|number[]} The action which the DQN would take at this state; action ∈ [0, this.numActions-1] or the complete action array with the QValues for continuous tasks
*/
act: function(state) {
act: function(state, prohibitedActions) {
if (prohibitedActions === undefined) {
prohibitedActions = [];
}
let noiseFactor = Math.max(this.noiseStandardDeviationMin, Rate.EXP(this.noiseStandardDeviation, this.timeStep, {gamma: this.noiseStandardDeviationDecay}));
let action = Utils.addGaussianNoiseToNetwork(this.actor, noiseFactor).activate(state);

if (this.startLearningThreshold > this.timeStep) {
for (let i = 0; i < action.length; i++) {
action[i] = Math.random();
}
}

for (let i = 0; i < prohibitedActions.length; i++) {
action[prohibitedActions[i]] = -1;
}
this.actions = action;
this.lastState = this.state;
this.state = state;

return Utils.getMaxValueIndex(action);
return this.isContinuousTask ? action : Utils.getMaxValueIndex(action);
},

/**
@@ -214,27 +225,23 @@
* @returns {number} the loss value; loss ∈ [-1,1]
*/
learn: function(newReward, isFinalState = false) {
// Normalizing newReward:
// newReward ∈ [-1,1] --> normalizedReward ∈ [0,1]
const normalizedReward = (1 + newReward) / 2;

this.timeStep++;
if (this.timeStep === 1 || !this.isTraining) {
if (this.timeStep === 1 || !this.isTraining || this.startLearningThreshold > this.timeStep) {
this.lastReward = newReward;
return 1;
}
let experience = new Experience(this.lastState, this.actions, normalizedReward, this.state, 0, isFinalState);
let experience = new Experience(this.lastState, this.actions, this.lastReward, this.state, 0, isFinalState);
experience.loss = this.study(experience);
this.replayBuffer.add(experience);

let loss = this.study(experience);

let miniBatch = this.isUsingPER
? this.replayBuffer.getMiniBatchWithPER(this.learningStepsPerIteration)
: this.replayBuffer.getRandomMiniBatch(this.learningStepsPerIteration);

for (let i = 0; i < miniBatch.length; i++) {
this.study(miniBatch[i]);
}
return loss;
this.lastReward = newReward;
return experience.loss;
},

/**
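For orientation, this is roughly how the patched `act()`/`learn()` pair would be driven from an environment loop. Everything about the environment is made up for illustration; the only assumptions carried over from this diff are the `DDPG(numStates, numActions, options)` constructor, the method signatures shown above, and that the module exports the constructor:

```js
// Hypothetical training loop for the patched DDPG agent. `env` is a toy stand-in
// for a real task; rewards are assumed to live in [-1, 1] as learn() expects.
const DDPG = require('./src/architecture/rl/ddpg');

const env = {
  reset() { return [Math.random(), Math.random()]; },
  step(action) {
    return {
      nextState: [Math.random(), Math.random()],
      reward: action === 0 ? 0.1 : -0.1,
      done: Math.random() < 0.05,
    };
  },
};

const agent = new DDPG(2, 2, { startLearningThreshold: 50 });

let state = env.reset();
for (let step = 0; step < 1000; step++) {
  const action = agent.act(state); // action index, or a QValue array for continuous tasks
  const { nextState, reward, done } = env.step(action);
  agent.learn(reward, done);
  state = done ? env.reset() : nextState;
}
```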
@@ -247,49 +254,48 @@
* @returns {number} Actor loss value; loss ∈ [-1,1]
*/
study: function(experience) {
if (experience.state === undefined || experience.state === null ||
experience.action === undefined || experience.action === null) {
return 0;
}
let stateActionArr = experience.state.concat(experience.action);
let criticActivation = this.critic.activate(stateActionArr);
this.critic.activate(stateActionArr);
let actorActivation = this.actor.activate(experience.state);
let actorTargetActivation = this.actorTarget.activate(experience.nextState, {trace: false});

let nextQ = this.criticTarget.activate(experience.nextState.concat(this.actorTarget.activate(experience.nextState, {no_trace: true})), {no_trace: true});
let nextQ = this.criticTarget.activate(experience.nextState.concat(actorTargetActivation), {trace: false});
let qPrime = [];
for (let i = 0; i < nextQ.length; i++) {
qPrime.push(experience.isFinalState
? experience.reward
: experience.reward + this.gamma * nextQ[i]);
}

// Learning the actor and critic networks
let criticGradients = criticActivation;
for (let i = 0; i < criticActivation.length; i++) {
criticGradients[i] += this.criticLoss(qPrime[i], criticGradients[i], this.criticLossOptions);
}

let criticLearningRate = Math.max(this.learningRateCriticMin, Rate.EXP(this.learningRateCritic, this.timeStep, {gamma: this.learningRateCriticDecay}));
this.critic.propagate(criticLearningRate, 0, true, criticGradients);
this.critic.propagate(criticLearningRate, 0, true, qPrime);

let policyLoss = Utils.mean(this.critic.activate(experience.state.concat(actorActivation), {no_trace: true}));
actorActivation[Utils.getMaxValueIndex(experience.action)] -= policyLoss;
let policyLoss = -Utils.mean(this.critic.activate(experience.state.concat(actorActivation), {trace: false}));
actorActivation[Utils.getMaxValueIndex(experience.action)] *= policyLoss;

let actorLearningRate = Math.max(this.learningRateActorMin, Rate.EXP(this.learningRateActor, this.timeStep, {gamma: this.learningRateActorDecay}));
this.actor.propagate(actorLearningRate, 0, true, actorActivation);

// Learning the actorTarget and criticTarget networks
let actorParameters = this.actor.activate(experience.state, {no_trace: true});
let actorParameters = this.actor.activate(experience.state, {trace: false});
let criticParameters = this.critic.activate(stateActionArr, {trace: false});
let actorTargetParameters = this.actorTarget.activate(experience.state);
let criticParameters = this.critic.activate(stateActionArr, {no_trace: true});
let criticTargetParameters = this.criticTarget.activate(stateActionArr);
for (let i = 0; i < actorParameters.length; i++) {
actorTargetParameters[i] *= this.theta * actorParameters[i] + (1 - this.theta);

for (let i = 0; i < actorTargetParameters.length; i++) {
actorTargetParameters[i] = this.theta * actorParameters[i] + (1 - this.theta) * actorTargetParameters[i];
}
for (let i = 0; i < criticParameters.length; i++) {
criticTargetParameters[i] *= this.theta * criticParameters[i] + (1 - this.theta);
for (let i = 0; i < criticTargetParameters.length; i++) {
criticTargetParameters[i] = this.theta * criticParameters[i] + (1 - this.theta) * criticTargetParameters[i];
}

//Learning rate of 1 --> copy parameters
//Learning rate of 1 --> copy parameter
this.actorTarget.propagate(1, 0, true, actorTargetParameters);
this.criticTarget.propagate(1, 0, true, criticTargetParameters);

return policyLoss;
},
};
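In `study()`, the critic is trained toward a one-step temporal-difference target: the stored reward alone for terminal transitions, otherwise the reward plus `gamma` times the target critic's estimate for the next state. A worked sketch of that `qPrime` computation:

```js
// Sketch of the TD target built in study(); the numbers are made up for illustration.
function criticTarget(reward, nextQ, gamma, isFinalState) {
  return nextQ.map(q => (isFinalState ? reward : reward + gamma * q));
}

console.log(criticTarget(0.75, [0.4, 0.6], 0.7, false)); // roughly [ 1.03, 1.17 ]
console.log(criticTarget(0.75, [0.4, 0.6], 0.7, true));  // [ 0.75, 0.75 ]
```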
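The target networks now track the online networks with a proper soft (Polyak) update: each target value becomes `theta * online + (1 - theta) * target`, replacing the earlier `*=` form, and the blended values are pushed into the target networks with a learning rate of 1. A small sketch of that blend:

```js
// Sketch of the corrected soft target update; theta is this.theta (0.01 by default)
// and plain arrays stand in for the networks' parameters.
function softUpdate(targetParams, onlineParams, theta) {
  return targetParams.map((t, i) => theta * onlineParams[i] + (1 - theta) * t);
}

console.log(softUpdate([0.5, -0.2], [1.0, 0.0], 0.01)); // roughly [ 0.505, -0.198 ]
```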