
Commit

Merge e842c94 into 2d4a793
raimannma committed Nov 6, 2019
2 parents 2d4a793 + e842c94 commit 37a4aed
Showing 7 changed files with 233 additions and 180 deletions.
179 changes: 85 additions & 94 deletions src/architecture/rl/dqn.js
@@ -1,74 +1,74 @@
const architect = require('../architect');
const Network = require('../network');
const Window = require("../../util/window");
const ReplayBuffer = require('./replay-buffer');
const Experience = require('./experience');
const Rate = require("../../methods/rate");


/**
* Gets the value of fieldName from opt if present, otherwise returns defaultValue
* @param {Object} opt
* @param {String} fieldName
* @param {number|boolean} defaultValue
* @return {Number | number[]} the value of the fieldName if present, otherwise the defaultValue
* @param {number | boolean | number[]} defaultValue
* @return {Number | number[] | boolean} the value of the fieldName if present, otherwise the defaultValue
*/
function getopt(opt, fieldName, defaultValue) {
function getOption(opt, fieldName, defaultValue) {
if (typeof opt === 'undefined') {
return defaultValue;
}
return (typeof opt[fieldName] !== 'undefined') ? opt[fieldName] : defaultValue;
}
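
A quick illustration of the fallback behaviour (values here are made up):

getOption({learningRate: 0.2}, 'learningRate', 0.1); // -> 0.2, field is present
getOption({}, 'learningRate', 0.1);                  // -> 0.1, falls back to the default
getOption(undefined, 'hiddenNeurons', [10]);         // -> [10], no options object passed at all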

/**
* Creates a DQN network
*
* Used to do reinforcement learning
*
* @alpha
*
* @constructs DQN
*
* @param {int} numActions Maximum number of actions the agent can do,
* @param {int} numStates Length of the state array
* @param {Object} options Options object
*
* @todo Allow underlying Network to have arbitrary layer structure
* @todo Add test & custom network input / output size validation
* Creates a DQN network
*
* Used to do reinforcement learning
*
* @alpha
*
* @constructs DQN
*
* @param {int} numStates Length of the state array
* @param {int} numActions Maximum number of actions the agent can take
* @param {Object} options Options object
*
* @todo Allow underlying Network to have arbitrary layer structure
* @todo Add test & custom network input / output size validation
* @todo Maybe automatically suggest default values for the num of states and actions
*/
function DQN(numActions, numStates, opt) {
function DQN(numStates, numActions, options) {
// Network Sizing
this.numStates = numStates;
this.numActions = numActions;
this.hiddenNeurons = getopt(opt, 'hiddenNeurons', [10]);
this.network = getopt(opt, 'network', new architect.Perceptron(numStates, ...this.hiddenNeurons, numActions));
this.hiddenNeurons = getOption(options, 'hiddenNeurons', [10]);
this.network = getOption(options, 'network', new architect.Perceptron(numStates, ...this.hiddenNeurons, numActions));

// Network & state memory
this.reward = null;
this.state = null;
this.nextState = null;
this.action = null;
this.nextAction = null;

// Learning and update
this.learningRate = getopt(opt, 'learningRate', 0.1); // AKA alpha value function learning rate
this.learningRateDecay = getopt(opt, 'learningRateDecay', 0.99); // AKA alpha value function learning rate
this.learningRateMin = getopt(opt, 'learningRateMin', 0.01); // AKA alpha value function learning rate
this.learningRate = getOption(options, 'learningRate', 0.1); // AKA alpha value function learning rate
this.learningRateDecay = getOption(options, 'learningRateDecay', 0.99); // AKA alpha value function learning rate
this.learningRateMin = getOption(options, 'learningRateMin', 0.01); // AKA alpha value function learning rate
this.loss = 0;
this.tderrorClamp = getopt(opt, 'tderrorClamp', 1);
this.isTraining = getopt(opt, 'isTraining', true);
this.tderrorClamp = getOption(options, 'tderrorClamp', 1);
this.isTraining = getOption(options, 'isTraining', true);

// Experience Replay
let experienceSize = getopt(opt, 'experience_size', 50000); // size of experience replay
this.experience = new Window(experienceSize, true); // experience
this.learningStepsPerIteration = getopt(opt, 'learning_steps_per_iteration', 20); // number of time steps before we add another experience to replay memory
let experienceSize = getOption(options, 'experience_size', 50000); // size of experience replay
this.experience = new ReplayBuffer(experienceSize); // experience
this.learningStepsPerIteration = getOption(options, 'learning_steps_per_iteration', 20); // number of experiences replayed from the buffer on each learn() call
this.timeStep = 0;

// Exploration / Exploitation management
this.explore = getopt(opt, 'explore', 0.05); // AKA epsilon for epsilon-greedy policy
this.exploreDecay = getopt(opt, 'exploreDecay', 0.99); // AKA epsilon for epsilon-greedy policy
this.exploreMin = getopt(opt, 'exploreMin', 0); // AKA epsilon for epsilon-greedy policy
this.explore = getOption(options, 'explore', 0.3); // AKA epsilon for epsilon-greedy policy
this.exploreDecay = getOption(options, 'exploreDecay', 0.9999); // AKA epsilon for epsilon-greedy policy
this.exploreMin = getOption(options, 'exploreMin', 0.01); // AKA epsilon for epsilon-greedy policy

// Reward calculation
this.gamma = getopt(opt, 'gamma', 0.1); // future reward discount factor
this.gamma = getOption(options, 'gamma', 0.7); // future reward discount factor
}
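
For orientation, a minimal usage sketch of the constructor and training loop as merged here; the require path, the environment object env and its reset/step/isDone methods are assumptions for illustration, not part of this commit:

const DQN = require('./src/architecture/rl/dqn');

// Hypothetical task with 4 state variables and 2 possible actions
const agent = new DQN(4, 2, {
  hiddenNeurons: [12],
  explore: 0.3,         // initial epsilon for the epsilon-greedy policy
  exploreDecay: 0.9999, // epsilon decays towards exploreMin as timeStep grows
  exploreMin: 0.01,
  gamma: 0.7            // future reward discount factor
});

let state = env.reset();
while (!env.isDone()) {
  const action = agent.act(state);              // explore or exploit
  const {nextState, reward} = env.step(action);
  agent.learn(reward, env.isDone());            // stores the experience and replays a mini batch
  state = nextState;
}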

DQN.prototype = {
@@ -78,7 +78,7 @@ DQN.prototype = {
* @function toJSON
* @memberof DQN
*
* @return {JSON} JSON String which represents the current DQN agent
* @return {{net:{input:{number},output:{number},dropout:{number},nodes:Array<object>,connections:Array<object>},gamma:{number},explore:{number},exploreDecay:{number},exploreMin:{number},learningRate:{number},learningRateDecay:{number},learningRateMin:{number},isTraining:{boolean},experience:{ReplayBuffer}}} json JSON object which represents this DQN agent
*/
toJSON: function () {
let json = {};
@@ -109,16 +109,16 @@ DQN.prototype = {
* @memberof DQN
*
* @param {number[]} state current state (float arr with values between 0 and 1)
* @returns {number} The action which the DQN would take at this state (represented by an index)
* @return {number} The action which the DQN would take at this state (represented by an index)
*
* @todo Add ability to select strategies
* @todo Add Thompson Sampling strategy
*/
act: function (state) {
// epsilon greedy strategy | explore > random = explore; else exploit
// epsilon-greedy strategy | explore > random ? explore : exploit
const action = (Math.max(this.exploreMin, Rate.EXP(this.explore, this.timeStep, {gamma: this.exploreDecay})) > Math.random())
? Math.floor(Math.random() * this.numActions) // random "explore" action
: this.getMaxValueIndex(this.network.activate(state)) // deliberate "exploit" action
: DQN.getMaxValueIndex(this.network.activate(state)); // deliberate "exploit" action

// shift state memory
this.state = this.nextState;
@@ -136,22 +136,27 @@
* @memberof DQN
*
* @param {number} newReward the current reward, the agent receives from the environment
* @param {boolean} isFinalState Does the game end at this state?
* @returns {number} the loss value
*
* @todo Add prioritized experience replay
* @todo Add hindsight experience replay
*/
learn: function (newReward) {
learn: function(newReward, isFinalState = false) {
// Update Q function | temporal difference method currently hardcoded
if (this.reward != null && this.isTraining) {
let experience = new Experience(this.state, this.action, this.reward, this.nextState, isFinalState);
// Learn from the current estimated reward to understand how wrong the agent is
this.loss = this.study(this.state, this.action, this.reward, this.nextState);
let loss = this.study(experience);
experience.loss = loss;
this.loss = loss;

// Too random, should pick experiences by their loss value
this.experience.add([this.state, this.action, this.reward, this.nextState, this.loss]);
this.experience.add(experience);

for (let i = 0; i < this.learningStepsPerIteration; i++) {
this.study(...this.experience.pickRandom());
let miniBatch = this.experience.getRandomMiniBatch(this.learningStepsPerIteration);
for (let i = 0; i < miniBatch.length; i++) {
this.study(miniBatch[i]);
}
}
this.timeStep++;
@@ -165,74 +165,39 @@
* @function study
* @memberof DQN
*
* @param {number[]} state current state
* @param {number} action action taken in current state
* @param {number} reward reward received for the action in the current state
* @param {number[]} nextState the state which follows the current state with the action taken
* @param {Experience} experience the experience to learn from
* @returns {number} TDError Roughly, an experiential measure of surprise / insight for the network at this state-action.
*
* @todo Add dynamic loss functions & clamps, including Huber Loss
* @todo Add target network to increase reliability
* @todo Consider not using a target network: https://www.ijcai.org/proceedings/2019/0379.pdf
*/
study: function (state, action, reward, nextState) {
study: function(experience) {
// Compute target Q value, called without traces so it won't affect backprop
const nextActions = this.network.activate(nextState, {no_trace: true});
const nextActions = this.network.activate(experience.nextState, {no_trace: true});

// Q(s,a) = r + gamma * max_a' Q(s',a')
const targetQValue = (1 + reward) / 2 + this.gamma * nextActions[this.getMaxValueIndex(nextActions)];
let normalizedReward = (1 + experience.reward) / 2;
let targetQValue;
targetQValue = experience.isFinalState
? normalizedReward // For the final state only the current reward is important
: normalizedReward + this.gamma * nextActions[DQN.getMaxValueIndex(nextActions)];

// Predicted current reward | called with traces for backprop later
const predictedReward = this.network.activate(state);
const predictedReward = this.network.activate(experience.state);

let tdError = predictedReward[action] - targetQValue;
let tdError = predictedReward[experience.action] - targetQValue;

// Clamp error for robustness | ToDo: huber loss
// Clamp error for robustness
if (Math.abs(tdError) > this.tderrorClamp) {
tdError = tdError > this.tderrorClamp ? this.tderrorClamp : -this.tderrorClamp;
}

// Backpropagation using temporal difference error
predictedReward[action] = targetQValue;
predictedReward[experience.action] = targetQValue;
this.network.propagate(Math.max(this.learningRateMin, Rate.EXP(this.learningRate, this.timeStep, {gamma: this.learningRateDecay})), 0, true, predictedReward);
return tdError;
},
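
As a sanity check of the target computation above, a hand-worked example with made-up numbers (gamma = 0.7, tderrorClamp = 1):

// Suppose experience = {action: 1, reward: 0.5, isFinalState: false},
// network.activate(experience.nextState) returns [0.2, 0.6]
// and network.activate(experience.state) returns [0.3, 0.9].
//
// normalizedReward = (1 + 0.5) / 2              = 0.75
// targetQValue     = 0.75 + 0.7 * max(0.2, 0.6) = 1.17
// tdError          = predictedReward[1] - 1.17  = 0.9 - 1.17 = -0.27   (within the clamp)
//
// For a final state the future term is dropped, so targetQValue would be just 0.75.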

/**
* This method returns the index of the element with the highest value
*
* @function getMaxValueIndex
* @memberof DQN
*
* @param {number[]} arr the input array
* @returns {number} the index of the highest value
*
* @todo Create unit test
*/
getMaxValueIndex: function (arr) {
let index = 0;
let maxValue = arr[0];
for (let i = 1; i < arr.length; i++) {
if (arr[i] > maxValue) {
maxValue = arr[i];
index = i;
}
}
return index;
},

/**
* Setter for variable "isTraining"
*
* @function setTraining
* @memberof DQN
*
* @param val new value
* @todo Consider removing
*/
setTraining: function (val) {
this.isTraining = val;
}
};

/**
@@ -241,12 +211,12 @@ DQN.prototype = {
* @function fromJSON
* @memberof DQN
*
* @param {JSON} json JSON String
* @param {{net:{input:{number},output:{number},dropout:{number},nodes:Array<object>,connections:Array<object>},gamma:{number},explore:{number},exploreDecay:{number},exploreMin:{number},learningRate:{number},learningRateDecay:{number},learningRateMin:{number},isTraining:{boolean},experience:{ReplayBuffer}}} json JSON object
* @return {DQN} Agent with the specs from the json
*/
DQN.fromJSON = function (json) {
let network = Network.fromJSON(json);
let agent = new DQN(network.input_size, network.output_size, {});
let network = Network.fromJSON(json.net);
let agent = new DQN(network.input_size, network.output_size, {network: network});

agent.gamma = json.gamma;
agent.explore = json.explore;
@@ -261,4 +231,25 @@ DQN.fromJSON = function (json) {
return agent;
};
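
For context, a save/load round trip with the two functions above might look like this; it is only a sketch and assumes toJSON returns the object shape documented earlier in this diff:

const json = agent.toJSON();          // plain object with net, gamma, explore, learningRate, experience, ...
const restored = DQN.fromJSON(json);  // rebuilds the underlying Network and copies the hyperparameters back
restored.isTraining = false;          // e.g. run the restored agent purely for inference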

/**
* This method returns the index of the element with the highest value
*
* @function getMaxValueIndex
* @memberof DQN
*
* @param {number[]} arr the input array
* @returns {number} the index of the highest value
*/
DQN.getMaxValueIndex = function(arr) {
let index = 0;
let maxValue = arr[0];
for (let i = 1; i < arr.length; i++) {
if (arr[i] > maxValue) {
maxValue = arr[i];
index = i;
}
}
return index;
};
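
Two quick examples of the argmax behaviour:

DQN.getMaxValueIndex([0.1, 0.8, 0.3]); // -> 1, index of the largest Q value
DQN.getMaxValueIndex([0.5, 0.5]);      // -> 0, ties resolve to the first maximum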

module.exports = DQN;
20 changes: 20 additions & 0 deletions src/architecture/rl/experience.js
@@ -0,0 +1,20 @@
/**
* Creates an experience object
*
* @param state the current state
* @param action the current action
* @param reward the reward for the current action in the current state
* @param nextState the state that follows from taking the current action in the current state
* @param isFinalState Does the game end at this state?
* @constructor
*/
function Experience(state, action, reward, nextState, isFinalState) {
this.state = state;
this.action = action;
this.reward = reward;
this.nextState = nextState;
this.isFinalState = isFinalState;
this.loss = 0;
}

module.exports = Experience;
52 changes: 52 additions & 0 deletions src/architecture/rl/replay-buffer.js
@@ -0,0 +1,52 @@
const Experience = require('./experience');

/**
* Creates a replay buffer with a maximum size of experience entries.
*
* @param maxSize maximum number of experiences
* @constructor
*/
function ReplayBuffer(maxSize) {
this.buffer = [];
this.maxSize = maxSize;
}

ReplayBuffer.prototype = {
/**
* Adds an experience entry to the buffer.
*
* @param {Experience} experience the experience to add
*/
add: function(experience) {
if (this.buffer.length >= this.maxSize) {
this.buffer.shift();
}
this.buffer.push(experience);
},

/**
* Get a random mini batch of given size.
*
* @param {number} size the size of the minibatch.
*
* @returns {Experience[]} a batch of Experiences to train from.
*/
getRandomMiniBatch: function(size) {
// Size can't be bigger than this.buffer.length
size = Math.min(size, this.buffer.length);
if (size === this.buffer.length) {
return this.buffer;
}

let bufferCopy = [...this.buffer];
let batch = [];

for (let i = 0; i < size; i++) {
// Add a random experience to the batch and remove it from bufferCopy
batch.push(bufferCopy.splice(Math.floor(Math.random() * bufferCopy.length), 1)[0]);
}
return batch;
},
};

module.exports = ReplayBuffer;
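
To tie the two new files together, a small sketch of how the buffer is exercised (the require paths and numbers are illustrative):

const ReplayBuffer = require('./src/architecture/rl/replay-buffer');
const Experience = require('./src/architecture/rl/experience');

const buffer = new ReplayBuffer(50000);

// Experience(state, action, reward, nextState, isFinalState)
buffer.add(new Experience([0.1, 0.4], 1, 0.5, [0.2, 0.3], false));
buffer.add(new Experience([0.2, 0.3], 0, -1, [0.0, 0.0], true));

// Sampling is without replacement; the requested size is capped at the buffer
// length, so this call simply returns both stored experiences.
const miniBatch = buffer.getRandomMiniBatch(20);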
