Skip to content

Commit

Permalink
added FunctionGradient utility methods and changed approach to comput…
Browse files Browse the repository at this point in the history
…ing softmax gradient to a more numerically stable version
  • Loading branch information
jmacglashan committed Sep 20, 2016
1 parent 07ea68e commit cca5b76
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 34 deletions.
136 changes: 136 additions & 0 deletions src/main/java/burlap/behavior/functionapproximation/GradientUtils.java
@@ -0,0 +1,136 @@
package burlap.behavior.functionapproximation;

import burlap.datastructures.HashedAggregator;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
* @author James MacGlashan
*/
public class GradientUtils {


/**
* Turns a {@link HashedAggregator} of index type Integer, and turns it into
* {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}, where
* the keys are parameter indices and the values their partial derivative.
* @param summedParams the {@link HashedAggregator} to transform
* @return a {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
*/
public static FunctionGradient toGradient(HashedAggregator<Integer> summedParams){
FunctionGradient fg = new FunctionGradient.SparseGradient(summedParams.size());
for(Map.Entry<Integer, Double> e : summedParams.entrySet()){
fg.put(e.getKey(), e.getValue());
}
return fg;
}


/**
* Multiplies every element in a {@link FunctionGradient} by scalar
* @param fg the {@link FunctionGradient}
* @param scalar the scalar value
*/
public static void scalarMult(FunctionGradient fg, double scalar){
for(FunctionGradient.PartialDerivative pd : fg.getNonZeroPartialDerivatives()){
double scaled = pd.value * scalar;
fg.put(pd.parameterId, scaled);
}
}


/**
* Creates a {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient} that is set
* to a gradient multiplied by a scMultiplies every element in a {@link FunctionGradient} by scalar
* @param fg the {@link FunctionGradient}
* @param scalar the scalar value
*/
public static FunctionGradient scalarMultCopy(FunctionGradient fg, double scalar){
FunctionGradient cfg = new FunctionGradient.SparseGradient(fg.numNonZeroPDs());
for(FunctionGradient.PartialDerivative pd : fg.getNonZeroPartialDerivatives()){
double scaled = pd.value * scalar;
cfg.put(pd.parameterId, scaled);
}
return cfg;
}


/**
* Adds the partial derivatives from a gradient into a {@link HashedAggregator}
* @param fg the source gradient
* @param sum the destination to which the partial derivatives are added
*/
public static void sumInto(FunctionGradient fg, HashedAggregator<Integer> sum){
for(FunctionGradient.PartialDerivative pd : fg.getNonZeroPartialDerivatives()){
sum.add(pd.parameterId, pd.value);
}
}

/**
* Returns a-b in a new {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
* @param a the first gradient
* @param b the second gradient
* @return a-b as a {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
*/
public static FunctionGradient diffGrad(FunctionGradient a, FunctionGradient b){
Set<Integer> pIds = pdIdSet(a, b);

//now compute
FunctionGradient fg = new FunctionGradient.SparseGradient(pIds.size());
for(int pid : pIds){
double v = a.getPartialDerivative(pid) - b.getPartialDerivative(pid);
fg.put(pid, v);
}

return fg;

}


/**
* return a+b in a new {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
* @param a the first gradient
* @param b the second gradient
* @return a new {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
*/
public static FunctionGradient addGrad(FunctionGradient a, FunctionGradient b){

Set<Integer> pIds = pdIdSet(a, b);

//now compute
FunctionGradient fg = new FunctionGradient.SparseGradient(pIds.size());
for(int pid : pIds){
double v = a.getPartialDerivative(pid) + b.getPartialDerivative(pid);
fg.put(pid, v);
}

return fg;

}


/**
* Returns the set of parameter ids with non-zero partial derivatives across two gradients.
*
* That is, nonZero(a) U nonZero(b)
* @param a the first gradient
* @param b the second gradient
* @return a set of the partial derivative ids
*/
public static Set<Integer> pdIdSet(FunctionGradient a, FunctionGradient b){
Set<FunctionGradient.PartialDerivative> aSet = a.getNonZeroPartialDerivatives();
Set<FunctionGradient.PartialDerivative> bSet = b.getNonZeroPartialDerivatives();
Set<Integer> pIds = new HashSet<Integer>(aSet.size()+bSet.size());
for(FunctionGradient.PartialDerivative pd : aSet){
pIds.add(pd.parameterId);
}
for(FunctionGradient.PartialDerivative pd : bSet){
pIds.add(pd.parameterId);
}

return pIds;
}

}
Expand Up @@ -28,7 +28,7 @@ public FunctionGradient gradient(double [] qs, FunctionGradient[] qGradients) {
for(int i = 0; i < qs.length; i++){

double probA = Math.exp(this.beta * qs[i] - logSum);
FunctionGradient policyGradient = BoltzmannPolicyGradient.computePolicyGradient(this.beta, qs, maxBetaScaled, logSum, qGradients, i);
FunctionGradient policyGradient = BoltzmannPolicyGradient.computePolicyGradient(qs, qGradients, i, this.beta);

for(FunctionGradient.PartialDerivative pd : policyGradient.getNonZeroPartialDerivatives()){
double curVal = vGradient.getPartialDerivative(pd.parameterId);
Expand Down
@@ -1,8 +1,11 @@
package burlap.behavior.singleagent.learnfromdemo.mlirl.support;

import burlap.behavior.functionapproximation.FunctionGradient;
import burlap.behavior.functionapproximation.GradientUtils;
import burlap.behavior.valuefunction.QProvider;
import burlap.behavior.valuefunction.QValue;
import burlap.datastructures.BoltzmannDistribution;
import burlap.datastructures.HashedAggregator;
import burlap.mdp.core.action.Action;
import burlap.mdp.core.state.State;

Expand Down Expand Up @@ -60,49 +63,55 @@ public static FunctionGradient computeBoltzmannPolicyGradient(State s, Action a,
}


double maxBetaScaled = maxBetaScaled(qs, beta);
double logSum = logSum(qs, maxBetaScaled, beta);

FunctionGradient policyGradient = computePolicyGradient(beta, qs, maxBetaScaled, logSum, qGradients, aind);
FunctionGradient policyGradient = computePolicyGradient(qs, qGradients, aind, beta);

return policyGradient;

}

/**
* Computes the gradient of a Boltzmann policy using values derived from a Differentiable Boltzmann backup valueFunction.
* @param beta the Boltzmann beta parameter. This parameter is the inverse of the Boltzmann temperature. As beta becomes larger, the policy becomes more deterministic. Should lie in [0, +infinity].
* @param qs an array holding the Q-value for each action.
* @param maxBetaScaled the maximum Q-value after being scaled by the parameter beta
* @param logSum the log sum of the exponentiated q values
* @param gqs a matrix holding the Q-value gradient for each action. The matrix's major order is the action index, followed by the parameter gradient
* @param aInd the index of the query action for which the policy's gradient is being computed
* @return the gradient of the policy.
*/
public static FunctionGradient computePolicyGradient(double beta, double [] qs, double maxBetaScaled, double logSum, FunctionGradient [] gqs, int aInd){
/**
* Computes the gradient of the Boltzmann (softmax) policy wrt some parameters.
* @param prefs the action-wise preference values passed through the softmax
* @param grads the gradients of the preference-values with respect to the parameters
* @param aind the index of the action for which the gradient is being queried
* @param beta the softmax beta parameter. This parameter is the inverse of the Boltzmann temperature. As beta becomes larger, the policy becomes more deterministic. Should lie in [0, +infinity].
* @return the gradient of the policy
*/
public static FunctionGradient computePolicyGradient(double [] prefs, FunctionGradient[] grads, int aind, double beta){

FunctionGradient pg = new FunctionGradient.SparseGradient();
double constantPart = beta * Math.exp(beta*qs[aInd] + maxBetaScaled - logSum - logSum);
Set<Integer> nzPDs = combinedNonZeroPDParameters(gqs);
for(int i = 0; i < qs.length; i++){
for(int param : nzPDs){
double curVal = pg.getPartialDerivative(param);
double nextVal = curVal + (gqs[aInd].getPartialDerivative(param) - gqs[i].getPartialDerivative(param))
* Math.exp(beta * qs[i] - maxBetaScaled);
//first compute policy probs
BoltzmannDistribution bd = new BoltzmannDistribution(prefs, 1./beta);
double [] probs = bd.getProbabilities();

pg.put(param, nextVal);
}
}
return computePolicyGradient(probs, prefs, grads, aind, beta);

}

public static FunctionGradient computePolicyGradient(double [] probs, double [] prefs, FunctionGradient[] grads, int aind, double beta){

HashedAggregator<Integer> sums = new HashedAggregator<Integer>();

//now get component for on action gradient
FunctionGradient aterm = GradientUtils.scalarMultCopy(grads[aind], beta * (1. - probs[aind]));
GradientUtils.sumInto(aterm, sums);

//now sum over off action gradients
for(int i = 0; i < prefs.length; i++){
if(i == aind) continue;

FunctionGradient offTerm = GradientUtils.scalarMultCopy(grads[i], -beta * probs[i]);
GradientUtils.sumInto(offTerm, sums);
}

FunctionGradient unnormalized = GradientUtils.toGradient(sums);
FunctionGradient grad = GradientUtils.scalarMultCopy(unnormalized, probs[aind]);

return grad;

}

FunctionGradient finalGradient = new FunctionGradient.SparseGradient(pg.numNonZeroPDs());
for(FunctionGradient.PartialDerivative pd : pg.getNonZeroPartialDerivatives()){
double nextVal = pd.value * constantPart;
finalGradient.put(pd.parameterId, nextVal);
}

return finalGradient;

}


/**
Expand Down

0 comments on commit cca5b76

Please sign in to comment.