Skip to content

Commit

Permalink
added FunctionGradient utility methods and changed approach to comput…
Browse files Browse the repository at this point in the history
…ing softmax gradient to a more numerically stable version
  • Loading branch information
jmacglashan committed Sep 20, 2016
1 parent 07ea68e commit cca5b76
Show file tree
Hide file tree
Showing 3 changed files with 179 additions and 34 deletions.
136 changes: 136 additions & 0 deletions src/main/java/burlap/behavior/functionapproximation/GradientUtils.java
@@ -0,0 +1,136 @@
package burlap.behavior.functionapproximation;

import burlap.datastructures.HashedAggregator;

import java.util.HashSet;
import java.util.Map;
import java.util.Set;

/**
* @author James MacGlashan
*/
public class GradientUtils {


/**
* Turns a {@link HashedAggregator} of index type Integer, and turns it into
* {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}, where
* the keys are parameter indices and the values their partial derivative.
* @param summedParams the {@link HashedAggregator} to transform
* @return a {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
*/
public static FunctionGradient toGradient(HashedAggregator<Integer> summedParams){
FunctionGradient fg = new FunctionGradient.SparseGradient(summedParams.size());
for(Map.Entry<Integer, Double> e : summedParams.entrySet()){
fg.put(e.getKey(), e.getValue());
}
return fg;
}


/**
* Multiplies every element in a {@link FunctionGradient} by scalar
* @param fg the {@link FunctionGradient}
* @param scalar the scalar value
*/
public static void scalarMult(FunctionGradient fg, double scalar){
for(FunctionGradient.PartialDerivative pd : fg.getNonZeroPartialDerivatives()){
double scaled = pd.value * scalar;
fg.put(pd.parameterId, scaled);
}
}


/**
* Creates a {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient} that is set
* to a gradient multiplied by a scMultiplies every element in a {@link FunctionGradient} by scalar
* @param fg the {@link FunctionGradient}
* @param scalar the scalar value
*/
public static FunctionGradient scalarMultCopy(FunctionGradient fg, double scalar){
FunctionGradient cfg = new FunctionGradient.SparseGradient(fg.numNonZeroPDs());
for(FunctionGradient.PartialDerivative pd : fg.getNonZeroPartialDerivatives()){
double scaled = pd.value * scalar;
cfg.put(pd.parameterId, scaled);
}
return cfg;
}


/**
* Adds the partial derivatives from a gradient into a {@link HashedAggregator}
* @param fg the source gradient
* @param sum the destination to which the partial derivatives are added
*/
public static void sumInto(FunctionGradient fg, HashedAggregator<Integer> sum){
for(FunctionGradient.PartialDerivative pd : fg.getNonZeroPartialDerivatives()){
sum.add(pd.parameterId, pd.value);
}
}

/**
* Returns a-b in a new {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
* @param a the first gradient
* @param b the second gradient
* @return a-b as a {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
*/
public static FunctionGradient diffGrad(FunctionGradient a, FunctionGradient b){
Set<Integer> pIds = pdIdSet(a, b);

//now compute
FunctionGradient fg = new FunctionGradient.SparseGradient(pIds.size());
for(int pid : pIds){
double v = a.getPartialDerivative(pid) - b.getPartialDerivative(pid);
fg.put(pid, v);
}

return fg;

}


/**
* return a+b in a new {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
* @param a the first gradient
* @param b the second gradient
* @return a new {@link burlap.behavior.functionapproximation.FunctionGradient.SparseGradient}
*/
public static FunctionGradient addGrad(FunctionGradient a, FunctionGradient b){

Set<Integer> pIds = pdIdSet(a, b);

//now compute
FunctionGradient fg = new FunctionGradient.SparseGradient(pIds.size());
for(int pid : pIds){
double v = a.getPartialDerivative(pid) + b.getPartialDerivative(pid);
fg.put(pid, v);
}

return fg;

}


/**
* Returns the set of parameter ids with non-zero partial derivatives across two gradients.
*
* That is, nonZero(a) U nonZero(b)
* @param a the first gradient
* @param b the second gradient
* @return a set of the partial derivative ids
*/
public static Set<Integer> pdIdSet(FunctionGradient a, FunctionGradient b){
Set<FunctionGradient.PartialDerivative> aSet = a.getNonZeroPartialDerivatives();
Set<FunctionGradient.PartialDerivative> bSet = b.getNonZeroPartialDerivatives();
Set<Integer> pIds = new HashSet<Integer>(aSet.size()+bSet.size());
for(FunctionGradient.PartialDerivative pd : aSet){
pIds.add(pd.parameterId);
}
for(FunctionGradient.PartialDerivative pd : bSet){
pIds.add(pd.parameterId);
}

return pIds;
}

}
Expand Up @@ -28,7 +28,7 @@ public FunctionGradient gradient(double [] qs, FunctionGradient[] qGradients) {
for(int i = 0; i < qs.length; i++){

double probA = Math.exp(this.beta * qs[i] - logSum);
FunctionGradient policyGradient = BoltzmannPolicyGradient.computePolicyGradient(this.beta, qs, maxBetaScaled, logSum, qGradients, i);
FunctionGradient policyGradient = BoltzmannPolicyGradient.computePolicyGradient(qs, qGradients, i, this.beta);

for(FunctionGradient.PartialDerivative pd : policyGradient.getNonZeroPartialDerivatives()){
double curVal = vGradient.getPartialDerivative(pd.parameterId);
Expand Down
@@ -1,8 +1,11 @@
package burlap.behavior.singleagent.learnfromdemo.mlirl.support;

import burlap.behavior.functionapproximation.FunctionGradient;
import burlap.behavior.functionapproximation.GradientUtils;
import burlap.behavior.valuefunction.QProvider;
import burlap.behavior.valuefunction.QValue;
import burlap.datastructures.BoltzmannDistribution;
import burlap.datastructures.HashedAggregator;
import burlap.mdp.core.action.Action;
import burlap.mdp.core.state.State;

Expand Down Expand Up @@ -60,49 +63,55 @@ public static FunctionGradient computeBoltzmannPolicyGradient(State s, Action a,
}


double maxBetaScaled = maxBetaScaled(qs, beta);
double logSum = logSum(qs, maxBetaScaled, beta);

FunctionGradient policyGradient = computePolicyGradient(beta, qs, maxBetaScaled, logSum, qGradients, aind);
FunctionGradient policyGradient = computePolicyGradient(qs, qGradients, aind, beta);

return policyGradient;

}

/**
* Computes the gradient of a Boltzmann policy using values derived from a Differentiable Boltzmann backup valueFunction.
* @param beta the Boltzmann beta parameter. This parameter is the inverse of the Boltzmann temperature. As beta becomes larger, the policy becomes more deterministic. Should lie in [0, +infinity].
* @param qs an array holding the Q-value for each action.
* @param maxBetaScaled the maximum Q-value after being scaled by the parameter beta
* @param logSum the log sum of the exponentiated q values
* @param gqs a matrix holding the Q-value gradient for each action. The matrix's major order is the action index, followed by the parameter gradient
* @param aInd the index of the query action for which the policy's gradient is being computed
* @return the gradient of the policy.
*/
public static FunctionGradient computePolicyGradient(double beta, double [] qs, double maxBetaScaled, double logSum, FunctionGradient [] gqs, int aInd){
/**
* Computes the gradient of the Boltzmann (softmax) policy wrt some parameters.
* @param prefs the action-wise preference values passed through the softmax
* @param grads the gradients of the preference-values with respect to the parameters
* @param aind the index of the action for which the gradient is being queried
* @param beta the softmax beta parameter. This parameter is the inverse of the Boltzmann temperature. As beta becomes larger, the policy becomes more deterministic. Should lie in [0, +infinity].
* @return the gradient of the policy
*/
public static FunctionGradient computePolicyGradient(double [] prefs, FunctionGradient[] grads, int aind, double beta){

FunctionGradient pg = new FunctionGradient.SparseGradient();
double constantPart = beta * Math.exp(beta*qs[aInd] + maxBetaScaled - logSum - logSum);
Set<Integer> nzPDs = combinedNonZeroPDParameters(gqs);
for(int i = 0; i < qs.length; i++){
for(int param : nzPDs){
double curVal = pg.getPartialDerivative(param);
double nextVal = curVal + (gqs[aInd].getPartialDerivative(param) - gqs[i].getPartialDerivative(param))
* Math.exp(beta * qs[i] - maxBetaScaled);
//first compute policy probs
BoltzmannDistribution bd = new BoltzmannDistribution(prefs, 1./beta);
double [] probs = bd.getProbabilities();

pg.put(param, nextVal);
}
}
return computePolicyGradient(probs, prefs, grads, aind, beta);

}

public static FunctionGradient computePolicyGradient(double [] probs, double [] prefs, FunctionGradient[] grads, int aind, double beta){

HashedAggregator<Integer> sums = new HashedAggregator<Integer>();

//now get component for on action gradient
FunctionGradient aterm = GradientUtils.scalarMultCopy(grads[aind], beta * (1. - probs[aind]));
GradientUtils.sumInto(aterm, sums);

//now sum over off action gradients
for(int i = 0; i < prefs.length; i++){
if(i == aind) continue;

FunctionGradient offTerm = GradientUtils.scalarMultCopy(grads[i], -beta * probs[i]);
GradientUtils.sumInto(offTerm, sums);
}

FunctionGradient unnormalized = GradientUtils.toGradient(sums);
FunctionGradient grad = GradientUtils.scalarMultCopy(unnormalized, probs[aind]);

return grad;

}

FunctionGradient finalGradient = new FunctionGradient.SparseGradient(pg.numNonZeroPDs());
for(FunctionGradient.PartialDerivative pd : pg.getNonZeroPartialDerivatives()){
double nextVal = pd.value * constantPart;
finalGradient.put(pd.parameterId, nextVal);
}

return finalGradient;

}


/**
Expand Down

0 comments on commit cca5b76

Please sign in to comment.