Close #1: Fix off-by-one mistake and add two more options to gradient bandit: grad_bandit_init_random and grad_bandit_reward_power.
instance01 committed Aug 11, 2020
1 parent c3416e3 commit 3385916
Showing 3 changed files with 40 additions and 15 deletions.
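
For orientation before the diff: the commit raises each environment reward to the power grad_bandit_reward_power (the line reward = std::pow(reward, reward_power) appears in gradient_bandit.cpp below). A minimal standalone sketch of the effect, not part of the commit and with illustrative values only — the default of 1 leaves rewards unchanged, while higher powers shrink rewards in (0, 1) and so sharpen the contrast between good and mediocre returns:

#include <cmath>
#include <cstdio>

int main() {
    int reward_power = 2;   // illustrative; the commit's default is 1 (identity)
    double reward = 0.5;
    reward = std::pow(reward, reward_power);  // 0.5^2 = 0.25
    std::printf("%f\n", reward);              // prints 0.250000
    return 0;
}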
18 changes: 15 additions & 3 deletions src/cfg.cpp
@@ -51,7 +51,11 @@ json get_default(std::string base) {
     {"n_desired_eval_len", 10},
     {"bandit_type", "mcts"}, // mcts, grad
     {"grad_bandit_alpha", 0.01},
-    {"use_eps_greedy_learning", false},
+    {"use_eps_greedy_learning", true},
+    {"eps_greedy_epsilon_decay_factor_train", 0.995},
+    {"eps_greedy_epsilon_decay_factor_actor", 0.995},
+    {"grad_bandit_init_random", true},
+    {"grad_bandit_reward_power", 1},

     // Other
     {"reward_exponent", 1},
@@ -101,7 +105,11 @@ json get_default(std::string base) {
     {"n_desired_eval_len", 10},
     {"bandit_type", "mcts"}, // mcts, grad
     {"grad_bandit_alpha", 0.01},
-    {"use_eps_greedy_learning", false},
+    {"use_eps_greedy_learning", true},
+    {"eps_greedy_epsilon_decay_factor_train", 0.995},
+    {"eps_greedy_epsilon_decay_factor_actor", 0.995},
+    {"grad_bandit_init_random", true},
+    {"grad_bandit_reward_power", 1},

     // Other
     {"reward_exponent", 1},
@@ -146,7 +154,11 @@ json get_default(std::string base) {
     {"n_desired_eval_len", 100},
     {"bandit_type", "mcts"}, // mcts, grad
     {"grad_bandit_alpha", 0.01},
-    {"use_eps_greedy_learning", false},
+    {"use_eps_greedy_learning", true},
+    {"eps_greedy_epsilon_decay_factor_train", 0.995},
+    {"eps_greedy_epsilon_decay_factor_actor", 0.995},
+    {"grad_bandit_init_random", true},
+    {"grad_bandit_reward_power", 1},

     // Other
     {"reward_exponent", 1},
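A hedged sketch of how these new defaults are consumed. The two grad_bandit_* reads mirror the constructor diff below; the json alias is assumed to behave like nlohmann::json (implicit conversion on operator[]), and the epsilon-decay comment is an assumption, since this commit does not show where the decay factors are applied:

#include <nlohmann/json.hpp>
using json = nlohmann::json;

void read_new_options(json &params) {
    // Mirrors the reads added to GradientBanditSearch's constructor below.
    bool do_init_random = params["grad_bandit_init_random"];  // default true
    int reward_power = params["grad_bandit_reward_power"];    // default 1
    // Assumption (not shown in this commit): the decay factors presumably
    // scale epsilon multiplicatively over time, e.g. eps *= 0.995.
    double decay_train = params["eps_greedy_epsilon_decay_factor_train"];
    (void)do_init_random; (void)reward_power; (void)decay_train;
}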
36 changes: 24 additions & 12 deletions src/gradient_bandit.cpp
@@ -75,6 +75,8 @@ GradientBanditSearch::GradientBanditSearch(EnvWrapper orig_env, A2CLearner a2c_a
   horizon = std::min(horizon, orig_env.env->max_steps);
   double alpha = params["dirichlet_alpha"];
   double frac = params["dirichlet_frac"];
+  bool do_init_random = params["grad_bandit_init_random"];
+  reward_power = params["grad_bandit_reward_power"];

   EnvWrapper env_ = *orig_env.clone();

@@ -115,15 +117,18 @@ GradientBanditSearch::GradientBanditSearch(EnvWrapper orig_env, A2CLearner a2c_a
       break;
     }

-    // In case we evaluated a very good path, add missing bandits with random initialization.
+    // In case we evaluated a very good path, add missing bandits (optionally with random
+    // initialization).
     std::uniform_real_distribution<double> distribution_(0.0, 1.0);
-    for (int j = 0; j < (horizon - i - 1); ++j) {
-      std::vector<double> vec(n_actions);
-      std::generate(
-        vec.begin(),
-        vec.end(),
-        [distribution_, this] () mutable { return distribution_(this->generator); }
-      );
+    for (int j = 0; j < (horizon - i); ++j) {
+      std::vector<double> vec{0.33, 0.33, 0.33};
+      if (do_init_random) {
+        std::generate(
+          vec.begin(),
+          vec.end(),
+          [distribution_, this] () mutable { return distribution_(this->generator); }
+        );
+      }

       // Create the bandit.
       auto bandit = SingleGradientBandit(params);
@@ -142,8 +147,8 @@ GradientBanditSearch::policy(int i, EnvWrapper orig_env, std::vector<float> obs,

   EnvWrapper env = *orig_env.clone();

-  // TODO Hmm.. It could be that horizon is set HIGHER than the maximum horizon of the
-  // environment. So, let's only loop until the size of bandits.
+  // It could be that horizon is set higher than the maximum horizon of the environment.
+  // So let's only loop until the size of bandits.
   int j = i;
   for (; j < bandits.size(); ++j) {
     std::vector<double> action_probs;
@@ -157,10 +162,15 @@ GradientBanditSearch::policy(int i, EnvWrapper orig_env, std::vector<float> obs,
     bool done;
     std::tie(std::ignore, reward, done) = env.step(action);

+    reward = std::pow(reward, reward_power);
     rewards.push_back(reward);

-    if (done)
+    if (done) {
+      // Since we break, the last ++j of the loop is not executed.
+      // To keep things consistent later on, let's do it manually.
+      j += 1;
       break;
+    }
   }

   std::vector<double> cumulative_rewards;
@@ -171,7 +181,9 @@ GradientBanditSearch::policy(int i, EnvWrapper orig_env, std::vector<float> obs,
   }
   std::reverse(cumulative_rewards.begin(), cumulative_rewards.end());

-  for (int m = 0; m < j - i; ++m) {
+  // This had an off by one mistake. Refer to j += 1 a few lines above.
+  int size = std::min((int) bandits.size() - 1, j - i);
+  for (int m = 0; m < size; ++m) {
     bandits[m + i].update(actions_probs_arr[m], actions[m], cumulative_rewards[m]);
   }
 }
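To make the off-by-one fix concrete, a standalone sketch with illustrative numbers (names mirror the diff; this is not the project's code). Each loop iteration takes one environment step, including the final one where done is reported, so the manual j += 1 keeps j - i equal to the number of steps actually taken:

#include <algorithm>
#include <cstdio>

int main() {
    int i = 2;               // bandit index where the rollout starts
    int n_bandits = 10;      // stands in for bandits.size()
    int j = i;
    for (; j < n_bandits; ++j) {
        bool done = (j == 5);  // pretend the environment terminates here
        if (done) {
            // As in the diff: break skips the loop's ++j, so bump j manually.
            j += 1;
            break;
        }
    }
    // Four steps were taken (j = 2, 3, 4, 5), and j - i is now 4.
    // The min() guards the case where the loop runs off the end of bandits.
    int size = std::min(n_bandits - 1, j - i);
    std::printf("bandit updates: %d\n", size);  // prints 4
    return 0;
}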
1 change: 1 addition & 0 deletions src/gradient_bandit.hpp
@@ -36,6 +36,7 @@ class GradientBanditSearch : public Bandit {
   int n_iter;
   int horizon;
   std::mt19937 generator;
+  int reward_power;

   std::vector<SingleGradientBandit> bandits;
   EnvWrapper env;
