In [None]:
import os
from gymhelpers import ExperimentsManager
import scipy.io as scipyio
import numpy as np
from collections import defaultdict

In [None]:
# ---------------------------------------------------------------------------
# Experiment grid.
# Every combination of action resolution x temperature x exploration strategy
# x backup rule is run: 4 * 3 * 3 * 3 = 108 configurations, each repeated
# n_exps times. The *_name lists are the keys under which the results of the
# corresponding entry are stored, so they must stay index-aligned with the
# value lists they label.
# ---------------------------------------------------------------------------
strategies = ["epsilon","sparsemax","softmax"]
backuprules = ["bellman","sparsebellman","softbellman"]
temperatures = [0.01,0.1,1]
temperatures_name = ["low","mid","high"]
action_res_list = [[31, 31], [51, 51], [11, 11], [3,3]]
action_res_name = ["midlarge","large","midsmall","small"]

env_name = "LunarLanderContinuous-v2"
min_avg_rwd = 200
stop_training_min_avg_rwd = 210
layers_size = [512, 512]
n_ep = 4000
n_exps = 3

gym_stats_dir_prefix = os.path.join('Gym_stats', env_name)
figures_dir = 'Figures'
api_key = '###'
alg_id = '###'

# File that receives the results; rewritten after every configuration.
results_path = env_name+".mat"

# ExperimentsManager settings that are identical for every configuration.
# Built once here instead of being re-spelled inside the innermost loop.
manager_kwargs = dict(
    env_name=env_name, agent_value_function_hidden_layers_size=layers_size,
    figures_dir=figures_dir, discount=0.99, decay_eps=0.999, eps_min=1E-4, learning_rate=3E-4,
    decay_lr=True, max_step=10000, replay_memory_max_size=10000, ep_verbose=False,
    exp_verbose=False, learning_rate_end=3E-5, batch_size=128, upload_last_exp=False, double_dqn=True, dueling=False,
    target_params_update_period_steps=50, replay_period_steps=4, min_avg_rwd=min_avg_rwd,
    per_proportional_prioritization=True, per_apply_importance_sampling=True, per_alpha=0.2,
    per_beta0=0.4,
    results_dir_prefix=gym_stats_dir_prefix, gym_api_key=api_key, gym_algorithm_id=alg_id,
)

# Results are indexed as
#   data[action_name][temperature_name][strategy][backuprule]
#       -> {"reward_list": ..., "loss_list": ...}
# The nested defaultdicts create the intermediate levels on first access.
data = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : None))))
for action_res, action_name in zip(action_res_list,action_res_name):
    for temperature, temperature_name in zip(temperatures, temperatures_name):
        for strategy in strategies:
            for backuprule in backuprules:
                print("Problem: {}, Actions: {}, Temp: {}, Strategy: {}, Backup: {}".format(env_name,np.prod(action_res),temperature,strategy,backuprule))
                expsman = ExperimentsManager(strategy=strategy,backuprule=backuprule,temperature=temperature,action_res=action_res,
                                             **manager_kwargs)
                # Only the per-episode reward and loss series are kept; the
                # first two return values are discarded.
                _, _, Rwd_per_ep_v, Loss_per_ep_v = expsman.run_experiments(n_exps=n_exps, n_ep=n_ep, stop_training_min_avg_rwd=stop_training_min_avg_rwd, plot_results=False)
                data[action_name][temperature_name][strategy][backuprule] = {"reward_list":Rwd_per_ep_v,"loss_list":Loss_per_ep_v}

                # Checkpoint after every configuration. A full sweep runs for
                # a very long time (individual experiments in this notebook's
                # captured output took from ~40 minutes to several hours), so
                # saving only at the very end would lose every finished result
                # if the kernel dies part-way through. Saving early also
                # surfaces any serialisation problem after the first
                # configuration rather than after the last one.
                scipyio.savemat(results_path, data)

# Final write: a no-op repeat of the last checkpoint when the grid is
# non-empty, and it still produces the file if the grid was empty.
scipyio.savemat(results_path, data)
print("{} is finished and is saved".format(env_name))

Problem: LunarLanderContinuous-v2, Actions: 961, Temp: 0.01, Strategy: epsilon, Backup: bellman


[2017-07-31 19:01:05,479] Making new env: LunarLanderContinuous-v2
[2017-07-31 19:01:05,495] Making new env: LunarLanderContinuous-v2



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 1 experiment: -633.818483492 (std = 0.0).


  factor = higher_border / mu
[2017-07-31 19:42:03,280] Making new env: LunarLanderContinuous-v2


Average episode duration: 613.885409 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 2 experiments: -559.074481364 (std = 74.7440021281).


[2017-07-31 20:53:44,485] Making new env: LunarLanderContinuous-v2


Average episode duration: 1074.729788 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 3 experiments: -594.379488972 (std = 78.8500540726).
Average episode duration: 737.497257 ms
Average final reward: -590.46 (std=246.74).

The 100-episode moving average reached 200 after 0 episodes.
Problem: LunarLanderContinuous-v2, Actions: 961, Temp: 0.01, Strategy: epsilon, Backup: sparsebellman


[2017-07-31 21:43:00,746] Making new env: LunarLanderContinuous-v2
[2017-07-31 21:43:00,751] Making new env: LunarLanderContinuous-v2



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 1 experiment: -477.73662309 (std = 0.0).


[2017-07-31 22:26:20,136] Making new env: LunarLanderContinuous-v2


Average episode duration: 649.281170 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 2 experiments: -518.124396063 (std = 40.387772973).


[2017-07-31 23:09:39,800] Making new env: LunarLanderContinuous-v2


Average episode duration: 649.361486 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 3 experiments: -557.059398135 (std = 64.1819048473).
Average episode duration: 630.546128 ms
Average final reward: -554.10 (std=160.71).

The 100-episode moving average reached 200 after 0 episodes.
Problem: LunarLanderContinuous-v2, Actions: 961, Temp: 0.01, Strategy: epsilon, Backup: softbellman


[2017-07-31 23:51:48,146] Making new env: LunarLanderContinuous-v2
[2017-07-31 23:51:48,150] Making new env: LunarLanderContinuous-v2



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 1 experiment: -565.940974225 (std = 0.0).


[2017-08-01 00:37:46,066] Making new env: LunarLanderContinuous-v2


Average episode duration: 688.912735 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 2 experiments: -576.200585172 (std = 10.259610947).


[2017-08-01 01:19:53,915] Making new env: LunarLanderContinuous-v2


Average episode duration: 631.418615 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 3 experiments: -579.450979406 (std = 9.55527096477).
Average episode duration: 816.729990 ms
Average final reward: -578.89 (std=164.08).

The 100-episode moving average reached 200 after 0 episodes.
Problem: LunarLanderContinuous-v2, Actions: 961, Temp: 0.01, Strategy: sparsemax, Backup: bellman


[2017-08-01 02:14:27,078] Making new env: LunarLanderContinuous-v2
[2017-08-01 02:14:27,083] Making new env: LunarLanderContinuous-v2



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.


  p = p/p.sum()
  a = np.random.choice(self.n_actions, p=policy)


Final mean reward, averaged over 1 experiment: -976.126264899 (std = 0.0).


[2017-08-01 05:53:43,503] Making new env: LunarLanderContinuous-v2


Average episode duration: 3288.561200 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
Final mean reward, averaged over 2 experiments: -631.627077441 (std = 344.499187457).


[2017-08-01 10:58:14,411] Making new env: LunarLanderContinuous-v2


Average episode duration: 4567.169013 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT LunarLanderContinuous-v2.
