In [1]:
import os
from gymhelpers import ExperimentsManager
import scipy.io as scipyio
import numpy as np
from collections import defaultdict

In [2]:
strategies = ["sparsemax","softmax","epsilon"]
backuprules = ["sparsebellman","softbellman","bellman"]
temperatures = [ 1, 0.01, 0.1]
temperatures_name = ["high", "low", "mid"]
action_res_list = [2001, 1001, 101, 3]
action_res_name = ["large","midlarge","midsmall","small"]

env_name = "InvertedDoublePendulum-v1"
min_avg_rwd = 9000
stop_training_min_avg_rwd = 9100
layers_size = [512, 512]
n_ep = 12000
n_exps = 3

gym_stats_dir_prefix = os.path.join('Gym_stats', env_name)
figures_dir = 'Figures'
api_key = '###'
alg_id = '###'

data = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : None))))
for action_res, action_name in zip(action_res_list,action_res_name):
    for temperature, temperature_name in zip(temperatures, temperatures_name):
        for strategy in strategies:
            for backuprule in backuprules:
                print("Problem: {}, Actions: {}, Temp: {}, Strategy: {}, Backup: {}".format(env_name,np.prod(action_res),temperature,strategy,backuprule))
                expsman = ExperimentsManager(env_name=env_name, agent_value_function_hidden_layers_size=layers_size,
                                     figures_dir=figures_dir, discount=0.99, decay_eps=0.995, eps_min=1E-4, learning_rate=3E-4,
                                     decay_lr=True, max_step=2000, replay_memory_max_size=100000, ep_verbose=False,
                                     exp_verbose=False, learning_rate_end=3E-5, batch_size=64, upload_last_exp=False, double_dqn=True, dueling=False,
                                     target_params_update_period_steps=75, replay_period_steps=4, min_avg_rwd=min_avg_rwd,
                                     per_proportional_prioritization=True, per_apply_importance_sampling=True, per_alpha=0.2,
                                     per_beta0=0.4,
                                     results_dir_prefix=gym_stats_dir_prefix, gym_api_key=api_key, gym_algorithm_id=alg_id,
                                     strategy=strategy,backuprule=backuprule,temperature=temperature,action_res=31)
                _, _, Rwd_per_ep_v, Loss_per_ep_v = expsman.run_experiments(n_exps=n_exps, n_ep=n_ep, stop_training_min_avg_rwd=stop_training_min_avg_rwd, plot_results=False)
                data[action_name][temperature_name][strategy][backuprule] = {"reward_list":Rwd_per_ep_v,"loss_list":Loss_per_ep_v}

scipyio.savemat(env_name+".mat", data)
print("{} is finished and is saved".format(env_name))

Problem: InvertedDoublePendulum-v1, Actions: 2001, Temp: 1, Strategy: sparsemax, Backup: sparsebellman


[2017-08-01 19:05:49,475] Making new env: InvertedDoublePendulum-v1
[2017-08-01 19:05:49,683] Making new env: InvertedDoublePendulum-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 1 experiment: 9160.10323838 (std = 0.0).


  factor = higher_border / mu
  lower_border = np.clip(lower_border, mu / factor, mu)
[2017-08-01 20:28:28,393] Making new env: InvertedDoublePendulum-v1


Average episode duration: 412.871863 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 2 experiments: 9251.07976582 (std = 90.9765274429).


[2017-08-01 21:49:53,835] Making new env: InvertedDoublePendulum-v1


Average episode duration: 406.772855 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 3 experiments: 9279.05629838 (std = 84.1617001847).
Average episode duration: 397.518740 ms
Average final reward: 9279.67 (std=426.65).

The 100-episode moving average reached 9000 after 1943 episodes.
Problem: InvertedDoublePendulum-v1, Actions: 2001, Temp: 1, Strategy: sparsemax, Backup: softbellman


[2017-08-01 23:09:34,666] Making new env: InvertedDoublePendulum-v1
[2017-08-01 23:09:34,670] Making new env: InvertedDoublePendulum-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 1 experiment: 8871.40596794 (std = 0.0).


[2017-08-02 00:29:06,943] Making new env: InvertedDoublePendulum-v1


Average episode duration: 397.353028 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 2 experiments: 9074.95245163 (std = 203.546483686).


[2017-08-02 04:30:06,407] Making new env: InvertedDoublePendulum-v1


Average episode duration: 1204.613655 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 3 experiments: 8780.1395584 (std = 448.831889144).
Average episode duration: 384.325171 ms
Average final reward: 8785.67 (std=990.47).

The 100-episode moving average reached 9000 after 8565 episodes.
Problem: InvertedDoublePendulum-v1, Actions: 2001, Temp: 1, Strategy: sparsemax, Backup: bellman


[2017-08-02 05:47:09,176] Making new env: InvertedDoublePendulum-v1
[2017-08-02 05:47:09,180] Making new env: InvertedDoublePendulum-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 1 experiment: 8286.05543903 (std = 0.0).


[2017-08-02 11:40:29,817] Making new env: InvertedDoublePendulum-v1


Average episode duration: 1766.375101 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 2 experiments: 8808.41627274 (std = 522.360833704).


[2017-08-02 13:00:48,972] Making new env: InvertedDoublePendulum-v1


Average episode duration: 401.258812 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 3 experiments: 8984.45772454 (std = 493.850597415).
Average episode duration: 467.962520 ms
Average final reward: 8987.93 (std=852.75).

The 100-episode moving average reached 9000 after 2155 episodes.
Problem: InvertedDoublePendulum-v1, Actions: 2001, Temp: 1, Strategy: softmax, Backup: sparsebellman


[2017-08-02 14:34:35,561] Making new env: InvertedDoublePendulum-v1
[2017-08-02 14:34:35,565] Making new env: InvertedDoublePendulum-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 1 experiment: 9353.23519665 (std = 0.0).


[2017-08-02 15:55:52,007] Making new env: InvertedDoublePendulum-v1


Average episode duration: 406.030935 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 2 experiments: 9352.23143759 (std = 1.00375906327).


[2017-08-02 17:16:21,862] Making new env: InvertedDoublePendulum-v1


Average episode duration: 402.143843 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.
Final mean reward, averaged over 3 experiments: 9258.88401286 (std = 132.015738065).
Average episode duration: 399.933634 ms
Average final reward: 9259.82 (std=526.09).

The 100-episode moving average reached 9000 after 1932 episodes.
Problem: InvertedDoublePendulum-v1, Actions: 2001, Temp: 1, Strategy: softmax, Backup: softbellman


[2017-08-02 18:36:31,719] Making new env: InvertedDoublePendulum-v1
[2017-08-02 18:36:31,723] Making new env: InvertedDoublePendulum-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT InvertedDoublePendulum-v1.
Minimum average reward reached. Stop training and exploration.


KeyboardInterrupt: 