In [1]:
import os
from gymhelpers import ExperimentsManager
import scipy.io as scipyio
import numpy as np
from collections import defaultdict

In [2]:
# --- Sweep axes --------------------------------------------------------------
# The experiment loop later in this cell runs every combination of
# (action resolution, temperature, strategy, backup rule).
strategies = ["epsilon", "sparsemax", "softmax"]
backuprules = ["bellman", "sparsebellman", "softbellman"]

# (label, value) pairs: the label is used as a key in the saved results, the
# value is what gets passed to ExperimentsManager as `temperature`.
_temperature_levels = (("low", 0.01), ("mid", 0.1), ("high", 1))
temperatures_name, temperatures = (list(column) for column in zip(*_temperature_levels))

# (label, size) pairs: each action resolution is a two-element list with the
# same size repeated; the label is used as a key in the saved results.
_action_grids = (("midlarge", 31), ("large", 51), ("midsmall", 11), ("small", 3))
action_res_name = list(label for label, _ in _action_grids)
action_res_list = list([size, size] for _, size in _action_grids)

# --- Environment and training budget -----------------------------------------
env_name = "Reacher-v1"
n_exps = 3
n_ep = 10000
layers_size = [256] * 4
min_avg_rwd = -3.5
stop_training_min_avg_rwd = -3.75

# --- Output locations and (redacted) Gym upload credentials ------------------
figures_dir = 'Figures'
gym_stats_dir_prefix = os.path.join('Gym_stats', env_name)
api_key = '###'
alg_id = '###'

# Results of the sweep, keyed as
#   data[action_name][temperature_name][strategy][backuprule]
# Each leaf is a dict with "reward_list" and "loss_list" entries. The nested
# defaultdicts create the intermediate levels on first assignment.
data = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : defaultdict(lambda : None))))

# File the results are written to (and re-written after every configuration).
results_path = env_name+".mat"

# Keyword arguments that are identical for every configuration; only
# strategy / backuprule / temperature / action_res change inside the loop.
shared_settings = dict(
    env_name=env_name, agent_value_function_hidden_layers_size=layers_size,
    figures_dir=figures_dir, discount=0.99, decay_eps=0.999, eps_min=1E-4, learning_rate=3E-4,
    decay_lr=True, max_step=10000, replay_memory_max_size=10000, ep_verbose=False,
    exp_verbose=False, learning_rate_end=3E-5, batch_size=128, upload_last_exp=False, double_dqn=True, dueling=False,
    target_params_update_period_steps=50, replay_period_steps=4, min_avg_rwd=min_avg_rwd,
    per_proportional_prioritization=True, per_apply_importance_sampling=True, per_alpha=0.2,
    per_beta0=0.4,
    results_dir_prefix=gym_stats_dir_prefix, gym_api_key=api_key, gym_algorithm_id=alg_id,
)

for action_res, action_name in zip(action_res_list,action_res_name):
    for temperature, temperature_name in zip(temperatures, temperatures_name):
        for strategy in strategies:
            for backuprule in backuprules:
                print("Problem: {}, Actions: {}, Temp: {}, Strategy: {}, Backup: {}".format(env_name,np.prod(action_res),temperature,strategy,backuprule))
                expsman = ExperimentsManager(strategy=strategy,backuprule=backuprule,temperature=temperature,action_res=action_res,
                                             **shared_settings)
                # Only the third and fourth return values are kept; the first two are ignored.
                _, _, Rwd_per_ep_v, Loss_per_ep_v = expsman.run_experiments(n_exps=n_exps, n_ep=n_ep, stop_training_min_avg_rwd=stop_training_min_avg_rwd, plot_results=False)
                data[action_name][temperature_name][strategy][backuprule] = {"reward_list":Rwd_per_ep_v,"loss_list":Loss_per_ep_v}

                # Checkpoint: the timestamps in the cell output show each configuration
                # taking on the order of hours, so persist what has been collected so far
                # instead of risking the whole sweep on a crash or kernel interrupt.
                scipyio.savemat(results_path, data)

# Final write; also covers the case where the sweep lists are empty and no
# checkpoint was ever written.
scipyio.savemat(results_path, data)
print("{} is finished and is saved".format(env_name))

Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: epsilon, Backup: bellman


[2017-07-19 17:21:42,880] Making new env: Reacher-v1
[2017-07-19 17:21:43,076] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.14196741513 (std = 0.0).


  factor = higher_border / mu
[2017-07-19 17:46:47,883] Making new env: Reacher-v1


Average episode duration: 149.982422 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -9.84963664741 (std = 3.70766923229).


[2017-07-19 18:12:06,689] Making new env: Reacher-v1


Average episode duration: 151.408106 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -8.40927429789 (std = 3.64881188241).
Average episode duration: 150.629126 ms
Average final reward: -8.42 (std=2.22).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: epsilon, Backup: sparsebellman


[2017-07-19 18:37:27,468] Making new env: Reacher-v1
[2017-07-19 18:37:27,487] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.78214733866 (std = 0.0).


[2017-07-19 19:02:55,856] Making new env: Reacher-v1


Average episode duration: 152.406576 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.64210180986 (std = 0.1400455288).


[2017-07-19 19:28:36,964] Making new env: Reacher-v1


Average episode duration: 153.660438 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.52578215674 (std = 0.200338938209).
Average episode duration: 148.482311 ms
Average final reward: -5.53 (std=1.58).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: epsilon, Backup: softbellman


[2017-07-19 19:53:36,241] Making new env: Reacher-v1
[2017-07-19 19:53:36,247] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -13.6772879112 (std = 0.0).


[2017-07-19 20:19:17,564] Making new env: Reacher-v1


Average episode duration: 153.664866 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.1998084974 (std = 3.47747941376).


[2017-07-19 20:44:13,158] Making new env: Reacher-v1


Average episode duration: 149.110859 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -12.0824045457 (std = 3.89233149637).
Average episode duration: 151.062666 ms
Average final reward: -12.09 (std=2.75).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: sparsemax, Backup: bellman


[2017-07-19 21:09:38,949] Making new env: Reacher-v1
[2017-07-19 21:09:39,232] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.90467178744 (std = 0.0).


[2017-07-19 22:10:47,231] Making new env: Reacher-v1


Average episode duration: 366.276452 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.79760564272 (std = 0.107066144723).


[2017-07-19 23:11:54,992] Making new env: Reacher-v1


Average episode duration: 366.267865 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.86709492717 (std = 0.131528048483).
Average episode duration: 366.390885 ms
Average final reward: -5.89 (std=1.64).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: sparsemax, Backup: sparsebellman


[2017-07-20 00:13:13,353] Making new env: Reacher-v1
[2017-07-20 00:13:13,359] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.75962638465 (std = 0.0).


[2017-07-20 01:14:18,202] Making new env: Reacher-v1


Average episode duration: 365.983974 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.0202513195 (std = 4.26062493483).


[2017-07-20 02:15:24,607] Making new env: Reacher-v1


Average episode duration: 366.180261 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -8.59456990542 (std = 4.02083136664).
Average episode duration: 364.464239 ms
Average final reward: -8.63 (std=1.95).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: sparsemax, Backup: softbellman


[2017-07-20 03:16:23,685] Making new env: Reacher-v1
[2017-07-20 03:16:23,972] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.09881413559 (std = 0.0).


[2017-07-20 04:16:49,396] Making new env: Reacher-v1


Average episode duration: 362.092365 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.95467644999 (std = 0.144137685597).


[2017-07-20 05:17:16,866] Making new env: Reacher-v1


Average episode duration: 362.283827 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -8.84934284911 (std = 4.09536781897).
Average episode duration: 363.343298 ms
Average final reward: -8.87 (std=1.87).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: softmax, Backup: bellman


[2017-07-20 06:18:04,771] Making new env: Reacher-v1
[2017-07-20 06:18:05,048] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.45191096253 (std = 0.0).


[2017-07-20 06:43:56,623] Making new env: Reacher-v1


Average episode duration: 154.701690 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.27215648553 (std = 0.179754477005).


[2017-07-20 07:09:20,053] Making new env: Reacher-v1


Average episode duration: 151.846642 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.2817163958 (std = 0.147390292889).
Average episode duration: 151.830512 ms
Average final reward: -5.29 (std=1.45).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: softmax, Backup: sparsebellman


[2017-07-20 07:34:52,935] Making new env: Reacher-v1
[2017-07-20 07:34:52,941] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.03094931834 (std = 0.0).


[2017-07-20 08:00:38,396] Making new env: Reacher-v1


Average episode duration: 154.059802 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.61250344996 (std = 0.418445868384).


[2017-07-20 08:26:31,761] Making new env: Reacher-v1


Average episode duration: 154.860857 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.49594492997 (std = 0.379345586074).
Average episode duration: 152.794421 ms
Average final reward: -5.51 (std=1.40).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.01, Strategy: softmax, Backup: softbellman


[2017-07-20 08:52:14,710] Making new env: Reacher-v1
[2017-07-20 08:52:14,717] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.31132766072 (std = 0.0).


[2017-07-20 09:17:29,549] Making new env: Reacher-v1


Average episode duration: 151.035426 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.4710129663 (std = 0.159685305572).


[2017-07-20 09:42:54,069] Making new env: Reacher-v1


Average episode duration: 151.947925 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.50697953124 (std = 0.139952797274).
Average episode duration: 151.368601 ms
Average final reward: -5.53 (std=1.44).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: epsilon, Backup: bellman


[2017-07-20 10:08:22,454] Making new env: Reacher-v1
[2017-07-20 10:08:22,459] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.79827299035 (std = 0.0).


[2017-07-20 10:33:46,479] Making new env: Reacher-v1


Average episode duration: 151.877996 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.05613595762 (std = 0.742137032731).


[2017-07-20 10:59:30,240] Making new env: Reacher-v1


Average episode duration: 153.888652 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.89916964552 (std = 0.645333322013).
Average episode duration: 155.986498 ms
Average final reward: -5.90 (std=1.65).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: epsilon, Backup: sparsebellman


[2017-07-20 11:25:46,996] Making new env: Reacher-v1
[2017-07-20 11:25:47,002] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.52806850221 (std = 0.0).


[2017-07-20 11:52:04,939] Making new env: Reacher-v1


Average episode duration: 157.345303 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.86341867716 (std = 0.664649825058).


[2017-07-20 12:17:31,144] Making new env: Reacher-v1


Average episode duration: 152.124702 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.76698383656 (std = 0.559558412437).
Average episode duration: 150.374733 ms
Average final reward: -5.77 (std=1.63).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: epsilon, Backup: softbellman


[2017-07-20 12:42:49,145] Making new env: Reacher-v1
[2017-07-20 12:42:49,151] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.63946327479 (std = 0.0).


[2017-07-20 13:07:58,191] Making new env: Reacher-v1


Average episode duration: 150.421368 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.66158114366 (std = 0.0221178688721).


[2017-07-20 13:32:50,189] Making new env: Reacher-v1


Average episode duration: 148.764736 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.63500108351 (std = 0.0417029089081).
Average episode duration: 149.504130 ms
Average final reward: -5.64 (std=1.50).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: sparsemax, Backup: bellman


[2017-07-20 13:57:59,786] Making new env: Reacher-v1
[2017-07-20 13:57:59,792] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.72332978563 (std = 0.0).


[2017-07-20 14:58:33,632] Making new env: Reacher-v1


Average episode duration: 362.908150 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.53721096553 (std = 0.186118820102).


[2017-07-20 15:59:11,098] Making new env: Reacher-v1


Average episode duration: 363.258433 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.40264033387 (std = 0.243540523543).
Average episode duration: 364.408632 ms
Average final reward: -5.40 (std=1.61).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: sparsemax, Backup: sparsebellman


[2017-07-20 17:00:10,581] Making new env: Reacher-v1
[2017-07-20 17:00:10,587] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.35367031624 (std = 0.0).


[2017-07-20 18:00:45,814] Making new env: Reacher-v1


Average episode duration: 363.038133 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.2753154271 (std = 0.0783548891387).


[2017-07-20 19:01:37,181] Making new env: Reacher-v1


Average episode duration: 364.624979 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.37101952488 (std = 0.14970484661).
Average episode duration: 362.967354 ms
Average final reward: -5.40 (std=1.58).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: sparsemax, Backup: softbellman


[2017-07-20 20:02:21,231] Making new env: Reacher-v1
[2017-07-20 20:02:21,237] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.70260775046 (std = 0.0).


[2017-07-20 21:02:44,306] Making new env: Reacher-v1


Average episode duration: 361.810065 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.39044888353 (std = 0.312158866931).


[2017-07-20 22:03:20,378] Making new env: Reacher-v1


Average episode duration: 363.101786 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.48205311912 (std = 0.285910436022).
Average episode duration: 364.982225 ms
Average final reward: -5.48 (std=1.59).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: softmax, Backup: bellman


[2017-07-20 23:04:26,879] Making new env: Reacher-v1
[2017-07-20 23:04:26,886] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -23.5189023247 (std = 0.0).


[2017-07-20 23:29:14,087] Making new env: Reacher-v1


Average episode duration: 148.217175 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -27.2905649471 (std = 3.77166262239).


[2017-07-20 23:54:08,278] Making new env: Reacher-v1


Average episode duration: 148.918866 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -26.6624633505 (std = 3.20509737596).
Average episode duration: 148.576247 ms
Average final reward: -26.68 (std=2.91).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: softmax, Backup: sparsebellman


[2017-07-21 00:19:09,465] Making new env: Reacher-v1
[2017-07-21 00:19:09,471] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -21.780332305 (std = 0.0).


[2017-07-21 00:43:58,630] Making new env: Reacher-v1


Average episode duration: 148.386228 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -20.3307186853 (std = 1.44961361965).


[2017-07-21 01:08:49,772] Making new env: Reacher-v1


Average episode duration: 148.563816 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -20.7736428094 (std = 1.33913528951).
Average episode duration: 149.114244 ms
Average final reward: -20.77 (std=2.54).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 0.1, Strategy: softmax, Backup: softbellman


[2017-07-21 01:33:55,418] Making new env: Reacher-v1
[2017-07-21 01:33:55,424] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -16.2767721001 (std = 0.0).


[2017-07-21 01:59:32,469] Making new env: Reacher-v1


Average episode duration: 153.232225 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -23.10302092 (std = 6.82624881984).


[2017-07-21 02:24:33,407] Making new env: Reacher-v1


Average episode duration: 149.378669 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -23.8402841712 (std = 5.6702935994).
Average episode duration: 149.707580 ms
Average final reward: -23.90 (std=3.26).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: epsilon, Backup: bellman


[2017-07-21 02:49:45,261] Making new env: Reacher-v1
[2017-07-21 02:49:45,538] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.80130239937 (std = 0.0).


[2017-07-21 03:14:22,401] Making new env: Reacher-v1


Average episode duration: 147.222780 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.53308791951 (std = 0.268214479868).


[2017-07-21 03:39:25,082] Making new env: Reacher-v1


Average episode duration: 149.785970 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.67109845067 (std = 0.293348174642).
Average episode duration: 147.319731 ms
Average final reward: -5.68 (std=1.61).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: epsilon, Backup: sparsebellman


[2017-07-21 04:04:13,535] Making new env: Reacher-v1
[2017-07-21 04:04:13,540] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.83107458263 (std = 0.0).


[2017-07-21 04:29:11,790] Making new env: Reacher-v1


Average episode duration: 149.391749 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.58333529511 (std = 0.247739287518).


[2017-07-21 04:54:23,946] Making new env: Reacher-v1


Average episode duration: 150.782049 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.54703498209 (std = 0.208690988075).
Average episode duration: 149.064038 ms
Average final reward: -5.56 (std=1.48).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: epsilon, Backup: softbellman


[2017-07-21 05:19:29,090] Making new env: Reacher-v1
[2017-07-21 05:19:29,096] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.11562900412 (std = 0.0).


[2017-07-21 05:43:59,121] Making new env: Reacher-v1


Average episode duration: 146.511555 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.64469682837 (std = 0.470932175751).


[2017-07-21 06:08:35,556] Making new env: Reacher-v1


Average episode duration: 147.200840 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.69991020266 (std = 0.392362642002).
Average episode duration: 147.134330 ms
Average final reward: -5.71 (std=1.70).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: sparsemax, Backup: bellman


[2017-07-21 06:33:22,009] Making new env: Reacher-v1
[2017-07-21 06:33:22,054] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.77579396773 (std = 0.0).


[2017-07-21 07:35:08,178] Making new env: Reacher-v1


Average episode duration: 370.160915 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.6244479486 (std = 0.151346019125).


[2017-07-21 08:36:44,982] Making new env: Reacher-v1


Average episode duration: 369.239537 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.63996729869 (std = 0.125507418602).
Average episode duration: 370.179116 ms
Average final reward: -5.65 (std=1.39).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: sparsemax, Backup: sparsebellman


[2017-07-21 09:38:41,116] Making new env: Reacher-v1
[2017-07-21 09:38:41,122] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -19.46223621 (std = 0.0).


[2017-07-21 10:40:01,955] Making new env: Reacher-v1


Average episode duration: 367.609798 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -12.4105417326 (std = 7.05169447744).


[2017-07-21 11:41:54,679] Making new env: Reacher-v1


Average episode duration: 370.803733 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.1329284471 (std = 6.5974218115).
Average episode duration: 370.151404 ms
Average final reward: -10.14 (std=1.94).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: sparsemax, Backup: softbellman


[2017-07-21 12:43:50,403] Making new env: Reacher-v1
[2017-07-21 12:43:50,408] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.62395762952 (std = 0.0).


[2017-07-21 13:45:22,672] Making new env: Reacher-v1


Average episode duration: 368.725889 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.6860240996 (std = 5.06206647006).


[2017-07-21 14:46:40,689] Making new env: Reacher-v1


Average episode duration: 367.308562 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.4999642745 (std = 4.2904555137).
Average episode duration: 367.651074 ms
Average final reward: -11.54 (std=2.05).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: softmax, Backup: bellman


[2017-07-21 15:48:12,577] Making new env: Reacher-v1
[2017-07-21 15:48:12,583] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -50.1315862588 (std = 0.0).


[2017-07-21 16:13:38,888] Making new env: Reacher-v1


Average episode duration: 152.175130 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -44.6514359637 (std = 5.48015029507).


[2017-07-21 16:38:41,899] Making new env: Reacher-v1


Average episode duration: 149.846337 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -44.2003739454 (std = 4.51976533976).
Average episode duration: 152.742423 ms
Average final reward: -44.17 (std=4.48).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: softmax, Backup: sparsebellman


[2017-07-21 17:04:25,054] Making new env: Reacher-v1
[2017-07-21 17:04:25,338] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -36.8288235858 (std = 0.0).


[2017-07-21 17:29:32,378] Making new env: Reacher-v1


Average episode duration: 150.207590 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -25.9261178512 (std = 10.9027057346).


[2017-07-21 17:54:54,011] Making new env: Reacher-v1


Average episode duration: 151.721870 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -29.6240784152 (std = 10.3245251492).
Average episode duration: 153.869635 ms
Average final reward: -29.64 (std=2.56).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 961, Temp: 1, Strategy: softmax, Backup: softbellman


[2017-07-21 18:20:47,626] Making new env: Reacher-v1
[2017-07-21 18:20:47,631] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -44.8068378782 (std = 0.0).


[2017-07-21 18:45:47,710] Making new env: Reacher-v1


Average episode duration: 149.556157 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -47.2641520747 (std = 2.45731419654).


[2017-07-21 19:10:58,451] Making new env: Reacher-v1


Average episode duration: 150.592948 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -46.7024307556 (std = 2.15792888074).
Average episode duration: 148.986513 ms
Average final reward: -46.75 (std=6.02).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: epsilon, Backup: bellman


[2017-07-21 19:36:02,704] Making new env: Reacher-v1
[2017-07-21 19:36:02,745] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.49564586424 (std = 0.0).


[2017-07-21 20:10:26,871] Making new env: Reacher-v1


Average episode duration: 205.973958 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.62333555982 (std = 0.127689695577).


[2017-07-21 20:44:42,266] Making new env: Reacher-v1


Average episode duration: 205.089292 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.5011999934 (std = 0.201752237674).
Average episode duration: 206.772834 ms
Average final reward: -6.51 (std=2.15).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: epsilon, Backup: sparsebellman


[2017-07-21 21:19:25,742] Making new env: Reacher-v1
[2017-07-21 21:19:26,026] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.71921426808 (std = 0.0).


[2017-07-21 21:53:42,902] Making new env: Reacher-v1


Average episode duration: 205.215010 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.95658494565 (std = 0.237370677573).


[2017-07-21 22:27:45,514] Making new env: Reacher-v1


Average episode duration: 203.799718 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.9994854124 (std = 0.203086498344).
Average episode duration: 208.391271 ms
Average final reward: -6.02 (std=1.79).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: epsilon, Backup: softbellman


[2017-07-21 23:02:45,041] Making new env: Reacher-v1
[2017-07-21 23:02:45,380] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.11802607351 (std = 0.0).


[2017-07-21 23:36:59,205] Making new env: Reacher-v1


Average episode duration: 204.853018 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.0831470806 (std = 4.96512100712).


[2017-07-22 00:11:17,946] Making new env: Reacher-v1


Average episode duration: 205.427293 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -9.46649165918 (std = 4.65425617906).
Average episode duration: 205.713694 ms
Average final reward: -9.49 (std=2.26).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: sparsemax, Backup: bellman


[2017-07-22 00:45:49,749] Making new env: Reacher-v1
[2017-07-22 00:45:49,756] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -15.4569065008 (std = 0.0).


[2017-07-22 03:02:56,997] Making new env: Reacher-v1


Average episode duration: 822.269021 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.986788545 (std = 4.47011795579).


[2017-07-22 05:19:08,574] Making new env: Reacher-v1


Average episode duration: 816.688405 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -9.4101462968 (std = 4.27702052645).
Average episode duration: 817.556409 ms
Average final reward: -9.41 (std=2.51).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: sparsemax, Backup: sparsebellman


[2017-07-22 07:35:39,329] Making new env: Reacher-v1
[2017-07-22 07:35:39,336] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.64365510909 (std = 0.0).


[2017-07-22 09:52:40,180] Making new env: Reacher-v1


Average episode duration: 821.613548 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.20215996474 (std = 0.441495144349).


[2017-07-22 12:09:37,631] Making new env: Reacher-v1


Average episode duration: 821.264352 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.26265456355 (std = 0.370492241879).
Average episode duration: 830.714451 ms
Average final reward: -6.26 (std=2.00).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: sparsemax, Backup: softbellman


[2017-07-22 14:28:22,810] Making new env: Reacher-v1
[2017-07-22 14:28:22,862] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.71228159644 (std = 0.0).


[2017-07-22 16:45:01,537] Making new env: Reacher-v1


Average episode duration: 819.372472 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.48559902238 (std = 0.226682574065).


[2017-07-22 19:02:26,983] Making new env: Reacher-v1


Average episode duration: 824.078710 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.31914721749 (std = 0.299447935858).
Average episode duration: 823.731488 ms
Average final reward: -6.36 (std=2.07).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: softmax, Backup: bellman


[2017-07-22 21:19:59,683] Making new env: Reacher-v1
[2017-07-22 21:19:59,966] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -18.9911740568 (std = 0.0).


[2017-07-22 21:55:04,216] Making new env: Reacher-v1


Average episode duration: 209.992463 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -12.30012015 (std = 6.69105390683).


[2017-07-22 22:29:55,662] Making new env: Reacher-v1


Average episode duration: 208.638969 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.2593100985 (std = 6.17872259627).
Average episode duration: 209.109513 ms
Average final reward: -10.24 (std=2.54).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: softmax, Backup: sparsebellman


[2017-07-22 23:05:01,601] Making new env: Reacher-v1
[2017-07-22 23:05:01,608] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.58712591079 (std = 0.0).


[2017-07-22 23:40:22,728] Making new env: Reacher-v1


Average episode duration: 211.638972 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.84043932236 (std = 0.253313411567).


[2017-07-23 00:16:03,085] Making new env: Reacher-v1


Average episode duration: 213.593603 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -9.14036475629 (std = 4.67138033083).
Average episode duration: 209.336278 ms
Average final reward: -9.16 (std=2.42).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.01, Strategy: softmax, Backup: softbellman


[2017-07-23 00:51:11,051] Making new env: Reacher-v1
[2017-07-23 00:51:11,341] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.71267279042 (std = 0.0).


[2017-07-23 01:25:58,018] Making new env: Reacher-v1


Average episode duration: 208.191560 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.4227832704 (std = 0.289889520021).


[2017-07-23 02:01:13,217] Making new env: Reacher-v1


Average episode duration: 210.757042 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.6337894223 (std = 1.72890015784).
Average episode duration: 211.146473 ms
Average final reward: -7.65 (std=1.92).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: epsilon, Backup: bellman


[2017-07-23 02:36:39,794] Making new env: Reacher-v1
[2017-07-23 02:36:40,077] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.30467476887 (std = 0.0).


[2017-07-23 03:11:02,113] Making new env: Reacher-v1


Average episode duration: 205.697424 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.12624607473 (std = 0.178428694144).


[2017-07-23 03:45:05,534] Making new env: Reacher-v1


Average episode duration: 203.870443 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.91989552857 (std = 0.326168098379).
Average episode duration: 207.002183 ms
Average final reward: -5.93 (std=1.94).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: epsilon, Backup: sparsebellman


[2017-07-23 04:19:51,105] Making new env: Reacher-v1
[2017-07-23 04:19:51,378] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.12162736205 (std = 0.0).


[2017-07-23 04:54:45,579] Making new env: Reacher-v1


Average episode duration: 208.960030 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.07128748036 (std = 0.0503398816913).


[2017-07-23 05:29:08,665] Making new env: Reacher-v1


Average episode duration: 205.857573 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.11445427592 (std = 0.0735944762722).
Average episode duration: 208.486424 ms
Average final reward: -6.14 (std=1.97).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: epsilon, Backup: softbellman


[2017-07-23 06:04:08,535] Making new env: Reacher-v1
[2017-07-23 06:04:08,813] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.51646425842 (std = 0.0).


[2017-07-23 06:38:54,083] Making new env: Reacher-v1


Average episode duration: 207.955065 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.38274241969 (std = 0.133721838722).


[2017-07-23 07:12:53,299] Making new env: Reacher-v1


Average episode duration: 203.385057 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.2384779922 (std = 0.231398941619).
Average episode duration: 207.115903 ms
Average final reward: -6.25 (std=1.79).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: sparsemax, Backup: bellman


[2017-07-23 07:47:39,217] Making new env: Reacher-v1
[2017-07-23 07:47:39,225] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.86539060125 (std = 0.0).


[2017-07-23 10:04:55,929] Making new env: Reacher-v1


Average episode duration: 823.215817 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.64066379086 (std = 0.22472681039).


[2017-07-23 12:21:17,012] Making new env: Reacher-v1


Average episode duration: 817.621486 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.80780007562 (std = 0.299227619446).
Average episode duration: 822.198909 ms
Average final reward: -5.83 (std=2.01).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: sparsemax, Backup: sparsebellman


[2017-07-23 14:38:33,623] Making new env: Reacher-v1
[2017-07-23 14:38:33,630] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.49454650739 (std = 0.0).


[2017-07-23 16:55:18,423] Making new env: Reacher-v1


Average episode duration: 819.950423 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.55183771595 (std = 0.0572912085648).


[2017-07-23 19:12:17,196] Making new env: Reacher-v1


Average episode duration: 821.234254 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.52604901407 (std = 0.0593152819706).
Average episode duration: 817.379398 ms
Average final reward: -5.53 (std=1.79).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: sparsemax, Backup: softbellman


[2017-07-23 21:28:46,185] Making new env: Reacher-v1
[2017-07-23 21:28:46,368] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -15.8733176513 (std = 0.0).


[2017-07-23 23:45:37,758] Making new env: Reacher-v1


Average episode duration: 820.650007 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.8450701473 (std = 5.028247504).


[2017-07-24 02:02:25,420] Making new env: Reacher-v1


Average episode duration: 820.293477 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -9.33652647935 (std = 4.62676164328).
Average episode duration: 815.925537 ms
Average final reward: -9.32 (std=2.71).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: softmax, Backup: bellman


[2017-07-24 04:18:39,899] Making new env: Reacher-v1
[2017-07-24 04:18:40,310] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -24.9861067181 (std = 0.0).


[2017-07-24 04:53:22,711] Making new env: Reacher-v1


Average episode duration: 207.693519 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -22.8586837238 (std = 2.12742299422).


[2017-07-24 05:27:41,267] Making new env: Reacher-v1


Average episode duration: 205.384634 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -27.4588541452 (std = 6.73352964961).
Average episode duration: 207.359739 ms
Average final reward: -27.46 (std=2.49).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: softmax, Backup: sparsebellman


[2017-07-24 06:02:29,456] Making new env: Reacher-v1
[2017-07-24 06:02:29,734] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -27.9374085332 (std = 0.0).


[2017-07-24 06:37:05,927] Making new env: Reacher-v1


Average episode duration: 207.101272 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -29.4274713763 (std = 1.49006284313).


[2017-07-24 07:11:26,171] Making new env: Reacher-v1


Average episode duration: 205.559507 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -25.5393924702 (std = 5.63156343101).
Average episode duration: 206.970997 ms
Average final reward: -25.54 (std=3.05).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 0.1, Strategy: softmax, Backup: softbellman


[2017-07-24 07:46:10,613] Making new env: Reacher-v1
[2017-07-24 07:46:10,891] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -48.1689749605 (std = 0.0).


[2017-07-24 08:20:57,827] Making new env: Reacher-v1


Average episode duration: 208.246543 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -36.4665767278 (std = 11.7023982327).


[2017-07-24 08:55:38,945] Making new env: Reacher-v1


Average episode duration: 207.651233 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -32.9173343984 (std = 10.7931301526).
Average episode duration: 211.242512 ms
Average final reward: -32.90 (std=2.91).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: epsilon, Backup: bellman


[2017-07-24 09:31:06,304] Making new env: Reacher-v1
[2017-07-24 09:31:06,579] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.58308105017 (std = 0.0).


[2017-07-24 10:04:57,456] Making new env: Reacher-v1


Average episode duration: 202.604924 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.32704178529 (std = 0.256039264878).


[2017-07-24 10:38:50,228] Making new env: Reacher-v1


Average episode duration: 202.781838 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.44876909805 (std = 0.270812014957).
Average episode duration: 209.551344 ms
Average final reward: -6.47 (std=2.25).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: epsilon, Backup: sparsebellman


[2017-07-24 11:14:00,980] Making new env: Reacher-v1
[2017-07-24 11:14:00,986] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.38516089824 (std = 0.0).


[2017-07-24 11:48:26,309] Making new env: Reacher-v1


Average episode duration: 206.082600 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.28454895273 (std = 0.100611945509).


[2017-07-24 12:23:15,349] Making new env: Reacher-v1


Average episode duration: 208.374080 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.19418020432 (std = 0.151926068962).
Average episode duration: 205.323047 ms
Average final reward: -6.20 (std=1.97).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: epsilon, Backup: softbellman


[2017-07-24 12:57:43,388] Making new env: Reacher-v1
[2017-07-24 12:57:43,395] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.55228203836 (std = 0.0).


[2017-07-24 13:32:17,254] Making new env: Reacher-v1


Average episode duration: 206.906187 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.25162241198 (std = 0.300659626378).


[2017-07-24 14:07:05,207] Making new env: Reacher-v1


Average episode duration: 208.322218 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.07508486295 (std = 0.350135906551).
Average episode duration: 209.688792 ms
Average final reward: -6.09 (std=1.83).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: sparsemax, Backup: bellman


[2017-07-24 14:42:19,717] Making new env: Reacher-v1
[2017-07-24 14:42:19,724] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -20.2734324029 (std = 0.0).


[2017-07-24 17:01:01,599] Making new env: Reacher-v1


Average episode duration: 831.705370 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -12.7322325255 (std = 7.54119987743).


[2017-07-24 19:20:02,494] Making new env: Reacher-v1


Average episode duration: 833.638339 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.1722599564 (std = 7.14283203664).
Average episode duration: 830.095530 ms
Average final reward: -10.16 (std=2.88).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: sparsemax, Backup: sparsebellman


[2017-07-24 21:38:38,344] Making new env: Reacher-v1
[2017-07-24 21:38:38,352] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -15.9354960134 (std = 0.0).


[2017-07-24 23:56:58,341] Making new env: Reacher-v1


Average episode duration: 829.505883 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.5246332689 (std = 5.41086274453).


[2017-07-25 02:14:32,889] Making new env: Reacher-v1


Average episode duration: 824.962839 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.5435799258 (std = 4.64702001354).
Average episode duration: 829.651151 ms
Average final reward: -11.54 (std=2.12).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: sparsemax, Backup: softbellman


[2017-07-25 04:33:04,473] Making new env: Reacher-v1
[2017-07-25 04:33:04,761] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.44630726521 (std = 0.0).


[2017-07-25 06:51:41,942] Making new env: Reacher-v1


Average episode duration: 831.143341 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -12.3059752844 (std = 6.85966801919).


[2017-07-25 09:09:00,320] Making new env: Reacher-v1


Average episode duration: 823.371594 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -9.97102462378 (std = 6.50184738343).
Average episode duration: 833.073163 ms
Average final reward: -9.98 (std=2.40).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: softmax, Backup: bellman


[2017-07-25 11:28:05,297] Making new env: Reacher-v1
[2017-07-25 11:28:05,304] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -48.645752356 (std = 0.0).


[2017-07-25 12:02:28,673] Making new env: Reacher-v1


Average episode duration: 205.812110 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -43.2667659786 (std = 5.37898637746).


[2017-07-25 12:36:54,492] Making new env: Reacher-v1


Average episode duration: 206.124158 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -41.15162252 (std = 5.3138178365).
Average episode duration: 206.569877 ms
Average final reward: -41.22 (std=4.97).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: softmax, Backup: sparsebellman


[2017-07-25 13:11:34,547] Making new env: Reacher-v1
[2017-07-25 13:11:34,554] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -34.0230344591 (std = 0.0).


[2017-07-25 13:46:25,987] Making new env: Reacher-v1


Average episode duration: 208.659825 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -39.5337589793 (std = 5.51072452021).


[2017-07-25 14:20:46,199] Making new env: Reacher-v1


Average episode duration: 205.560707 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -39.3804910192 (std = 4.50470553536).
Average episode duration: 208.106078 ms
Average final reward: -39.33 (std=4.76).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 2601, Temp: 1, Strategy: softmax, Backup: softbellman


[2017-07-25 14:55:43,028] Making new env: Reacher-v1
[2017-07-25 14:55:43,323] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -35.7258994092 (std = 0.0).


[2017-07-25 15:30:31,818] Making new env: Reacher-v1


Average episode duration: 208.329170 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -46.4427212174 (std = 10.7168218082).


[2017-07-25 16:05:07,400] Making new env: Reacher-v1


Average episode duration: 207.096997 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -44.1458326922 (std = 9.33371524319).
Average episode duration: 205.630737 ms
Average final reward: -44.11 (std=8.17).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: epsilon, Backup: bellman


[2017-07-25 16:39:38,339] Making new env: Reacher-v1
[2017-07-25 16:39:38,344] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.81184967294 (std = 0.0).


[2017-07-25 17:00:24,754] Making new env: Reacher-v1


Average episode duration: 124.196095 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.2702240362 (std = 5.45837436323).


[2017-07-25 17:21:32,909] Making new env: Reacher-v1


Average episode duration: 126.373044 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -9.33666390548 (std = 5.22875479287).
Average episode duration: 124.038725 ms
Average final reward: -9.36 (std=1.82).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: epsilon, Backup: sparsebellman


[2017-07-25 17:42:27,823] Making new env: Reacher-v1
[2017-07-25 17:42:28,116] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.23009688787 (std = 0.0).


[2017-07-25 18:03:36,728] Making new env: Reacher-v1


Average episode duration: 126.316524 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.85799533118 (std = 0.372101556694).


[2017-07-25 18:24:23,376] Making new env: Reacher-v1


Average episode duration: 124.222511 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.94223190292 (std = 0.326340281047).
Average episode duration: 124.530943 ms
Average final reward: -5.95 (std=1.48).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: epsilon, Backup: softbellman


[2017-07-25 18:45:24,386] Making new env: Reacher-v1
[2017-07-25 18:45:24,392] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.89518081642 (std = 0.0).


[2017-07-25 19:06:14,745] Making new env: Reacher-v1


Average episode duration: 124.581107 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.83952396963 (std = 0.0556568467923).


[2017-07-25 19:27:10,969] Making new env: Reacher-v1


Average episode duration: 125.150285 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.11905462396 (std = 0.397919459791).
Average episode duration: 127.043450 ms
Average final reward: -6.15 (std=1.49).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: sparsemax, Backup: bellman


[2017-07-25 19:48:36,447] Making new env: Reacher-v1
[2017-07-25 19:48:36,452] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.29710819051 (std = 0.0).


[2017-07-25 20:14:52,108] Making new env: Reacher-v1


Average episode duration: 157.128442 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.95475784267 (std = 0.342350347841).


[2017-07-25 20:40:58,956] Making new env: Reacher-v1


Average episode duration: 156.233441 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.85114238131 (std = 0.315607617316).
Average episode duration: 155.605160 ms
Average final reward: -5.87 (std=1.75).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: sparsemax, Backup: sparsebellman


[2017-07-25 21:07:09,199] Making new env: Reacher-v1
[2017-07-25 21:07:09,204] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.31113757066 (std = 0.0).


[2017-07-25 21:33:07,008] Making new env: Reacher-v1


Average episode duration: 155.351503 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.94572670578 (std = 0.365410864888).


[2017-07-25 21:59:10,941] Making new env: Reacher-v1


Average episode duration: 155.931271 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -8.66964578125 (std = 3.86374002657).
Average episode duration: 155.108291 ms
Average final reward: -8.70 (std=2.47).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: sparsemax, Backup: softbellman


[2017-07-25 22:25:17,189] Making new env: Reacher-v1
[2017-07-25 22:25:17,194] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -8.10133393866 (std = 0.0).


[2017-07-25 22:51:26,175] Making new env: Reacher-v1


Average episode duration: 156.459764 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.01756797175 (std = 1.08376596691).


[2017-07-25 23:17:24,584] Making new env: Reacher-v1


Average episode duration: 155.404819 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.43630177174 (std = 1.20779685291).
Average episode duration: 155.365185 ms
Average final reward: -6.44 (std=1.96).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: softmax, Backup: bellman


[2017-07-25 23:43:32,721] Making new env: Reacher-v1
[2017-07-25 23:43:32,726] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.49224248578 (std = 0.0).


[2017-07-26 00:05:02,599] Making new env: Reacher-v1


Average episode duration: 128.551678 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.2691778356 (std = 0.223064650183).


[2017-07-26 00:26:16,479] Making new env: Reacher-v1


Average episode duration: 126.905896 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.31757241557 (std = 0.194566088638).
Average episode duration: 128.435706 ms
Average final reward: -5.35 (std=1.43).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: softmax, Backup: sparsebellman


[2017-07-26 00:47:55,277] Making new env: Reacher-v1
[2017-07-26 00:47:55,282] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.62616172753 (std = 0.0).


[2017-07-26 01:09:01,466] Making new env: Reacher-v1


Average episode duration: 126.103508 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.36729551693 (std = 0.258866210602).


[2017-07-26 01:30:00,527] Making new env: Reacher-v1


Average episode duration: 125.488756 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.46863666567 (std = 0.255371363935).
Average episode duration: 127.140335 ms
Average final reward: -5.49 (std=1.38).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.01, Strategy: softmax, Backup: softbellman


[2017-07-26 01:51:26,530] Making new env: Reacher-v1
[2017-07-26 01:51:26,535] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.58552989001 (std = 0.0).


[2017-07-26 02:13:02,903] Making new env: Reacher-v1


Average episode duration: 129.150356 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.58472555343 (std = 0.000804336584769).


[2017-07-26 02:34:16,255] Making new env: Reacher-v1


Average episode duration: 126.884680 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.64572407164 (std = 0.0862674315853).
Average episode duration: 128.692171 ms
Average final reward: -5.65 (std=1.60).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: epsilon, Backup: bellman


[2017-07-26 02:55:57,802] Making new env: Reacher-v1
[2017-07-26 02:55:57,807] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.75872574499 (std = 0.0).


[2017-07-26 03:16:56,241] Making new env: Reacher-v1


Average episode duration: 125.355014 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.57469416703 (std = 0.184031577957).


[2017-07-26 03:38:14,304] Making new env: Reacher-v1


Average episode duration: 127.369067 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.15855187136 (std = 0.839260419805).
Average episode duration: 124.191835 ms
Average final reward: -6.21 (std=1.76).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: epsilon, Backup: sparsebellman


[2017-07-26 03:59:10,823] Making new env: Reacher-v1
[2017-07-26 03:59:10,828] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.86637204609 (std = 0.0).


[2017-07-26 04:20:23,555] Making new env: Reacher-v1


Average episode duration: 126.776858 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.80017186951 (std = 0.0662001765822).


[2017-07-26 04:41:10,340] Making new env: Reacher-v1


Average episode duration: 124.217653 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.86689847049 (std = 0.108749808326).
Average episode duration: 126.444870 ms
Average final reward: -5.86 (std=1.54).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: epsilon, Backup: softbellman


[2017-07-26 05:02:29,491] Making new env: Reacher-v1
[2017-07-26 05:02:29,497] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.79369842178 (std = 0.0).


[2017-07-26 05:23:16,780] Making new env: Reacher-v1


Average episode duration: 124.257567 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.64483810605 (std = 0.148860315732).


[2017-07-26 05:44:31,679] Making new env: Reacher-v1


Average episode duration: 126.972614 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.73488521197 (std = 0.176039460458).
Average episode duration: 124.251048 ms
Average final reward: -5.74 (std=1.48).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: sparsemax, Backup: bellman


[2017-07-26 06:05:28,942] Making new env: Reacher-v1
[2017-07-26 06:05:28,947] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.54798659846 (std = 0.0).


[2017-07-26 06:31:35,726] Making new env: Reacher-v1


Average episode duration: 156.185901 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.48258139979 (std = 0.0654051986765).


[2017-07-26 06:57:42,496] Making new env: Reacher-v1


Average episode duration: 156.203072 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.53399543897 (std = 0.0902147448714).
Average episode duration: 157.384462 ms
Average final reward: -5.54 (std=1.11).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: sparsemax, Backup: sparsebellman


[2017-07-26 07:24:11,009] Making new env: Reacher-v1
[2017-07-26 07:24:11,014] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.64306529824 (std = 0.0).


[2017-07-26 07:50:12,844] Making new env: Reacher-v1


Average episode duration: 155.725737 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.62392216118 (std = 0.0191431370641).


[2017-07-26 08:16:13,044] Making new env: Reacher-v1


Average episode duration: 155.533697 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.73897828219 (std = 0.163462926773).
Average episode duration: 156.103276 ms
Average final reward: -5.74 (std=1.12).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: sparsemax, Backup: softbellman


[2017-07-26 08:42:28,195] Making new env: Reacher-v1
[2017-07-26 08:42:28,200] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.73030494355 (std = 0.0).


[2017-07-26 09:08:30,843] Making new env: Reacher-v1


Average episode duration: 155.829704 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.70616761766 (std = 0.0241373258888).


[2017-07-26 09:34:30,843] Making new env: Reacher-v1


Average episode duration: 155.499950 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.80673416093 (std = 0.143581566536).
Average episode duration: 156.131296 ms
Average final reward: -5.82 (std=1.57).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: softmax, Backup: bellman


[2017-07-26 10:00:47,123] Making new env: Reacher-v1
[2017-07-26 10:00:47,129] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -9.66533351422 (std = 0.0).


[2017-07-26 10:21:56,061] Making new env: Reacher-v1


Average episode duration: 126.417424 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -9.85128382105 (std = 0.185950306827).


[2017-07-26 10:43:22,497] Making new env: Reacher-v1


Average episode duration: 128.193133 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -9.95042906423 (std = 0.206666969375).
Average episode duration: 126.112275 ms
Average final reward: -9.95 (std=1.44).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: softmax, Backup: sparsebellman


[2017-07-26 11:04:38,377] Making new env: Reacher-v1
[2017-07-26 11:04:38,383] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -26.8561812578 (std = 0.0).


[2017-07-26 11:25:39,783] Making new env: Reacher-v1


Average episode duration: 125.642666 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -24.3340180209 (std = 2.5221632369).


[2017-07-26 11:46:59,923] Making new env: Reacher-v1


Average episode duration: 127.570228 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -22.3657588311 (std = 3.46250777218).
Average episode duration: 126.250950 ms
Average final reward: -22.38 (std=3.61).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 0.1, Strategy: softmax, Backup: softbellman


[2017-07-26 12:08:17,863] Making new env: Reacher-v1
[2017-07-26 12:08:17,869] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -22.40046569 (std = 0.0).


[2017-07-26 12:29:42,307] Making new env: Reacher-v1


Average episode duration: 127.967167 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -21.3797614574 (std = 1.02070423268).


[2017-07-26 12:50:40,685] Making new env: Reacher-v1


Average episode duration: 125.368814 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -22.4324492884 (std = 1.70612468078).
Average episode duration: 126.541320 ms
Average final reward: -22.45 (std=2.74).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: epsilon, Backup: bellman


[2017-07-26 13:12:00,958] Making new env: Reacher-v1
[2017-07-26 13:12:00,963] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.8983852644 (std = 0.0).


[2017-07-26 13:32:48,320] Making new env: Reacher-v1


Average episode duration: 124.285598 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.96649857905 (std = 1.06811331466).


[2017-07-26 13:53:43,062] Making new env: Reacher-v1


Average episode duration: 125.023863 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -6.51086258164 (std = 1.08433652198).
Average episode duration: 127.177537 ms
Average final reward: -6.50 (std=1.84).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: epsilon, Backup: sparsebellman


[2017-07-26 14:15:10,105] Making new env: Reacher-v1
[2017-07-26 14:15:10,110] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -5.78868134003 (std = 0.0).


[2017-07-26 14:36:02,125] Making new env: Reacher-v1


Average episode duration: 124.757097 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -5.85582218114 (std = 0.0671408411049).


[2017-07-26 14:57:15,224] Making new env: Reacher-v1


Average episode duration: 126.803708 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.59902071584 (std = 2.46586445786).
Average episode duration: 131.795589 ms
Average final reward: -7.59 (std=1.91).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: epsilon, Backup: softbellman


[2017-07-26 15:19:27,457] Making new env: Reacher-v1
[2017-07-26 15:19:27,462] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -6.07515351526 (std = 0.0).


[2017-07-26 15:40:23,446] Making new env: Reacher-v1


Average episode duration: 125.150689 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -6.00089921859 (std = 0.0742542966785).


[2017-07-26 16:01:43,000] Making new env: Reacher-v1


Average episode duration: 127.516013 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -5.90032270042 (std = 0.154619120409).
Average episode duration: 125.240830 ms
Average final reward: -5.91 (std=1.65).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: sparsemax, Backup: bellman


[2017-07-26 16:22:49,958] Making new env: Reacher-v1
[2017-07-26 16:22:50,236] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -7.9217436963 (std = 0.0).


[2017-07-26 16:49:12,853] Making new env: Reacher-v1


Average episode duration: 157.779695 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.73030190066 (std = 0.19144179564).


[2017-07-26 17:15:33,657] Making new env: Reacher-v1


Average episode duration: 157.637419 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.66821418472 (std = 0.17928490275).
Average episode duration: 157.956158 ms
Average final reward: -7.67 (std=1.21).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: sparsemax, Backup: sparsebellman


[2017-07-26 17:42:07,575] Making new env: Reacher-v1
[2017-07-26 17:42:07,593] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -7.68516514817 (std = 0.0).


[2017-07-26 18:08:28,808] Making new env: Reacher-v1


Average episode duration: 157.658696 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.9331615694 (std = 0.247996421235).


[2017-07-26 18:34:49,063] Making new env: Reacher-v1


Average episode duration: 157.552630 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.83304934165 (std = 0.247075696065).
Average episode duration: 158.699664 ms
Average final reward: -7.84 (std=1.14).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: sparsemax, Backup: softbellman


[2017-07-26 19:01:31,062] Making new env: Reacher-v1
[2017-07-26 19:01:31,067] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -8.00264612319 (std = 0.0).


[2017-07-26 19:27:52,919] Making new env: Reacher-v1


Average episode duration: 157.736696 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.88049498141 (std = 0.122151141778).


[2017-07-26 19:54:18,593] Making new env: Reacher-v1


Average episode duration: 158.104054 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.5085438996 (std = 3.71796039521).
Average episode duration: 156.701414 ms
Average final reward: -10.53 (std=1.96).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: softmax, Backup: bellman


[2017-07-26 20:20:40,620] Making new env: Reacher-v1
[2017-07-26 20:20:40,625] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -48.0429579665 (std = 0.0).


[2017-07-26 20:42:03,541] Making new env: Reacher-v1


Average episode duration: 127.783845 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -52.6824622532 (std = 4.63950428673).


[2017-07-26 21:03:06,417] Making new env: Reacher-v1


Average episode duration: 125.828873 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -51.8477294303 (std = 3.9678152413).
Average episode duration: 125.954182 ms
Average final reward: -51.72 (std=7.84).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: softmax, Backup: sparsebellman


[2017-07-26 21:24:20,054] Making new env: Reacher-v1
[2017-07-26 21:24:20,071] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -45.6555189378 (std = 0.0).


[2017-07-26 21:45:48,526] Making new env: Reacher-v1


Average episode duration: 128.380133 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -42.1490614461 (std = 3.50645749168).


[2017-07-26 22:06:57,787] Making new env: Reacher-v1


Average episode duration: 126.419398 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -47.9737519259 (std = 8.72071486741).
Average episode duration: 128.108928 ms
Average final reward: -48.03 (std=5.54).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 121, Temp: 1, Strategy: softmax, Backup: softbellman


[2017-07-26 22:28:33,659] Making new env: Reacher-v1
[2017-07-26 22:28:33,665] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -44.7977017086 (std = 0.0).


[2017-07-26 22:49:33,761] Making new env: Reacher-v1


Average episode duration: 125.539621 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -41.9137399065 (std = 2.88396180216).


[2017-07-26 23:10:34,126] Making new env: Reacher-v1


Average episode duration: 125.609120 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -43.1159015155 (std = 2.90434310172).
Average episode duration: 128.006789 ms
Average final reward: -43.12 (std=2.35).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: epsilon, Backup: bellman


[2017-07-26 23:32:09,245] Making new env: Reacher-v1
[2017-07-26 23:32:09,250] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.0695521163 (std = 0.0).


[2017-07-26 23:52:33,603] Making new env: Reacher-v1


Average episode duration: 121.957976 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.6615469859 (std = 0.408005130377).


[2017-07-27 00:13:54,058] Making new env: Reacher-v1


Average episode duration: 127.521699 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.5768718454 (std = 0.354003601357).
Average episode duration: 121.374211 ms
Average final reward: -10.57 (std=2.54).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: epsilon, Backup: sparsebellman


[2017-07-27 00:34:22,345] Making new env: Reacher-v1
[2017-07-27 00:34:22,351] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.552357898 (std = 0.0).


[2017-07-27 00:55:02,404] Making new env: Reacher-v1


Average episode duration: 123.539381 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.0768147605 (std = 0.475543137557).


[2017-07-27 01:15:27,807] Making new env: Reacher-v1


Average episode duration: 122.091416 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.9146274915 (std = 0.450965930979).
Average episode duration: 122.213079 ms
Average final reward: -10.93 (std=2.74).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: epsilon, Backup: softbellman


[2017-07-27 01:36:05,235] Making new env: Reacher-v1
[2017-07-27 01:36:05,505] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -10.8440013499 (std = 0.0).


[2017-07-27 01:56:24,863] Making new env: Reacher-v1


Average episode duration: 121.488569 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.0766182593 (std = 0.232616909466).


[2017-07-27 02:16:56,442] Making new env: Reacher-v1


Average episode duration: 122.766349 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.7964676464 (std = 0.439366001053).
Average episode duration: 117.313551 ms
Average final reward: -10.84 (std=2.63).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: sparsemax, Backup: bellman


[2017-07-27 02:36:42,921] Making new env: Reacher-v1
[2017-07-27 02:36:42,926] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -12.0505340529 (std = 0.0).


[2017-07-27 02:56:56,646] Making new env: Reacher-v1


Average episode duration: 120.979460 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.8341509598 (std = 0.216383093189).


[2017-07-27 03:17:16,445] Making new env: Reacher-v1


Average episode duration: 121.588383 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.5252211638 (std = 0.471263903018).
Average episode duration: 121.335267 ms
Average final reward: -11.56 (std=2.90).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: sparsemax, Backup: sparsebellman


[2017-07-27 03:37:45,944] Making new env: Reacher-v1
[2017-07-27 03:37:45,949] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -12.0082637768 (std = 0.0).


[2017-07-27 03:57:47,699] Making new env: Reacher-v1


Average episode duration: 119.786240 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.5714326065 (std = 0.436831170264).


[2017-07-27 04:17:55,389] Making new env: Reacher-v1


Average episode duration: 120.379107 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.29192581 (std = 0.532411882683).
Average episode duration: 121.388452 ms
Average final reward: -11.30 (std=2.91).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: sparsemax, Backup: softbellman


[2017-07-27 04:38:22,691] Making new env: Reacher-v1
[2017-07-27 04:38:22,696] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.303233073 (std = 0.0).


[2017-07-27 04:58:33,717] Making new env: Reacher-v1


Average episode duration: 120.709768 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.2783858393 (std = 0.0248472337379).


[2017-07-27 05:18:47,083] Making new env: Reacher-v1


Average episode duration: 120.947071 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.1402199262 (std = 0.196446504325).
Average episode duration: 120.152018 ms
Average final reward: -11.18 (std=2.79).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: softmax, Backup: bellman


[2017-07-27 05:39:02,047] Making new env: Reacher-v1
[2017-07-27 05:39:02,052] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.4197020377 (std = 0.0).


[2017-07-27 05:58:56,053] Making new env: Reacher-v1


Average episode duration: 119.018715 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.3791862331 (std = 0.0405158046036).


[2017-07-27 06:18:49,489] Making new env: Reacher-v1


Average episode duration: 118.946250 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.2672289337 (std = 0.161750509669).
Average episode duration: 118.845489 ms
Average final reward: -11.28 (std=2.96).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: softmax, Backup: sparsebellman


[2017-07-27 06:38:51,931] Making new env: Reacher-v1
[2017-07-27 06:38:51,936] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.2052803353 (std = 0.0).


[2017-07-27 06:58:39,424] Making new env: Reacher-v1


Average episode duration: 118.357001 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.1933202185 (std = 0.011960116807).


[2017-07-27 07:18:28,723] Making new env: Reacher-v1


Average episode duration: 118.537088 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.0740432301 (std = 0.16896556702).
Average episode duration: 118.430526 ms
Average final reward: -11.11 (std=2.83).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.01, Strategy: softmax, Backup: softbellman


[2017-07-27 07:38:26,455] Making new env: Reacher-v1
[2017-07-27 07:38:26,459] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -12.2822606431 (std = 0.0).


[2017-07-27 07:58:19,012] Making new env: Reacher-v1


Average episode duration: 118.865395 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.8035073352 (std = 0.478753307963).


[2017-07-27 08:18:02,176] Making new env: Reacher-v1


Average episode duration: 117.923042 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.5875739778 (std = 0.496041916444).
Average episode duration: 117.003528 ms
Average final reward: -11.62 (std=3.37).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: epsilon, Backup: bellman


[2017-07-27 08:37:45,767] Making new env: Reacher-v1
[2017-07-27 08:37:45,772] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.4438775959 (std = 0.0).


[2017-07-27 08:57:34,641] Making new env: Reacher-v1


Average episode duration: 118.494814 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.2427457211 (std = 0.201131874787).


[2017-07-27 09:17:12,951] Making new env: Reacher-v1


Average episode duration: 117.438994 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.0407642948 (std = 0.329488007601).
Average episode duration: 116.271792 ms
Average final reward: -11.05 (std=3.01).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: epsilon, Backup: sparsebellman


[2017-07-27 09:36:49,116] Making new env: Reacher-v1
[2017-07-27 09:36:49,121] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.8310316708 (std = 0.0).


[2017-07-27 09:56:15,578] Making new env: Reacher-v1


Average episode duration: 116.221704 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.2558200938 (std = 0.575211576951).


[2017-07-27 10:15:53,421] Making new env: Reacher-v1


Average episode duration: 117.385938 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.1977631969 (std = 0.476780989622).
Average episode duration: 117.150047 ms
Average final reward: -11.21 (std=2.75).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: epsilon, Backup: softbellman


[2017-07-27 10:35:38,500] Making new env: Reacher-v1
[2017-07-27 10:35:38,505] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.2217138624 (std = 0.0).


[2017-07-27 10:55:11,705] Making new env: Reacher-v1


Average episode duration: 116.898410 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.4376164349 (std = 0.215902572516).


[2017-07-27 11:14:47,427] Making new env: Reacher-v1


Average episode duration: 117.178552 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.2313476336 (std = 0.340836594237).
Average episode duration: 118.169723 ms
Average final reward: -11.27 (std=2.95).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: sparsemax, Backup: bellman


[2017-07-27 11:34:43,319] Making new env: Reacher-v1
[2017-07-27 11:34:43,324] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.0265458667 (std = 0.0).


[2017-07-27 11:55:05,920] Making new env: Reacher-v1


Average episode duration: 121.867474 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.6318781633 (std = 0.394667703481).


[2017-07-27 12:15:23,288] Making new env: Reacher-v1


Average episode duration: 121.339394 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.7470492068 (std = 0.361068510538).
Average episode duration: 120.436366 ms
Average final reward: -10.79 (std=2.80).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: sparsemax, Backup: sparsebellman


[2017-07-27 12:35:40,994] Making new env: Reacher-v1
[2017-07-27 12:35:41,271] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.3834149012 (std = 0.0).


[2017-07-27 12:55:54,099] Making new env: Reacher-v1


Average episode duration: 120.890286 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.281776463 (std = 0.101638438166).


[2017-07-27 13:16:03,789] Making new env: Reacher-v1


Average episode duration: 120.579833 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.2695012887 (std = 0.0847836927111).
Average episode duration: 121.031107 ms
Average final reward: -11.30 (std=2.72).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: sparsemax, Backup: softbellman


[2017-07-27 13:36:27,666] Making new env: Reacher-v1
[2017-07-27 13:36:27,671] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -10.6878090611 (std = 0.0).


[2017-07-27 13:56:49,051] Making new env: Reacher-v1


Average episode duration: 121.745413 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.3903260823 (std = 0.297482978828).


[2017-07-27 14:17:20,725] Making new env: Reacher-v1


Average episode duration: 122.773189 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.3782934199 (std = 0.243489188785).
Average episode duration: 121.617441 ms
Average final reward: -10.42 (std=2.71).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: softmax, Backup: bellman


[2017-07-27 14:37:50,338] Making new env: Reacher-v1
[2017-07-27 14:37:50,344] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -7.1804698629 (std = 0.0).


[2017-07-27 14:57:46,082] Making new env: Reacher-v1


Average episode duration: 119.171083 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.01106553546 (std = 0.16940432744).


[2017-07-27 15:17:50,677] Making new env: Reacher-v1


Average episode duration: 120.060839 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.09564917829 (std = 0.182867901428).
Average episode duration: 117.778144 ms
Average final reward: -7.10 (std=1.48).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: softmax, Backup: sparsebellman


[2017-07-27 15:37:42,050] Making new env: Reacher-v1
[2017-07-27 15:37:42,055] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -7.19665052441 (std = 0.0).


[2017-07-27 15:57:41,982] Making new env: Reacher-v1


Average episode duration: 119.594947 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.27726483833 (std = 0.0806143139134).


[2017-07-27 16:23:40,330] Making new env: Reacher-v1


Average episode duration: 155.294490 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.36593465084 (std = 0.141623149141).
Average episode duration: 155.557242 ms
Average final reward: -7.38 (std=1.46).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 0.1, Strategy: softmax, Backup: softbellman


[2017-07-27 16:49:53,199] Making new env: Reacher-v1
[2017-07-27 16:49:53,204] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -7.51787289961 (std = 0.0).


[2017-07-27 17:16:06,753] Making new env: Reacher-v1


Average episode duration: 156.847393 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.42509777383 (std = 0.0927751257894).


[2017-07-27 17:41:17,996] Making new env: Reacher-v1


Average episode duration: 150.598214 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.47346380979 (std = 0.102062217204).
Average episode duration: 156.484648 ms
Average final reward: -7.49 (std=1.50).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: epsilon, Backup: bellman


[2017-07-27 18:07:40,352] Making new env: Reacher-v1
[2017-07-27 18:07:40,358] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -10.5599955278 (std = 0.0).


[2017-07-27 18:33:35,336] Making new env: Reacher-v1


Average episode duration: 154.959885 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.8004300613 (std = 0.240434533545).


[2017-07-27 18:59:18,581] Making new env: Reacher-v1


Average episode duration: 153.811474 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.7131119126 (std = 0.231922605211).
Average episode duration: 154.883082 ms
Average final reward: -10.75 (std=2.73).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: epsilon, Backup: sparsebellman


[2017-07-27 19:25:24,810] Making new env: Reacher-v1
[2017-07-27 19:25:24,816] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -11.6179872569 (std = 0.0).


[2017-07-27 19:47:41,436] Making new env: Reacher-v1


Average episode duration: 133.229717 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -11.5251097202 (std = 0.0928775366847).


[2017-07-27 20:08:58,726] Making new env: Reacher-v1


Average episode duration: 127.270945 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -11.2081983747 (std = 0.454550796205).
Average episode duration: 118.825312 ms
Average final reward: -11.22 (std=2.66).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: epsilon, Backup: softbellman


[2017-07-27 20:29:00,669] Making new env: Reacher-v1
[2017-07-27 20:29:00,674] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -10.5466394687 (std = 0.0).


[2017-07-27 20:48:37,353] Making new env: Reacher-v1


Average episode duration: 117.278328 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -10.4871761311 (std = 0.0594633376146).


[2017-07-27 21:08:26,511] Making new env: Reacher-v1


Average episode duration: 118.522498 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -10.4896540628 (std = 0.048677913929).
Average episode duration: 125.749059 ms
Average final reward: -10.46 (std=2.44).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: sparsemax, Backup: bellman


[2017-07-27 21:29:41,886] Making new env: Reacher-v1
[2017-07-27 21:29:41,892] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -7.63641860328 (std = 0.0).


[2017-07-27 21:56:34,227] Making new env: Reacher-v1


Average episode duration: 160.704366 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.61430845975 (std = 0.0221101435351).


[2017-07-27 22:23:22,491] Making new env: Reacher-v1


Average episode duration: 160.313012 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.6266285421 (std = 0.0250893301947).
Average episode duration: 160.468657 ms
Average final reward: -7.65 (std=1.56).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: sparsemax, Backup: sparsebellman


[2017-07-27 22:50:25,028] Making new env: Reacher-v1
[2017-07-27 22:50:25,035] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -7.53657519319 (std = 0.0).


[2017-07-27 23:13:27,548] Making new env: Reacher-v1


Average episode duration: 137.818489 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.47878163515 (std = 0.0577935580424).


[2017-07-27 23:33:46,751] Making new env: Reacher-v1


Average episode duration: 121.528424 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.64925064122 (std = 0.245654420466).
Average episode duration: 121.838450 ms
Average final reward: -7.67 (std=1.56).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: sparsemax, Backup: softbellman


[2017-07-27 23:54:18,833] Making new env: Reacher-v1
[2017-07-27 23:54:18,839] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -7.32916286391 (std = 0.0).


[2017-07-28 00:14:43,208] Making new env: Reacher-v1


Average episode duration: 122.052488 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -7.50767558151 (std = 0.178512717605).


[2017-07-28 00:34:55,408] Making new env: Reacher-v1


Average episode duration: 120.823454 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -7.56741761747 (std = 0.168471803629).
Average episode duration: 120.450728 ms
Average final reward: -7.59 (std=1.60).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: softmax, Backup: bellman


[2017-07-28 00:55:13,395] Making new env: Reacher-v1
[2017-07-28 00:55:13,400] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -50.3138774277 (std = 0.0).


[2017-07-28 01:15:08,358] Making new env: Reacher-v1


Average episode duration: 119.100883 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -51.6206812469 (std = 1.30680381918).


[2017-07-28 01:34:55,519] Making new env: Reacher-v1


Average episode duration: 118.327086 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -51.3498450506 (std = 1.13366454696).
Average episode duration: 119.290622 ms
Average final reward: -51.36 (std=3.80).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: softmax, Backup: sparsebellman


[2017-07-28 01:55:02,339] Making new env: Reacher-v1
[2017-07-28 01:55:02,344] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -51.6584503769 (std = 0.0).


[2017-07-28 02:14:54,267] Making new env: Reacher-v1


Average episode duration: 118.802431 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -51.7691717475 (std = 0.110721370658).


[2017-07-28 02:34:52,884] Making new env: Reacher-v1


Average episode duration: 119.477298 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -52.0161903378 (std = 0.360845094853).
Average episode duration: 118.694101 ms
Average final reward: -51.97 (std=3.58).

The 100-episode moving average reached -3.5 after 0 episodes.
Problem: Reacher-v1, Actions: 9, Temp: 1, Strategy: softmax, Backup: softbellman


[2017-07-28 02:54:53,161] Making new env: Reacher-v1
[2017-07-28 02:54:53,165] Making new env: Reacher-v1



EXECUTING EXPERIMENT 0 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 1 experiment: -52.0880583806 (std = 0.0).


[2017-07-28 03:14:46,582] Making new env: Reacher-v1


Average episode duration: 118.948146 ms

EXECUTING EXPERIMENT 1 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 2 experiments: -52.2711900591 (std = 0.183131678407).


[2017-07-28 03:34:41,813] Making new env: Reacher-v1


Average episode duration: 119.129424 ms

EXECUTING EXPERIMENT 2 OF 3 IN ENVIRONMENT Reacher-v1.
Final mean reward, averaged over 3 experiments: -52.2286458816 (std = 0.161177402756).
Average episode duration: 118.827409 ms
Average final reward: -52.19 (std=3.95).

The 100-episode moving average reached -3.5 after 0 episodes.
Reacher-v1 is finished and is saved
