# Sample Workflow for d3rlpy Experiments

In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import math
import subprocess
import os
import d3rlpy
plt.style.use('matplotlibrc')

from Python.data_sampler import *

## Building an MDPDataset

We first read in a large batch of samples from the file. As `d3rlpy` wants it in the form (observations, actions, rewards, terminal flags), we go ahead and do that. Here's a helper function to get a dataset from a list of chunks of your choosing.

In [163]:
def get_dataset(chunks : list, batch_size=30000, 
                path="collected_data/rl_det_small.txt") -> d3rlpy.dataset.MDPDataset :
    random.seed(0)
    samples = DataSampler(path_to_data=path)
    states = []
    actions = []
    rewards = []
    next_states = []
    for chunk in chunks:
        samples.use_chunk(chunk)
        samples.read_chunk()
        [statesChunk, actionsChunk, rewardsChunk, nextStatesChunk] = samples.get_batch(batch_size)
        states.append(statesChunk)
        actions.append(actionsChunk)
        rewards.append(rewardsChunk)
        next_states.append(nextStatesChunk)
    states = torch.cat(states)
    actions = torch.cat(actions)
    rewards = torch.cat(rewards)
    next_states = torch.cat(next_states)
    terminals = np.zeros(len(states))
    terminals[::100] = 1 #episode length 100, change if necessary
    print(states.shape)
    dataset = d3rlpy.dataset.MDPDataset(states.numpy(), 
                                        actions.numpy(), 
                                        rewards.numpy(), terminals)
    return dataset

We can build the dataset from there, just like this, and split into train and test sets.

In [164]:
dataset = get_dataset([3,5,7,9], path="collected_data/rl_det_small.txt")

[ 0.00000000e+00  7.95731469e+08 -8.17891077e-02 -1.19999531e-03
  7.39998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.09713430e-01 -2.63658359e-01  6.00000000e-01]
Read chunk # 4 out of 10000
[ 0.00000000e+00  7.95731469e+08  1.24610892e-01  2.40000469e-03
 -7.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.20016566e-01  3.79282423e-01 -6.00000000e-01]
Read chunk # 6 out of 10000
[ 0.00000000e+00  7.95731469e+08 -9.01891077e-02  1.08000047e-02
  3.99986580e-04  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -3.17973088e-02 -2.40776052e-01  6.00000000e-01]
Read chunk # 8 out of 10000
[ 0.00000000e+00  7.95731469e+08  6.91108923e-02 -5.99999531e-03
 -6.00001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -1.42355434e-01  2.22081792e-01 -6.00000000e-01]
Read chunk # 10 out of 10000
torch.Size([111080, 6])


In [165]:
print("The behavior policy value statistics are:")
dataset.compute_stats()['return']

The behavior policy value statistics are:


{'mean': -4.1227446,
 'std': 2.4676569,
 'min': -12.578855,
 'max': 0.0,
 'histogram': (array([ 26,   9,   7,   7,   8,   7,  10,  13,  27,  54,  56,  73, 109,
          84, 186, 148, 124,  83,  67,  13]),
  array([-12.578855 , -11.949912 , -11.320969 , -10.692026 , -10.063084 ,
          -9.434141 ,  -8.805199 ,  -8.176255 ,  -7.5473127,  -6.9183702,
          -6.2894273,  -5.6604843,  -5.031542 ,  -4.4025993,  -3.7736564,
          -3.1447136,  -2.515771 ,  -1.8868282,  -1.2578855,  -0.6289427,
           0.       ], dtype=float32))}

In [166]:
from sklearn.model_selection import train_test_split
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

## Setting up an Algorithm

In [167]:
from d3rlpy.algos import CQL
model = CQL(q_func_factory='mean', #qr -> quantile regression q function, but you don't have to use this
            reward_scaler='standard',
          actor_learning_rate=1e-5, 
          critic_learning_rate=0.0003, 
            use_gpu=False) #change it to true if you have one
model.build_with_dataset(dataset)

In [168]:
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer

# calculate metrics with test dataset
ave_error_init = average_value_estimation_scorer(model, test_episodes)
print(ave_error_init)

0.005242250281842394


In [150]:
%load_ext tensorboard
%tensorboard --logdir runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 77774), started 0:01:16 ago. (Use '!kill 77774' to kill it.)

In [169]:
model.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=5, 
        tensorboard_dir='runs',
        scorers={
            'td_error': td_error_scorer,
            'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer
        })

2022-04-06 21:00.09 [debug    ] RoundIterator is selected.
2022-04-06 21:00.09 [info     ] Directory is created at d3rlpy_logs/CQL_20220406210009
2022-04-06 21:00.09 [debug    ] Fitting reward scaler...       reward_scaler=standard
2022-04-06 21:00.09 [info     ] Parameters are saved to d3rlpy_logs/CQL_20220406210009/params.json params={'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'actor_learning_rate': 1e-05, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_learning_rate': 0.0001, 'alpha_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_threshold': 10.0, 'batch_size': 256, 'conservative_weight': 5.0, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_le

Epoch 1/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 21:00.33 [info     ] CQL_20220406210009: epoch=1 step=343 epoch=1 metrics={'time_sample_batch': 0.0003157273673455152, 'time_algorithm_update': 0.06868734651682329, 'temp_loss': 4.5058160120127155, 'temp': 0.9832170249421812, 'alpha_loss': -7.1547618615807576, 'alpha': 1.0126195798462403, 'critic_loss': 8.009437349725395, 'actor_loss': 0.6563751490965877, 'time_step': 0.06907681195450942, 'td_error': 0.998184895395473, 'init_value': 0.32236340641975403, 'ave_value': 0.31981140022080984} step=343
2022-04-06 21:00.33 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406210009/model_343.pt


Epoch 2/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 21:00.58 [info     ] CQL_20220406210009: epoch=2 step=686 epoch=2 metrics={'time_sample_batch': 0.0003133056462679938, 'time_algorithm_update': 0.06994466392361388, 'temp_loss': 1.9120248182521964, 'temp': 0.9586843066243319, 'alpha_loss': 7.930571129301199, 'alpha': 1.0036906923218998, 'critic_loss': -7.2960928539492995, 'actor_loss': 2.731188000464926, 'time_step': 0.07033466389158377, 'td_error': 1.5712441166613447, 'init_value': 0.6769112944602966, 'ave_value': 0.6663637833781404} step=686
2022-04-06 21:00.58 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406210009/model_686.pt


Epoch 3/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 21:01.25 [info     ] CQL_20220406210009: epoch=3 step=1029 epoch=3 metrics={'time_sample_batch': 0.0003292922028299671, 'time_algorithm_update': 0.07761665752955846, 'temp_loss': 0.43796961225905934, 'temp': 0.9482044304772647, 'alpha_loss': 13.75975546544912, 'alpha': 0.9639594004036038, 'critic_loss': -12.833585374904443, 'actor_loss': 3.5213696525673823, 'time_step': 0.07802717609238694, 'td_error': 2.651116823202267, 'init_value': 0.4121234118938446, 'ave_value': 0.3942178735878973} step=1029
2022-04-06 21:01.25 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406210009/model_1029.pt


Epoch 4/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 21:01.52 [info     ] CQL_20220406210009: epoch=4 step=1372 epoch=4 metrics={'time_sample_batch': 0.00032186994747239716, 'time_algorithm_update': 0.07587018096412236, 'temp_loss': -0.3027828116290293, 'temp': 0.9479728283757024, 'alpha_loss': 16.860179915025004, 'alpha': 0.9206059844431307, 'critic_loss': -15.363678295480266, 'actor_loss': 4.954739303004985, 'time_step': 0.07627053163489517, 'td_error': 4.112356028059688, 'init_value': -1.096514344215393, 'ave_value': -1.1222196783244935} step=1372
2022-04-06 21:01.52 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406210009/model_1372.pt


Epoch 5/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 21:02.18 [info     ] CQL_20220406210009: epoch=5 step=1715 epoch=5 metrics={'time_sample_batch': 0.00032411580878166, 'time_algorithm_update': 0.0754263831992191, 'temp_loss': -0.8014757160542658, 'temp': 0.9576302907557251, 'alpha_loss': 18.4448027541269, 'alpha': 0.8810043835431425, 'critic_loss': -15.968879897114835, 'actor_loss': 6.788558832062924, 'time_step': 0.0758308836044445, 'td_error': 5.324599808256394, 'init_value': -2.0982847213745117, 'ave_value': -2.1331112435323885} step=1715
2022-04-06 21:02.18 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406210009/model_1715.pt


[(1,
  {'time_sample_batch': 0.0003157273673455152,
   'time_algorithm_update': 0.06868734651682329,
   'temp_loss': 4.5058160120127155,
   'temp': 0.9832170249421812,
   'alpha_loss': -7.1547618615807576,
   'alpha': 1.0126195798462403,
   'critic_loss': 8.009437349725395,
   'actor_loss': 0.6563751490965877,
   'time_step': 0.06907681195450942,
   'td_error': 0.998184895395473,
   'init_value': 0.32236340641975403,
   'ave_value': 0.31981140022080984}),
 (2,
  {'time_sample_batch': 0.0003133056462679938,
   'time_algorithm_update': 0.06994466392361388,
   'temp_loss': 1.9120248182521964,
   'temp': 0.9586843066243319,
   'alpha_loss': 7.930571129301199,
   'alpha': 1.0036906923218998,
   'critic_loss': -7.2960928539492995,
   'actor_loss': 2.731188000464926,
   'time_step': 0.07033466389158377,
   'td_error': 1.5712441166613447,
   'init_value': 0.6769112944602966,
   'ave_value': 0.6663637833781404}),
 (3,
  {'time_sample_batch': 0.0003292922028299671,
   'time_algorithm_update': 0.

## Off-Policy Evaluation

We do get some metrics on a test set of initial state value and average value. However, these estimates (using the critic's Q-function) of model performance are biased. They're useful for validation during training, but not much else. Instead, we fit a Q-function to the data (or a separate dataset, as I've done here) separately and evaluate the model's performance on it.

Feel free to change the chunks and number of steps.

In [171]:
from d3rlpy.ope import FQE
# metrics to evaluate with
from d3rlpy.metrics.scorer import soft_opc_scorer


ope_dataset = get_dataset([2,4,6,8], path="collected_data/rl_det_small.txt") #change if you'd prefer different chunks
ope_train_episodes, ope_test_episodes = train_test_split(ope_dataset, test_size=0.2)

fqe = FQE(algo=model, use_gpu=False) #change this if you have one!
fqe.fit(ope_train_episodes, eval_episodes=ope_test_episodes,
        tensorboard_dir='runs',
        n_epochs=50, n_steps_per_epoch=1000, #change if overfitting/underfitting
        scorers={
           'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer,
           'soft_opc': soft_opc_scorer(return_threshold=0)
        })

[ 0.00000000e+00  7.95731469e+08 -1.03891077e-02 -1.41999953e-02
 -2.10001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -1.78778459e-03 -1.34615461e-02  4.84073546e-02]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.24891077e-02 -1.35999953e-02
 -4.20001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -6.23311010e-02 -1.64283998e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08  7.01089229e-03 -4.19999531e-03
  7.39998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.21623335e-01 -2.86362315e-02 -8.00043364e-02]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.03989108e-01 -1.37999953e-02
  7.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.76352555e-01 -3.26280816e-01  6.00000000e-01]
Read chunk # 9 out of 10000
torch.Size([111080, 6])
2022-04-06 21:04.24 [debug    ] RoundIterator is selected.
2022-04-06 21:04.24 [info     ] Directory is created at d3rlpy_logs/FQE_2022040621042

Epoch 1/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.26 [info     ] FQE_20220406210424: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.00013119403213466218, 'time_algorithm_update': 0.0017099345192007702, 'loss': 0.0007221336903229773, 'time_step': 0.001900675084976509, 'init_value': -0.3511500060558319, 'ave_value': -0.3517933719735239, 'soft_opc': nan} step=878




2022-04-06 21:04.26 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_878.pt


Epoch 2/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.28 [info     ] FQE_20220406210424: epoch=2 step=1756 epoch=2 metrics={'time_sample_batch': 0.00013680067040654142, 'time_algorithm_update': 0.0018210438226513004, 'loss': 0.0021344184700107043, 'time_step': 0.0020162573164850812, 'init_value': -0.571056067943573, 'ave_value': -0.5718733890903176, 'soft_opc': nan} step=1756




2022-04-06 21:04.28 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_1756.pt


Epoch 3/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.30 [info     ] FQE_20220406210424: epoch=3 step=2634 epoch=3 metrics={'time_sample_batch': 0.00013056404226733233, 'time_algorithm_update': 0.0017468429643635326, 'loss': 0.004262366834274979, 'time_step': 0.0019355219426079057, 'init_value': -0.7475205659866333, 'ave_value': -0.7482810573862553, 'soft_opc': nan} step=2634




2022-04-06 21:04.30 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_2634.pt


Epoch 4/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.32 [info     ] FQE_20220406210424: epoch=4 step=3512 epoch=4 metrics={'time_sample_batch': 0.0001283913918671141, 'time_algorithm_update': 0.0017910381112935059, 'loss': 0.006516197041351099, 'time_step': 0.0019766616386813294, 'init_value': -0.8437454700469971, 'ave_value': -0.8445460495133652, 'soft_opc': nan} step=3512




2022-04-06 21:04.32 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_3512.pt


Epoch 5/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.34 [info     ] FQE_20220406210424: epoch=5 step=4390 epoch=5 metrics={'time_sample_batch': 0.00013289717715530568, 'time_algorithm_update': 0.0017589827604880366, 'loss': 0.008994538742276776, 'time_step': 0.001953979016436531, 'init_value': -0.9731312990188599, 'ave_value': -0.9740022924311185, 'soft_opc': nan} step=4390




2022-04-06 21:04.34 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_4390.pt


Epoch 6/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.36 [info     ] FQE_20220406210424: epoch=6 step=5268 epoch=6 metrics={'time_sample_batch': 0.00012393285581897224, 'time_algorithm_update': 0.0016291079054117745, 'loss': 0.009836238872640127, 'time_step': 0.001808185240673856, 'init_value': -1.036438226699829, 'ave_value': -1.03704827191276, 'soft_opc': nan} step=5268




2022-04-06 21:04.36 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_5268.pt


Epoch 7/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.37 [info     ] FQE_20220406210424: epoch=7 step=6146 epoch=7 metrics={'time_sample_batch': 0.00012213575541294245, 'time_algorithm_update': 0.001618726106873949, 'loss': 0.012670549119848934, 'time_step': 0.0017904312029514877, 'init_value': -1.2273365259170532, 'ave_value': -1.228170389840637, 'soft_opc': nan} step=6146




2022-04-06 21:04.37 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_6146.pt


Epoch 8/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.39 [info     ] FQE_20220406210424: epoch=8 step=7024 epoch=8 metrics={'time_sample_batch': 0.00012230330013196942, 'time_algorithm_update': 0.0016036364918146156, 'loss': 0.016057082347277728, 'time_step': 0.0017788564968760844, 'init_value': -1.256137728691101, 'ave_value': -1.2566675287839073, 'soft_opc': nan} step=7024




2022-04-06 21:04.39 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_7024.pt


Epoch 9/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.41 [info     ] FQE_20220406210424: epoch=9 step=7902 epoch=9 metrics={'time_sample_batch': 0.00012487756907261042, 'time_algorithm_update': 0.0018262214160997396, 'loss': 0.018320463527612893, 'time_step': 0.0020039306957792312, 'init_value': -1.395872712135315, 'ave_value': -1.3966702248445697, 'soft_opc': nan} step=7902




2022-04-06 21:04.41 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_7902.pt


Epoch 10/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.43 [info     ] FQE_20220406210424: epoch=10 step=8780 epoch=10 metrics={'time_sample_batch': 0.00012048963533718657, 'time_algorithm_update': 0.0015954770368432672, 'loss': 0.021919277143855557, 'time_step': 0.0017659493079218071, 'init_value': -1.4761677980422974, 'ave_value': -1.4767968806850256, 'soft_opc': nan} step=8780




2022-04-06 21:04.43 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_8780.pt


Epoch 11/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.45 [info     ] FQE_20220406210424: epoch=11 step=9658 epoch=11 metrics={'time_sample_batch': 0.00012059771118511644, 'time_algorithm_update': 0.001677154408500515, 'loss': 0.020398359848043505, 'time_step': 0.0018495777473362812, 'init_value': -1.367793083190918, 'ave_value': -1.3680608264101952, 'soft_opc': nan} step=9658




2022-04-06 21:04.45 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_9658.pt


Epoch 12/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.47 [info     ] FQE_20220406210424: epoch=12 step=10536 epoch=12 metrics={'time_sample_batch': 0.00011914276044841236, 'time_algorithm_update': 0.001615342626810617, 'loss': 0.020014513410849734, 'time_step': 0.001782447167572508, 'init_value': -1.3977160453796387, 'ave_value': -1.3977878215617825, 'soft_opc': nan} step=10536




2022-04-06 21:04.47 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_10536.pt


Epoch 13/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.48 [info     ] FQE_20220406210424: epoch=13 step=11414 epoch=13 metrics={'time_sample_batch': 0.0001171593785557497, 'time_algorithm_update': 0.001541713647255865, 'loss': 0.020969701635255404, 'time_step': 0.0017067272733718767, 'init_value': -1.3781163692474365, 'ave_value': -1.378278293006796, 'soft_opc': nan} step=11414




2022-04-06 21:04.48 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_11414.pt


Epoch 14/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.50 [info     ] FQE_20220406210424: epoch=14 step=12292 epoch=14 metrics={'time_sample_batch': 0.00011826349010771662, 'time_algorithm_update': 0.001582935350630984, 'loss': 0.020576597018191606, 'time_step': 0.001751461713352073, 'init_value': -1.3994345664978027, 'ave_value': -1.399671371877207, 'soft_opc': nan} step=12292




2022-04-06 21:04.50 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_12292.pt


Epoch 15/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.52 [info     ] FQE_20220406210424: epoch=15 step=13170 epoch=15 metrics={'time_sample_batch': 0.0001205721857335953, 'time_algorithm_update': 0.0016254577658442538, 'loss': 0.02208385107254952, 'time_step': 0.0017968348327150106, 'init_value': -1.529837965965271, 'ave_value': -1.5301942070492607, 'soft_opc': nan} step=13170




2022-04-06 21:04.52 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_13170.pt


Epoch 16/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.54 [info     ] FQE_20220406210424: epoch=16 step=14048 epoch=16 metrics={'time_sample_batch': 0.00012302751693204215, 'time_algorithm_update': 0.001650479225471514, 'loss': 0.02401668686485242, 'time_step': 0.0018256104345473721, 'init_value': -1.447545051574707, 'ave_value': -1.4476827883216066, 'soft_opc': nan} step=14048




2022-04-06 21:04.54 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_14048.pt


Epoch 17/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.56 [info     ] FQE_20220406210424: epoch=17 step=14926 epoch=17 metrics={'time_sample_batch': 0.00012664860093240587, 'time_algorithm_update': 0.0017293219142731338, 'loss': 0.023714415205214213, 'time_step': 0.0019118332374068762, 'init_value': -1.5350730419158936, 'ave_value': -1.5351780139298934, 'soft_opc': nan} step=14926




2022-04-06 21:04.56 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_14926.pt


Epoch 18/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.58 [info     ] FQE_20220406210424: epoch=18 step=15804 epoch=18 metrics={'time_sample_batch': 0.00011845873265711756, 'time_algorithm_update': 0.0016062946688884484, 'loss': 0.02491694912097628, 'time_step': 0.0017723890534837587, 'init_value': -1.6172972917556763, 'ave_value': -1.6174731222989363, 'soft_opc': nan} step=15804




2022-04-06 21:04.58 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_15804.pt


Epoch 19/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.59 [info     ] FQE_20220406210424: epoch=19 step=16682 epoch=19 metrics={'time_sample_batch': 0.0001194754059602568, 'time_algorithm_update': 0.0015353887662257583, 'loss': 0.027933103102488886, 'time_step': 0.001705221543279487, 'init_value': -1.6593756675720215, 'ave_value': -1.6596358832028075, 'soft_opc': nan} step=16682




2022-04-06 21:04.59 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_16682.pt


Epoch 20/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.01 [info     ] FQE_20220406210424: epoch=20 step=17560 epoch=20 metrics={'time_sample_batch': 0.00011600747466901983, 'time_algorithm_update': 0.0014790777193386625, 'loss': 0.027865742214348517, 'time_step': 0.0016430358408795402, 'init_value': -1.6144261360168457, 'ave_value': -1.6144295136646802, 'soft_opc': nan} step=17560




2022-04-06 21:05.01 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_17560.pt


Epoch 21/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.03 [info     ] FQE_20220406210424: epoch=21 step=18438 epoch=21 metrics={'time_sample_batch': 0.00011446752960969756, 'time_algorithm_update': 0.0014680792367539807, 'loss': 0.028945429940587457, 'time_step': 0.0016306826085327427, 'init_value': -1.700770378112793, 'ave_value': -1.700962106813397, 'soft_opc': nan} step=18438




2022-04-06 21:05.03 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_18438.pt


Epoch 22/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.05 [info     ] FQE_20220406210424: epoch=22 step=19316 epoch=22 metrics={'time_sample_batch': 0.00011903251222162964, 'time_algorithm_update': 0.0016463965109649171, 'loss': 0.03065991475358008, 'time_step': 0.0018147862853656325, 'init_value': -1.7784498929977417, 'ave_value': -1.7784126950016583, 'soft_opc': nan} step=19316




2022-04-06 21:05.05 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_19316.pt


Epoch 23/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.07 [info     ] FQE_20220406210424: epoch=23 step=20194 epoch=23 metrics={'time_sample_batch': 0.00011940969149995774, 'time_algorithm_update': 0.0016962564073010839, 'loss': 0.03051980458473647, 'time_step': 0.0018691231829701903, 'init_value': -1.7207039594650269, 'ave_value': -1.7209208543282162, 'soft_opc': nan} step=20194




2022-04-06 21:05.07 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_20194.pt


Epoch 24/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.08 [info     ] FQE_20220406210424: epoch=24 step=21072 epoch=24 metrics={'time_sample_batch': 0.00012046166595945597, 'time_algorithm_update': 0.0015733948058039289, 'loss': 0.02941663231342695, 'time_step': 0.0017457450984007677, 'init_value': -1.7185962200164795, 'ave_value': -1.7188698721168514, 'soft_opc': nan} step=21072




2022-04-06 21:05.08 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_21072.pt


Epoch 25/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.10 [info     ] FQE_20220406210424: epoch=25 step=21950 epoch=25 metrics={'time_sample_batch': 0.00012113293102499021, 'time_algorithm_update': 0.0016243427923980226, 'loss': 0.030682443890456153, 'time_step': 0.0017981390746687974, 'init_value': -1.7069560289382935, 'ave_value': -1.7070729736347219, 'soft_opc': nan} step=21950




2022-04-06 21:05.10 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_21950.pt


Epoch 26/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.12 [info     ] FQE_20220406210424: epoch=26 step=22828 epoch=26 metrics={'time_sample_batch': 0.00011701763083560048, 'time_algorithm_update': 0.0015407360767720772, 'loss': 0.03071431880006932, 'time_step': 0.0017082930154300768, 'init_value': -1.6983513832092285, 'ave_value': -1.6982976140311572, 'soft_opc': nan} step=22828




2022-04-06 21:05.12 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_22828.pt


Epoch 27/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.14 [info     ] FQE_20220406210424: epoch=27 step=23706 epoch=27 metrics={'time_sample_batch': 0.00011946916037105482, 'time_algorithm_update': 0.001586635183364764, 'loss': 0.029897181798683017, 'time_step': 0.0017590598799373132, 'init_value': -1.6948908567428589, 'ave_value': -1.6946368522497604, 'soft_opc': nan} step=23706




2022-04-06 21:05.14 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_23706.pt


Epoch 28/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.16 [info     ] FQE_20220406210424: epoch=28 step=24584 epoch=28 metrics={'time_sample_batch': 0.00012102349744027731, 'time_algorithm_update': 0.0016722657414395066, 'loss': 0.030481589777129912, 'time_step': 0.001844713247990011, 'init_value': -1.7193212509155273, 'ave_value': -1.7187814501638932, 'soft_opc': nan} step=24584




2022-04-06 21:05.16 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_24584.pt


Epoch 29/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.18 [info     ] FQE_20220406210424: epoch=29 step=25462 epoch=29 metrics={'time_sample_batch': 0.000130679178346534, 'time_algorithm_update': 0.0017595926558509774, 'loss': 0.030317154146846687, 'time_step': 0.0019481956008355003, 'init_value': -1.688328742980957, 'ave_value': -1.6880791452303903, 'soft_opc': nan} step=25462




2022-04-06 21:05.18 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_25462.pt


Epoch 30/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.20 [info     ] FQE_20220406210424: epoch=30 step=26340 epoch=30 metrics={'time_sample_batch': 0.00012090510279279635, 'time_algorithm_update': 0.001762576418205383, 'loss': 0.028747578993919193, 'time_step': 0.001934346414101151, 'init_value': -1.6367226839065552, 'ave_value': -1.636217466656991, 'soft_opc': nan} step=26340




2022-04-06 21:05.20 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_26340.pt


Epoch 31/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.22 [info     ] FQE_20220406210424: epoch=31 step=27218 epoch=31 metrics={'time_sample_batch': 0.0001296263892449659, 'time_algorithm_update': 0.0017609273111087044, 'loss': 0.028397038893813165, 'time_step': 0.001948844327470436, 'init_value': -1.6786599159240723, 'ave_value': -1.6786507981710257, 'soft_opc': nan} step=27218




2022-04-06 21:05.22 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_27218.pt


Epoch 32/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.24 [info     ] FQE_20220406210424: epoch=32 step=28096 epoch=32 metrics={'time_sample_batch': 0.00013689326805514464, 'time_algorithm_update': 0.001863505682804047, 'loss': 0.028813698223223463, 'time_step': 0.0020612174123186187, 'init_value': -1.6172575950622559, 'ave_value': -1.6172525389646877, 'soft_opc': nan} step=28096




2022-04-06 21:05.24 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_28096.pt


Epoch 33/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.25 [info     ] FQE_20220406210424: epoch=33 step=28974 epoch=33 metrics={'time_sample_batch': 0.00012517735735430532, 'time_algorithm_update': 0.0016660073895117687, 'loss': 0.030148957001178037, 'time_step': 0.0018464538665458662, 'init_value': -1.7241008281707764, 'ave_value': -1.7243257318063778, 'soft_opc': nan} step=28974




2022-04-06 21:05.25 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_28974.pt


Epoch 34/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.27 [info     ] FQE_20220406210424: epoch=34 step=29852 epoch=34 metrics={'time_sample_batch': 0.00012571094791003946, 'time_algorithm_update': 0.0016877414968399361, 'loss': 0.03072206311165936, 'time_step': 0.0018694585439556009, 'init_value': -1.7745884656906128, 'ave_value': -1.774942820420023, 'soft_opc': nan} step=29852




2022-04-06 21:05.27 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_29852.pt


Epoch 35/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.29 [info     ] FQE_20220406210424: epoch=35 step=30730 epoch=35 metrics={'time_sample_batch': 0.00011966304518367272, 'time_algorithm_update': 0.001569117663390001, 'loss': 0.033908389572973076, 'time_step': 0.0017401653433171927, 'init_value': -1.7972095012664795, 'ave_value': -1.7974182660960794, 'soft_opc': nan} step=30730




2022-04-06 21:05.29 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_30730.pt


Epoch 36/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.31 [info     ] FQE_20220406210424: epoch=36 step=31608 epoch=36 metrics={'time_sample_batch': 0.00012220228451531134, 'time_algorithm_update': 0.0016442496575635767, 'loss': 0.03280124183748203, 'time_step': 0.0018203261229877863, 'init_value': -1.7945419549942017, 'ave_value': -1.7946014137551443, 'soft_opc': nan} step=31608




2022-04-06 21:05.31 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_31608.pt


Epoch 37/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.33 [info     ] FQE_20220406210424: epoch=37 step=32486 epoch=37 metrics={'time_sample_batch': 0.00011819940493155721, 'time_algorithm_update': 0.0015718157579252552, 'loss': 0.03370103474006842, 'time_step': 0.0017402424627664692, 'init_value': -1.8328590393066406, 'ave_value': -1.832728560127879, 'soft_opc': nan} step=32486




2022-04-06 21:05.33 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_32486.pt


Epoch 38/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.35 [info     ] FQE_20220406210424: epoch=38 step=33364 epoch=38 metrics={'time_sample_batch': 0.00012518224520672428, 'time_algorithm_update': 0.0018510631113769253, 'loss': 0.03466861684040714, 'time_step': 0.0020357106975651003, 'init_value': -1.8051323890686035, 'ave_value': -1.8047191533493965, 'soft_opc': nan} step=33364




2022-04-06 21:05.35 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_33364.pt


Epoch 39/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.37 [info     ] FQE_20220406210424: epoch=39 step=34242 epoch=39 metrics={'time_sample_batch': 0.00012213303993937637, 'time_algorithm_update': 0.0016509984240173478, 'loss': 0.033556791395213316, 'time_step': 0.0018284703713071645, 'init_value': -1.7618515491485596, 'ave_value': -1.7615693166791926, 'soft_opc': nan} step=34242




2022-04-06 21:05.37 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_34242.pt


Epoch 40/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.39 [info     ] FQE_20220406210424: epoch=40 step=35120 epoch=40 metrics={'time_sample_batch': 0.00012398254898523143, 'time_algorithm_update': 0.0016697895010946008, 'loss': 0.03095758564344835, 'time_step': 0.0018473852739790307, 'init_value': -1.7230662107467651, 'ave_value': -1.7227696469902813, 'soft_opc': nan} step=35120




2022-04-06 21:05.39 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_35120.pt


Epoch 41/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.41 [info     ] FQE_20220406210424: epoch=41 step=35998 epoch=41 metrics={'time_sample_batch': 0.00013027240040633565, 'time_algorithm_update': 0.0017483734052653737, 'loss': 0.02996623681360947, 'time_step': 0.0019374233171988728, 'init_value': -1.6908668279647827, 'ave_value': -1.6906881037168584, 'soft_opc': nan} step=35998




2022-04-06 21:05.41 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_35998.pt


Epoch 42/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.43 [info     ] FQE_20220406210424: epoch=42 step=36876 epoch=42 metrics={'time_sample_batch': 0.0001264951766759225, 'time_algorithm_update': 0.0018648965483645916, 'loss': 0.031028719276975508, 'time_step': 0.002046155495089659, 'init_value': -1.7343724966049194, 'ave_value': -1.7341861210666598, 'soft_opc': nan} step=36876




2022-04-06 21:05.43 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_36876.pt


Epoch 43/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.45 [info     ] FQE_20220406210424: epoch=43 step=37754 epoch=43 metrics={'time_sample_batch': 0.0001277847550724525, 'time_algorithm_update': 0.001775117561322953, 'loss': 0.031078782184354614, 'time_step': 0.0019603139447188322, 'init_value': -1.669585943222046, 'ave_value': -1.6689828502239294, 'soft_opc': nan} step=37754




2022-04-06 21:05.45 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_37754.pt


Epoch 44/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.47 [info     ] FQE_20220406210424: epoch=44 step=38632 epoch=44 metrics={'time_sample_batch': 0.00012282575724608262, 'time_algorithm_update': 0.0016088423261881417, 'loss': 0.028535002879538533, 'time_step': 0.0017841788250655955, 'init_value': -1.6339770555496216, 'ave_value': -1.633242272310684, 'soft_opc': nan} step=38632




2022-04-06 21:05.47 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_38632.pt


Epoch 45/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.48 [info     ] FQE_20220406210424: epoch=45 step=39510 epoch=45 metrics={'time_sample_batch': 0.00012167140943314328, 'time_algorithm_update': 0.0015957830707141643, 'loss': 0.03143219430201576, 'time_step': 0.0017721967979552806, 'init_value': -1.8025130033493042, 'ave_value': -1.8022209830126878, 'soft_opc': nan} step=39510




2022-04-06 21:05.48 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_39510.pt


Epoch 46/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.50 [info     ] FQE_20220406210424: epoch=46 step=40388 epoch=46 metrics={'time_sample_batch': 0.00012185280306735723, 'time_algorithm_update': 0.0015828574165396374, 'loss': 0.03200157554442792, 'time_step': 0.0017581512824821038, 'init_value': -1.7609782218933105, 'ave_value': -1.7605848625938805, 'soft_opc': nan} step=40388




2022-04-06 21:05.50 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_40388.pt


Epoch 47/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.52 [info     ] FQE_20220406210424: epoch=47 step=41266 epoch=47 metrics={'time_sample_batch': 0.0001240197509730867, 'time_algorithm_update': 0.0016675378304136098, 'loss': 0.032064135305312894, 'time_step': 0.001846366156749682, 'init_value': -1.8000272512435913, 'ave_value': -1.79993457106969, 'soft_opc': nan} step=41266




2022-04-06 21:05.52 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_41266.pt


Epoch 48/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.54 [info     ] FQE_20220406210424: epoch=48 step=42144 epoch=48 metrics={'time_sample_batch': 0.00012703881448385113, 'time_algorithm_update': 0.0017728604596948297, 'loss': 0.03055592354036738, 'time_step': 0.0019577155080634534, 'init_value': -1.6800706386566162, 'ave_value': -1.679628704484835, 'soft_opc': nan} step=42144




2022-04-06 21:05.54 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_42144.pt


Epoch 49/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.56 [info     ] FQE_20220406210424: epoch=49 step=43022 epoch=49 metrics={'time_sample_batch': 0.00012174499876678398, 'time_algorithm_update': 0.0015932804902756676, 'loss': 0.02983818238063745, 'time_step': 0.0017679663616866892, 'init_value': -1.6817646026611328, 'ave_value': -1.6811345712956178, 'soft_opc': nan} step=43022




2022-04-06 21:05.56 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_43022.pt


Epoch 50/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.58 [info     ] FQE_20220406210424: epoch=50 step=43900 epoch=50 metrics={'time_sample_batch': 0.00012076579899885659, 'time_algorithm_update': 0.00155189857396015, 'loss': 0.029719315855661427, 'time_step': 0.0017268255794238392, 'init_value': -1.707073450088501, 'ave_value': -1.7068879730584705, 'soft_opc': nan} step=43900




2022-04-06 21:05.58 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210424/model_43900.pt


[(1,
  {'time_sample_batch': 0.00013119403213466218,
   'time_algorithm_update': 0.0017099345192007702,
   'loss': 0.0007221336903229773,
   'time_step': 0.001900675084976509,
   'init_value': -0.3511500060558319,
   'ave_value': -0.3517933719735239,
   'soft_opc': nan}),
 (2,
  {'time_sample_batch': 0.00013680067040654142,
   'time_algorithm_update': 0.0018210438226513004,
   'loss': 0.0021344184700107043,
   'time_step': 0.0020162573164850812,
   'init_value': -0.571056067943573,
   'ave_value': -0.5718733890903176,
   'soft_opc': nan}),
 (3,
  {'time_sample_batch': 0.00013056404226733233,
   'time_algorithm_update': 0.0017468429643635326,
   'loss': 0.004262366834274979,
   'time_step': 0.0019355219426079057,
   'init_value': -0.7475205659866333,
   'ave_value': -0.7482810573862553,
   'soft_opc': nan}),
 (4,
  {'time_sample_batch': 0.0001283913918671141,
   'time_algorithm_update': 0.0017910381112935059,
   'loss': 0.006516197041351099,
   'time_step': 0.0019766616386813294,
   'in

In [170]:
from d3rlpy.ope import FQE
# metrics to evaluate with
from d3rlpy.metrics.scorer import soft_opc_scorer


ope_dataset = get_dataset([2,4,6,8], path="collected_data/rl_stoch_small.txt") #change if you'd prefer different chunks
ope_train_episodes, ope_test_episodes = train_test_split(ope_dataset, test_size=0.2)

fqe = FQE(algo=model, use_gpu=False) #change this if you have one!
fqe.fit(ope_train_episodes, eval_episodes=ope_test_episodes,
        tensorboard_dir='runs',
        n_epochs=50, n_steps_per_epoch=1000, #change if overfitting/underfitting
        scorers={
           'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer,
           'soft_opc': soft_opc_scorer(return_threshold=0)
        })

[ 0.00000000e+00  7.95731469e+08  1.39310892e-01  1.82000047e-02
 -1.00013420e-04  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -7.50230117e-02  3.69851546e-01 -6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.15389108e-01  1.64000047e-02
 -8.80001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -3.08831172e-01 -2.48178665e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.28589108e-01  1.20000047e-02
  1.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  7.75212759e-03 -3.52719043e-01  6.00000000e-01]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.03989108e-01  7.00000469e-03
 -8.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.65974295e-01 -2.19295880e-01  6.00000000e-01]
Read chunk # 9 out of 10000
torch.Size([111080, 6])
2022-04-06 21:02.19 [debug    ] RoundIterator is selected.
2022-04-06 21:02.19 [info     ] Directory is created at d3rlpy_logs/FQE_2022040621021

Epoch 1/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.21 [info     ] FQE_20220406210219: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.0001383091159724972, 'time_algorithm_update': 0.0019023527045456314, 'loss': 0.0005069225541623121, 'time_step': 0.0021042742327296925, 'init_value': -0.23923806846141815, 'ave_value': -0.23953053405079883, 'soft_opc': nan} step=878




2022-04-06 21:02.21 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_878.pt


Epoch 2/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.23 [info     ] FQE_20220406210219: epoch=2 step=1756 epoch=2 metrics={'time_sample_batch': 0.00013222862756333753, 'time_algorithm_update': 0.0017707768768275788, 'loss': 0.0014712029050831187, 'time_step': 0.0019621259802294756, 'init_value': -0.4862314462661743, 'ave_value': -0.48679262416682534, 'soft_opc': nan} step=1756




2022-04-06 21:02.23 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_1756.pt


Epoch 3/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.25 [info     ] FQE_20220406210219: epoch=3 step=2634 epoch=3 metrics={'time_sample_batch': 0.00012772745858020826, 'time_algorithm_update': 0.001693446706802264, 'loss': 0.0034076808783818266, 'time_step': 0.0018776272315370738, 'init_value': -0.6859617829322815, 'ave_value': -0.6864856399976079, 'soft_opc': nan} step=2634




2022-04-06 21:02.25 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_2634.pt


Epoch 4/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.27 [info     ] FQE_20220406210219: epoch=4 step=3512 epoch=4 metrics={'time_sample_batch': 0.00012795311443354927, 'time_algorithm_update': 0.0017053353216219056, 'loss': 0.005736621589929113, 'time_step': 0.0018904692491257652, 'init_value': -0.880210280418396, 'ave_value': -0.8807683541811252, 'soft_opc': nan} step=3512




2022-04-06 21:02.27 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_3512.pt


Epoch 5/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.29 [info     ] FQE_20220406210219: epoch=5 step=4390 epoch=5 metrics={'time_sample_batch': 0.00013756290383653923, 'time_algorithm_update': 0.0018022228753648206, 'loss': 0.009038736473516263, 'time_step': 0.0020041864933891557, 'init_value': -1.0311862230300903, 'ave_value': -1.0311597087339317, 'soft_opc': nan} step=4390




2022-04-06 21:02.29 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_4390.pt


Epoch 6/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.31 [info     ] FQE_20220406210219: epoch=6 step=5268 epoch=6 metrics={'time_sample_batch': 0.00012619973315193333, 'time_algorithm_update': 0.0016046884662741137, 'loss': 0.01194321617808544, 'time_step': 0.001778669943842095, 'init_value': -1.1779879331588745, 'ave_value': -1.1781165265932234, 'soft_opc': nan} step=5268




2022-04-06 21:02.31 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_5268.pt


Epoch 7/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.33 [info     ] FQE_20220406210219: epoch=7 step=6146 epoch=7 metrics={'time_sample_batch': 0.00012431709532857214, 'time_algorithm_update': 0.0016365013252901326, 'loss': 0.014008609075624292, 'time_step': 0.0018106717998183127, 'init_value': -1.2091140747070312, 'ave_value': -1.208512345855314, 'soft_opc': nan} step=6146




2022-04-06 21:02.33 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_6146.pt


Epoch 8/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.35 [info     ] FQE_20220406210219: epoch=8 step=7024 epoch=8 metrics={'time_sample_batch': 0.00013475374643243256, 'time_algorithm_update': 0.0017792556714902978, 'loss': 0.01535429607861416, 'time_step': 0.0019700771583783055, 'init_value': -1.2797765731811523, 'ave_value': -1.279652354747169, 'soft_opc': nan} step=7024




2022-04-06 21:02.35 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_7024.pt


Epoch 9/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.37 [info     ] FQE_20220406210219: epoch=9 step=7902 epoch=9 metrics={'time_sample_batch': 0.00013561808166851486, 'time_algorithm_update': 0.0017727963745186703, 'loss': 0.018037873913516515, 'time_step': 0.0019664577036620817, 'init_value': -1.410714864730835, 'ave_value': -1.410865859827248, 'soft_opc': nan} step=7902




2022-04-06 21:02.37 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_7902.pt


Epoch 10/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.39 [info     ] FQE_20220406210219: epoch=10 step=8780 epoch=10 metrics={'time_sample_batch': 0.0001352778328306854, 'time_algorithm_update': 0.0019023068130423649, 'loss': 0.02099806404698203, 'time_step': 0.0020953568890860522, 'init_value': -1.4759947061538696, 'ave_value': -1.4763996711987681, 'soft_opc': nan} step=8780




2022-04-06 21:02.39 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_8780.pt


Epoch 11/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.41 [info     ] FQE_20220406210219: epoch=11 step=9658 epoch=11 metrics={'time_sample_batch': 0.0001305922831924195, 'time_algorithm_update': 0.0016903578556208513, 'loss': 0.023446520869705997, 'time_step': 0.0018787386748676691, 'init_value': -1.5548245906829834, 'ave_value': -1.555478130903895, 'soft_opc': nan} step=9658




2022-04-06 21:02.41 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_9658.pt


Epoch 12/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.43 [info     ] FQE_20220406210219: epoch=12 step=10536 epoch=12 metrics={'time_sample_batch': 0.00013364773404896937, 'time_algorithm_update': 0.0017768554644052423, 'loss': 0.025006495535546455, 'time_step': 0.0019694398367323474, 'init_value': -1.609846591949463, 'ave_value': -1.6107754991522905, 'soft_opc': nan} step=10536




2022-04-06 21:02.43 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_10536.pt


Epoch 13/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.45 [info     ] FQE_20220406210219: epoch=13 step=11414 epoch=13 metrics={'time_sample_batch': 0.0001343743947752516, 'time_algorithm_update': 0.001772081661876079, 'loss': 0.025964169778442382, 'time_step': 0.001966392803843852, 'init_value': -1.6176029443740845, 'ave_value': -1.6183215640439728, 'soft_opc': nan} step=11414




2022-04-06 21:02.45 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_11414.pt


Epoch 14/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.47 [info     ] FQE_20220406210219: epoch=14 step=12292 epoch=14 metrics={'time_sample_batch': 0.00013725605532357253, 'time_algorithm_update': 0.0018357106384763956, 'loss': 0.025776181719821104, 'time_step': 0.002031968231896333, 'init_value': -1.5935897827148438, 'ave_value': -1.594396188733003, 'soft_opc': nan} step=12292




2022-04-06 21:02.47 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_12292.pt


Epoch 15/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.48 [info     ] FQE_20220406210219: epoch=15 step=13170 epoch=15 metrics={'time_sample_batch': 0.00012588392357619857, 'time_algorithm_update': 0.0016354200237161207, 'loss': 0.025494986002848646, 'time_step': 0.0018145994607842865, 'init_value': -1.5803693532943726, 'ave_value': -1.5809312636434218, 'soft_opc': nan} step=13170




2022-04-06 21:02.48 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_13170.pt


Epoch 16/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.50 [info     ] FQE_20220406210219: epoch=16 step=14048 epoch=16 metrics={'time_sample_batch': 0.0001305249394479808, 'time_algorithm_update': 0.0017857464679552917, 'loss': 0.025711538695833955, 'time_step': 0.00197185307809052, 'init_value': -1.5912272930145264, 'ave_value': -1.5920282098085718, 'soft_opc': nan} step=14048




2022-04-06 21:02.50 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_14048.pt


Epoch 17/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.52 [info     ] FQE_20220406210219: epoch=17 step=14926 epoch=17 metrics={'time_sample_batch': 0.0001326753229649572, 'time_algorithm_update': 0.001766126628345672, 'loss': 0.024763258001050607, 'time_step': 0.0019555618659781973, 'init_value': -1.5504961013793945, 'ave_value': -1.5514802185150087, 'soft_opc': nan} step=14926




2022-04-06 21:02.52 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_14926.pt


Epoch 18/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.54 [info     ] FQE_20220406210219: epoch=18 step=15804 epoch=18 metrics={'time_sample_batch': 0.00013332513578931943, 'time_algorithm_update': 0.0017747460845391136, 'loss': 0.025052436050960556, 'time_step': 0.0019676226418219285, 'init_value': -1.5552146434783936, 'ave_value': -1.5571418396250118, 'soft_opc': nan} step=15804




2022-04-06 21:02.54 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_15804.pt


Epoch 19/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.56 [info     ] FQE_20220406210219: epoch=19 step=16682 epoch=19 metrics={'time_sample_batch': 0.00012539676761844434, 'time_algorithm_update': 0.001670516976462953, 'loss': 0.024925014244601165, 'time_step': 0.0018503660493125133, 'init_value': -1.5638470649719238, 'ave_value': -1.5656521901972253, 'soft_opc': nan} step=16682




2022-04-06 21:02.56 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_16682.pt


Epoch 20/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:02.58 [info     ] FQE_20220406210219: epoch=20 step=17560 epoch=20 metrics={'time_sample_batch': 0.00013497234205450174, 'time_algorithm_update': 0.0017899345428362523, 'loss': 0.023395209312265103, 'time_step': 0.0019822785957527597, 'init_value': -1.512223720550537, 'ave_value': -1.5140342632081731, 'soft_opc': nan} step=17560




2022-04-06 21:02.58 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_17560.pt


Epoch 21/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.00 [info     ] FQE_20220406210219: epoch=21 step=18438 epoch=21 metrics={'time_sample_batch': 0.00013357495935739853, 'time_algorithm_update': 0.0017857624892493316, 'loss': 0.02360748610895196, 'time_step': 0.0019782319970445914, 'init_value': -1.5189058780670166, 'ave_value': -1.5211425590432521, 'soft_opc': nan} step=18438




2022-04-06 21:03.00 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_18438.pt


Epoch 22/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.02 [info     ] FQE_20220406210219: epoch=22 step=19316 epoch=22 metrics={'time_sample_batch': 0.00013084699461291755, 'time_algorithm_update': 0.0017876627776508723, 'loss': 0.023798913196916977, 'time_step': 0.0019754049175149486, 'init_value': -1.538705587387085, 'ave_value': -1.5409150541317378, 'soft_opc': nan} step=19316




2022-04-06 21:03.02 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_19316.pt


Epoch 23/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.04 [info     ] FQE_20220406210219: epoch=23 step=20194 epoch=23 metrics={'time_sample_batch': 0.00013409307171380601, 'time_algorithm_update': 0.0018534823267769434, 'loss': 0.024713333472838737, 'time_step': 0.0020468935608049187, 'init_value': -1.567566156387329, 'ave_value': -1.569759906445227, 'soft_opc': nan} step=20194




2022-04-06 21:03.04 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_20194.pt


Epoch 24/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.06 [info     ] FQE_20220406210219: epoch=24 step=21072 epoch=24 metrics={'time_sample_batch': 0.00013093470440910183, 'time_algorithm_update': 0.0017790131797008472, 'loss': 0.025424438653087376, 'time_step': 0.001967289453215371, 'init_value': -1.559641718864441, 'ave_value': -1.5618334703362386, 'soft_opc': nan} step=21072




2022-04-06 21:03.06 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_21072.pt


Epoch 25/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.08 [info     ] FQE_20220406210219: epoch=25 step=21950 epoch=25 metrics={'time_sample_batch': 0.00013585459941612018, 'time_algorithm_update': 0.0018466681174102296, 'loss': 0.024720752616736792, 'time_step': 0.002041663287169298, 'init_value': -1.558950424194336, 'ave_value': -1.5616943147452773, 'soft_opc': nan} step=21950




2022-04-06 21:03.08 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_21950.pt


Epoch 26/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.10 [info     ] FQE_20220406210219: epoch=26 step=22828 epoch=26 metrics={'time_sample_batch': 0.00013146530794391328, 'time_algorithm_update': 0.0017115833547500924, 'loss': 0.02660025551734165, 'time_step': 0.0018986887160629237, 'init_value': -1.6107735633850098, 'ave_value': -1.6136401036948556, 'soft_opc': nan} step=22828




2022-04-06 21:03.10 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_22828.pt


Epoch 27/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.12 [info     ] FQE_20220406210219: epoch=27 step=23706 epoch=27 metrics={'time_sample_batch': 0.00013022705199778215, 'time_algorithm_update': 0.0017469792811365497, 'loss': 0.027880391562695984, 'time_step': 0.0019345910782694545, 'init_value': -1.6538159847259521, 'ave_value': -1.657028465931218, 'soft_opc': nan} step=23706




2022-04-06 21:03.12 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_23706.pt


Epoch 28/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.14 [info     ] FQE_20220406210219: epoch=28 step=24584 epoch=28 metrics={'time_sample_batch': 0.00012848942046284948, 'time_algorithm_update': 0.001713400006565798, 'loss': 0.028936911467998115, 'time_step': 0.0018960283666102382, 'init_value': -1.644697904586792, 'ave_value': -1.6480302004600969, 'soft_opc': nan} step=24584




2022-04-06 21:03.14 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_24584.pt


Epoch 29/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.16 [info     ] FQE_20220406210219: epoch=29 step=25462 epoch=29 metrics={'time_sample_batch': 0.00013577449294592092, 'time_algorithm_update': 0.0018668082417551098, 'loss': 0.028011683153337782, 'time_step': 0.002064459959303871, 'init_value': -1.6676106452941895, 'ave_value': -1.670664367019933, 'soft_opc': nan} step=25462




2022-04-06 21:03.16 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_25462.pt


Epoch 30/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.18 [info     ] FQE_20220406210219: epoch=30 step=26340 epoch=30 metrics={'time_sample_batch': 0.00013536744345836596, 'time_algorithm_update': 0.0018123233508412007, 'loss': 0.028518628432252804, 'time_step': 0.00200924759302161, 'init_value': -1.6886237859725952, 'ave_value': -1.6912541051011816, 'soft_opc': nan} step=26340




2022-04-06 21:03.18 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_26340.pt


Epoch 31/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.20 [info     ] FQE_20220406210219: epoch=31 step=27218 epoch=31 metrics={'time_sample_batch': 0.00013483575373412807, 'time_algorithm_update': 0.0018705007427102613, 'loss': 0.030101074455168703, 'time_step': 0.0020639212093483614, 'init_value': -1.7260853052139282, 'ave_value': -1.7292447865168599, 'soft_opc': nan} step=27218




2022-04-06 21:03.20 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_27218.pt


Epoch 32/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.22 [info     ] FQE_20220406210219: epoch=32 step=28096 epoch=32 metrics={'time_sample_batch': 0.00013432796017727168, 'time_algorithm_update': 0.0020014267576039515, 'loss': 0.030840439585820216, 'time_step': 0.0021966940178143406, 'init_value': -1.7261669635772705, 'ave_value': -1.7287344194821443, 'soft_opc': nan} step=28096




2022-04-06 21:03.22 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_28096.pt


Epoch 33/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.24 [info     ] FQE_20220406210219: epoch=33 step=28974 epoch=33 metrics={'time_sample_batch': 0.00013376612869645034, 'time_algorithm_update': 0.0018014220822101844, 'loss': 0.0310472582937741, 'time_step': 0.0019957508747561374, 'init_value': -1.7709027528762817, 'ave_value': -1.7733663410149487, 'soft_opc': nan} step=28974




2022-04-06 21:03.24 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_28974.pt


Epoch 34/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.27 [info     ] FQE_20220406210219: epoch=34 step=29852 epoch=34 metrics={'time_sample_batch': 0.00013748931450289855, 'time_algorithm_update': 0.0020117322513345704, 'loss': 0.031327120063614364, 'time_step': 0.002208362136180417, 'init_value': -1.7605375051498413, 'ave_value': -1.7629668906330387, 'soft_opc': nan} step=29852




2022-04-06 21:03.27 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_29852.pt


Epoch 35/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.29 [info     ] FQE_20220406210219: epoch=35 step=30730 epoch=35 metrics={'time_sample_batch': 0.00013603110519791518, 'time_algorithm_update': 0.0018634125420607303, 'loss': 0.031052603193856984, 'time_step': 0.002059602248641516, 'init_value': -1.7315768003463745, 'ave_value': -1.734156340664375, 'soft_opc': nan} step=30730




2022-04-06 21:03.29 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_30730.pt


Epoch 36/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.31 [info     ] FQE_20220406210219: epoch=36 step=31608 epoch=36 metrics={'time_sample_batch': 0.00011848534429806512, 'time_algorithm_update': 0.0016150653769595204, 'loss': 0.031556867361658095, 'time_step': 0.0017842602892725776, 'init_value': -1.7726742029190063, 'ave_value': -1.7750384697081705, 'soft_opc': nan} step=31608




2022-04-06 21:03.31 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_31608.pt


Epoch 37/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.32 [info     ] FQE_20220406210219: epoch=37 step=32486 epoch=37 metrics={'time_sample_batch': 0.00012441946868201322, 'time_algorithm_update': 0.00165507190591382, 'loss': 0.030148700324374708, 'time_step': 0.0018325101813314172, 'init_value': -1.7123291492462158, 'ave_value': -1.7139277640403967, 'soft_opc': nan} step=32486




2022-04-06 21:03.32 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_32486.pt


Epoch 38/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.34 [info     ] FQE_20220406210219: epoch=38 step=33364 epoch=38 metrics={'time_sample_batch': 0.0001314908333954344, 'time_algorithm_update': 0.0017613582567636407, 'loss': 0.02897752035173415, 'time_step': 0.0019499812962525526, 'init_value': -1.6924774646759033, 'ave_value': -1.6939405287964953, 'soft_opc': nan} step=33364




2022-04-06 21:03.34 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_33364.pt


Epoch 39/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.36 [info     ] FQE_20220406210219: epoch=39 step=34242 epoch=39 metrics={'time_sample_batch': 0.00013230031606548196, 'time_algorithm_update': 0.0017694680185687298, 'loss': 0.029416942081223165, 'time_step': 0.001959686127230355, 'init_value': -1.7062898874282837, 'ave_value': -1.7074661853440123, 'soft_opc': nan} step=34242




2022-04-06 21:03.36 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_34242.pt


Epoch 40/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.38 [info     ] FQE_20220406210219: epoch=40 step=35120 epoch=40 metrics={'time_sample_batch': 0.00013173060971131902, 'time_algorithm_update': 0.0017556275213497917, 'loss': 0.0292531257796938, 'time_step': 0.0019456585338827146, 'init_value': -1.7254873514175415, 'ave_value': -1.7263258494652391, 'soft_opc': nan} step=35120




2022-04-06 21:03.38 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_35120.pt


Epoch 41/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.40 [info     ] FQE_20220406210219: epoch=41 step=35998 epoch=41 metrics={'time_sample_batch': 0.00013338677703926937, 'time_algorithm_update': 0.0018511825922138327, 'loss': 0.02931167684982524, 'time_step': 0.002045271065349188, 'init_value': -1.7310595512390137, 'ave_value': -1.7318062713981328, 'soft_opc': nan} step=35998




2022-04-06 21:03.40 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_35998.pt


Epoch 42/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.42 [info     ] FQE_20220406210219: epoch=42 step=36876 epoch=42 metrics={'time_sample_batch': 0.00013202442395116853, 'time_algorithm_update': 0.0017699720104625935, 'loss': 0.02966944878858106, 'time_step': 0.0019584448842633016, 'init_value': -1.7386541366577148, 'ave_value': -1.7394115270039965, 'soft_opc': nan} step=36876




2022-04-06 21:03.42 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_36876.pt


Epoch 43/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.44 [info     ] FQE_20220406210219: epoch=43 step=37754 epoch=43 metrics={'time_sample_batch': 0.0001323929137140852, 'time_algorithm_update': 0.0017446659692056086, 'loss': 0.029200620505207036, 'time_step': 0.0019352411626411734, 'init_value': -1.7242590188980103, 'ave_value': -1.7251829498379097, 'soft_opc': nan} step=37754




2022-04-06 21:03.44 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_37754.pt


Epoch 44/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.46 [info     ] FQE_20220406210219: epoch=44 step=38632 epoch=44 metrics={'time_sample_batch': 0.0001335782179256778, 'time_algorithm_update': 0.0017602001072877088, 'loss': 0.030449239157290846, 'time_step': 0.001952337241118481, 'init_value': -1.7742050886154175, 'ave_value': -1.7755045778872218, 'soft_opc': nan} step=38632




2022-04-06 21:03.46 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_38632.pt


Epoch 45/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.48 [info     ] FQE_20220406210219: epoch=45 step=39510 epoch=45 metrics={'time_sample_batch': 0.00012597000408824321, 'time_algorithm_update': 0.0016282039242616275, 'loss': 0.031367611590649755, 'time_step': 0.0018078553106355775, 'init_value': -1.8093619346618652, 'ave_value': -1.8106713386615905, 'soft_opc': nan} step=39510




2022-04-06 21:03.48 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_39510.pt


Epoch 46/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.50 [info     ] FQE_20220406210219: epoch=46 step=40388 epoch=46 metrics={'time_sample_batch': 0.0001203780293736208, 'time_algorithm_update': 0.0017868815359059118, 'loss': 0.032330161137444545, 'time_step': 0.0019602023387552668, 'init_value': -1.8366779088974, 'ave_value': -1.8381861725454827, 'soft_opc': nan} step=40388




2022-04-06 21:03.50 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_40388.pt


Epoch 47/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.52 [info     ] FQE_20220406210219: epoch=47 step=41266 epoch=47 metrics={'time_sample_batch': 0.00012633197671460127, 'time_algorithm_update': 0.001611474706263097, 'loss': 0.03299169837477071, 'time_step': 0.0017939374224200063, 'init_value': -1.8409712314605713, 'ave_value': -1.8420576181468764, 'soft_opc': nan} step=41266




2022-04-06 21:03.52 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_41266.pt


Epoch 48/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.54 [info     ] FQE_20220406210219: epoch=48 step=42144 epoch=48 metrics={'time_sample_batch': 0.000125673202827471, 'time_algorithm_update': 0.0017447835492110197, 'loss': 0.034425932963823305, 'time_step': 0.0019265217770205, 'init_value': -1.871069312095642, 'ave_value': -1.8722195903147, 'soft_opc': nan} step=42144




2022-04-06 21:03.54 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_42144.pt


Epoch 49/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.56 [info     ] FQE_20220406210219: epoch=49 step=43022 epoch=49 metrics={'time_sample_batch': 0.00012404663416139085, 'time_algorithm_update': 0.001651215661902634, 'loss': 0.0342785973865341, 'time_step': 0.001831260248848952, 'init_value': -1.8171542882919312, 'ave_value': -1.8181784213812298, 'soft_opc': nan} step=43022




2022-04-06 21:03.56 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_43022.pt


Epoch 50/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.58 [info     ] FQE_20220406210219: epoch=50 step=43900 epoch=50 metrics={'time_sample_batch': 0.0001237134455548332, 'time_algorithm_update': 0.0016329122838778486, 'loss': 0.03252738003676129, 'time_step': 0.0018108640553467909, 'init_value': -1.825412631034851, 'ave_value': -1.826603330390665, 'soft_opc': nan} step=43900




2022-04-06 21:03.58 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210219/model_43900.pt


[(1,
  {'time_sample_batch': 0.0001383091159724972,
   'time_algorithm_update': 0.0019023527045456314,
   'loss': 0.0005069225541623121,
   'time_step': 0.0021042742327296925,
   'init_value': -0.23923806846141815,
   'ave_value': -0.23953053405079883,
   'soft_opc': nan}),
 (2,
  {'time_sample_batch': 0.00013222862756333753,
   'time_algorithm_update': 0.0017707768768275788,
   'loss': 0.0014712029050831187,
   'time_step': 0.0019621259802294756,
   'init_value': -0.4862314462661743,
   'ave_value': -0.48679262416682534,
   'soft_opc': nan}),
 (3,
  {'time_sample_batch': 0.00012772745858020826,
   'time_algorithm_update': 0.001693446706802264,
   'loss': 0.0034076808783818266,
   'time_step': 0.0018776272315370738,
   'init_value': -0.6859617829322815,
   'ave_value': -0.6864856399976079,
   'soft_opc': nan}),
 (4,
  {'time_sample_batch': 0.00012795311443354927,
   'time_algorithm_update': 0.0017053353216219056,
   'loss': 0.005736621589929113,
   'time_step': 0.0018904692491257652,
 