# Sample Workflow for d3rlpy Experiments

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import math
import subprocess
import os
import d3rlpy
plt.style.use('matplotlibrc')

from Python.data_sampler import *

## Building an MDPDataset

We first read in a large batch of samples from the file. As `d3rlpy` wants it in the form (observations, actions, rewards, terminal flags), we go ahead and do that. Here's a helper function to get a dataset from a list of chunks of your choosing.

In [30]:
def get_dataset(chunks : list, batch_size=30000, 
                path="collected_data/rl_det_small.txt") -> d3rlpy.dataset.MDPDataset :
    random.seed(0)
    samples = DataSampler(path_to_data=path)
    states = []
    actions = []
    rewards = []
    next_states = []
    for chunk in chunks:
        samples.use_chunk(chunk)
        samples.read_chunk()
        [statesChunk, actionsChunk, rewardsChunk, nextStatesChunk] = samples.get_batch(batch_size)
        states.append(statesChunk)
        actions.append(actionsChunk)
        rewards.append(rewardsChunk)
        next_states.append(nextStatesChunk)
    states = torch.cat(states)
    actions = torch.cat(actions)
    rewards = torch.cat(rewards)
    next_states = torch.cat(next_states)
    terminals = np.zeros(len(states))
    terminals[::100] = 1 #episode length 100, change if necessary
    print(states.shape)
    dataset = d3rlpy.dataset.MDPDataset(states.numpy(), 
                                        actions.numpy(), 
                                        rewards.numpy(), terminals)
    return dataset

We can build the dataset from there, just like this, and split into train and test sets.

In [47]:
dataset = get_dataset([3,5,7,9], path="collected_data/rl_det_small.txt")

[ 0.00000000e+00  7.95731469e+08 -8.17891077e-02 -1.19999531e-03
  7.39998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.09713430e-01 -2.63658359e-01  6.00000000e-01]
Read chunk # 4 out of 10000
[ 0.00000000e+00  7.95731469e+08  1.24610892e-01  2.40000469e-03
 -7.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.20016566e-01  3.79282423e-01 -6.00000000e-01]
Read chunk # 6 out of 10000
[ 0.00000000e+00  7.95731469e+08 -9.01891077e-02  1.08000047e-02
  3.99986580e-04  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -3.17973088e-02 -2.40776052e-01  6.00000000e-01]
Read chunk # 8 out of 10000
[ 0.00000000e+00  7.95731469e+08  6.91108923e-02 -5.99999531e-03
 -6.00001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -1.42355434e-01  2.22081792e-01 -6.00000000e-01]
Read chunk # 10 out of 10000
torch.Size([111080, 6])


In [48]:
print("The behavior policy value statistics are:")
dataset.compute_stats()['return']

The behavior policy value statistics are:


{'mean': -4.1227446,
 'std': 2.4676569,
 'min': -12.578855,
 'max': 0.0,
 'histogram': (array([ 26,   9,   7,   7,   8,   7,  10,  13,  27,  54,  56,  73, 109,
          84, 186, 148, 124,  83,  67,  13]),
  array([-12.578855 , -11.949912 , -11.320969 , -10.692026 , -10.063084 ,
          -9.434141 ,  -8.805199 ,  -8.176255 ,  -7.5473127,  -6.9183702,
          -6.2894273,  -5.6604843,  -5.031542 ,  -4.4025993,  -3.7736564,
          -3.1447136,  -2.515771 ,  -1.8868282,  -1.2578855,  -0.6289427,
           0.       ], dtype=float32))}

In [49]:
from sklearn.model_selection import train_test_split
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

## Setting up an Algorithm

In [50]:
from d3rlpy.dynamics import ProbabilisticEnsembleDynamics
from d3rlpy.metrics.scorer import dynamics_observation_prediction_error_scorer
from d3rlpy.metrics.scorer import dynamics_reward_prediction_error_scorer
from d3rlpy.metrics.scorer import dynamics_prediction_variance_scorer

dynamics = d3rlpy.dynamics.ProbabilisticEnsembleDynamics(learning_rate=1e-4, use_gpu=False)

# same as algorithms
dynamics.fit(train_episodes,
             eval_episodes=test_episodes,
             n_epochs=15, tensorboard_dir='runs',
             scorers={
                'observation_error': dynamics_observation_prediction_error_scorer,
                'reward_error': dynamics_reward_prediction_error_scorer,
                'variance': dynamics_prediction_variance_scorer,
             })


#from d3rlpy.algos import MOPO

# load trained dynamics model
#dynamics = ProbabilisticEnsembleDynamics.from_json('<path-to-params.json>/params.json')
#dynamics.load_model('<path-to-model>/model_xx.pt')

# give mopo as generator argument.
#mopo = MOPO(dynamics=dynamics)


2022-04-06 21:00.48 [debug    ] RoundIterator is selected.
2022-04-06 21:00.48 [info     ] Directory is created at d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048
2022-04-06 21:00.48 [debug    ] Building models...
2022-04-06 21:00.48 [debug    ] Models have been built.
2022-04-06 21:00.48 [info     ] Parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/params.json params={'action_scaler': None, 'batch_size': 100, 'discrete_action': False, 'encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'gamma': 1.0, 'generated_maxlen': 100000, 'learning_rate': 0.0001, 'n_ensembles': 5, 'n_frames': 1, 'n_steps': 1, 'optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0.0001, 'amsgrad': False}, 'real_ratio': 1.0, 'reward_scaler': None, 'scaler': None, 'use_gpu': None, 'variance_type': 'max', 'algorithm': 'ProbabilisticEnsembleDynamics', 'observation_shape': 

Epoch 1/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:00.55 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.00015752923787319036, 'time_algorithm_update': 0.006990694239362226, 'loss': -130.10344823763828, 'time_step': 0.007213200414913933, 'observation_error': 2.5002094379265388e-05, 'reward_error': 8.086704723235035e-05, 'variance': 2.9265567855343244e-08} step=878
2022-04-06 21:00.55 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_878.pt


Epoch 2/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:01.02 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=2 step=1756 epoch=2 metrics={'time_sample_batch': 0.0001431915374443037, 'time_algorithm_update': 0.006612565903022783, 'loss': -155.84424964713617, 'time_step': 0.006800605930337059, 'observation_error': 6.961562183296232e-05, 'reward_error': 5.49882593996615e-05, 'variance': 4.21097555442327e-07} step=1756
2022-04-06 21:01.02 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_1756.pt


Epoch 3/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:01.10 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=3 step=2634 epoch=3 metrics={'time_sample_batch': 0.00015931710566909546, 'time_algorithm_update': 0.007381611370008465, 'loss': -163.18751751806306, 'time_step': 0.007593790599588381, 'observation_error': 1.946347884231508e-05, 'reward_error': 3.987882613107416e-05, 'variance': 5.701755339153832e-07} step=2634
2022-04-06 21:01.10 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_2634.pt


Epoch 4/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:01.17 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=4 step=3512 epoch=4 metrics={'time_sample_batch': 0.00015286486892873987, 'time_algorithm_update': 0.00704823974053246, 'loss': -164.09990947610424, 'time_step': 0.007249272765765701, 'observation_error': 2.8595266688735806e-05, 'reward_error': 4.5131388945379754e-05, 'variance': 2.466563539237384e-07} step=3512
2022-04-06 21:01.17 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_3512.pt


Epoch 5/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:01.25 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=5 step=4390 epoch=5 metrics={'time_sample_batch': 0.00014871942698276667, 'time_algorithm_update': 0.007037449534770296, 'loss': -168.1954182687816, 'time_step': 0.007232749923758192, 'observation_error': 2.5864836179857006e-05, 'reward_error': 2.3896695763135876e-05, 'variance': 1.934993643421245e-07} step=4390
2022-04-06 21:01.25 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_4390.pt


Epoch 6/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:01.32 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=6 step=5268 epoch=6 metrics={'time_sample_batch': 0.00014885004126129499, 'time_algorithm_update': 0.0070672958478297625, 'loss': -171.9209967680564, 'time_step': 0.007261119562292425, 'observation_error': 0.0004114405069486184, 'reward_error': 1.703930859763823e-05, 'variance': 6.653350081625358e-07} step=5268
2022-04-06 21:01.32 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_5268.pt


Epoch 7/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:01.40 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=7 step=6146 epoch=7 metrics={'time_sample_batch': 0.00015474804984681427, 'time_algorithm_update': 0.00706845997134754, 'loss': -172.51383661083318, 'time_step': 0.007277257350148264, 'observation_error': 3.3621584076657654e-05, 'reward_error': 6.803582295647045e-06, 'variance': 3.366537925459627e-07} step=6146
2022-04-06 21:01.40 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_6146.pt


Epoch 8/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:01.47 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=8 step=7024 epoch=8 metrics={'time_sample_batch': 0.00014432823467906353, 'time_algorithm_update': 0.006781665773761028, 'loss': -177.687073744511, 'time_step': 0.006969245256758496, 'observation_error': 1.8171233390369198e-05, 'reward_error': 5.698470655588482e-06, 'variance': 1.7679288111123523e-07} step=7024
2022-04-06 21:01.47 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_7024.pt


Epoch 9/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:01.54 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=9 step=7902 epoch=9 metrics={'time_sample_batch': 0.00014286568061637445, 'time_algorithm_update': 0.0066848352449630005, 'loss': -178.989001550001, 'time_step': 0.006871286991788478, 'observation_error': 1.390524506965647e-05, 'reward_error': 4.010948354876198e-06, 'variance': 1.1752816384569891e-07} step=7902
2022-04-06 21:01.54 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_7902.pt


Epoch 10/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:02.01 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=10 step=8780 epoch=10 metrics={'time_sample_batch': 0.00014911018362892515, 'time_algorithm_update': 0.006967430505774287, 'loss': -180.6555078284887, 'time_step': 0.00716249899331964, 'observation_error': 1.3691348734461574e-05, 'reward_error': 2.716611547193946e-06, 'variance': 7.13109355911905e-08} step=8780
2022-04-06 21:02.01 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_8780.pt


Epoch 11/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:02.08 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=11 step=9658 epoch=11 metrics={'time_sample_batch': 0.00014690005969349507, 'time_algorithm_update': 0.006828625815875981, 'loss': -180.90111153326708, 'time_step': 0.007021068440482937, 'observation_error': 2.2872261593555398e-05, 'reward_error': 3.030942098924994e-06, 'variance': 1.8740818755884698e-07} step=9658
2022-04-06 21:02.08 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_9658.pt


Epoch 12/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:02.16 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=12 step=10536 epoch=12 metrics={'time_sample_batch': 0.00014533676156150454, 'time_algorithm_update': 0.006828884872054184, 'loss': -182.0599985796118, 'time_step': 0.007017980403943594, 'observation_error': 1.3254254976208847e-05, 'reward_error': 3.7971511254762056e-06, 'variance': 4.5205523911177825e-07} step=10536
2022-04-06 21:02.16 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_10536.pt


Epoch 13/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:02.23 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=13 step=11414 epoch=13 metrics={'time_sample_batch': 0.00014605744824594137, 'time_algorithm_update': 0.006893797453009186, 'loss': -185.1279143722291, 'time_step': 0.007086290856971828, 'observation_error': 1.2353382125145977e-05, 'reward_error': 1.9279379690356336e-06, 'variance': 1.5518268463793187e-07} step=11414
2022-04-06 21:02.23 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_11414.pt


Epoch 14/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:02.30 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=14 step=12292 epoch=14 metrics={'time_sample_batch': 0.00013669992633723996, 'time_algorithm_update': 0.00662463102362422, 'loss': -186.35415094154027, 'time_step': 0.0068031807423724125, 'observation_error': 1.2594006513682804e-05, 'reward_error': 3.0540305854605354e-06, 'variance': 2.454543805049739e-07} step=12292
2022-04-06 21:02.30 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_12292.pt


Epoch 15/15:   0%|          | 0/878 [00:00<?, ?it/s]

2022-04-06 21:02.37 [info     ] ProbabilisticEnsembleDynamics_20220406210048: epoch=15 step=13170 epoch=15 metrics={'time_sample_batch': 0.00014018985296436216, 'time_algorithm_update': 0.006717660432524453, 'loss': -188.058661486945, 'time_step': 0.0068993592859672255, 'observation_error': 1.0353658180396897e-05, 'reward_error': 2.9548844142438814e-06, 'variance': 3.397890133426663e-08} step=13170
2022-04-06 21:02.37 [info     ] Model parameters are saved to d3rlpy_logs/ProbabilisticEnsembleDynamics_20220406210048/model_13170.pt


[(1,
  {'time_sample_batch': 0.00015752923787319036,
   'time_algorithm_update': 0.006990694239362226,
   'loss': -130.10344823763828,
   'time_step': 0.007213200414913933,
   'observation_error': 2.5002094379265388e-05,
   'reward_error': 8.086704723235035e-05,
   'variance': 2.9265567855343244e-08}),
 (2,
  {'time_sample_batch': 0.0001431915374443037,
   'time_algorithm_update': 0.006612565903022783,
   'loss': -155.84424964713617,
   'time_step': 0.006800605930337059,
   'observation_error': 6.961562183296232e-05,
   'reward_error': 5.49882593996615e-05,
   'variance': 4.21097555442327e-07}),
 (3,
  {'time_sample_batch': 0.00015931710566909546,
   'time_algorithm_update': 0.007381611370008465,
   'loss': -163.18751751806306,
   'time_step': 0.007593790599588381,
   'observation_error': 1.946347884231508e-05,
   'reward_error': 3.987882613107416e-05,
   'variance': 5.701755339153832e-07}),
 (4,
  {'time_sample_batch': 0.00015286486892873987,
   'time_algorithm_update': 0.007048239740

In [12]:
%load_ext tensorboard
%tensorboard --logdir runs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 77774), started 0:06:26 ago. (Use '!kill 77774' to kill it.)

In [51]:
from d3rlpy.algos import COMBO
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer

# calculate metrics with test dataset
ave_error_init = average_value_estimation_scorer(model, test_episodes)
print(ave_error_init)



model = COMBO(q_func_factory='mean', #qr -> quantile regression q function, but you don't have to use this
            reward_scaler='standard',
        dynamics=dynamics,
          actor_learning_rate=3e-5, 
          critic_learning_rate=0.0003, 
            use_gpu=False) #change it to true if you have one
model.build_with_dataset(dataset)

model.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=5, 
        tensorboard_dir='runs',
        scorers={
            'td_error': td_error_scorer,
            'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer
        })

7.825822524261004
2022-04-06 21:02.37 [debug    ] RoundIterator is selected.
2022-04-06 21:02.37 [info     ] Directory is created at d3rlpy_logs/COMBO_20220406210237
2022-04-06 21:02.37 [debug    ] Fitting reward scaler...       reward_scaler=standard
2022-04-06 21:02.37 [info     ] Parameters are saved to d3rlpy_logs/COMBO_20220406210237/params.json params={'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'actor_learning_rate': 3e-05, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'batch_size': 256, 'conservative_weight': 1.0, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_learning_rate': 0.0003, 'critic_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'gamma': 0.

Epoch 1/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 21:02.39 [debug    ] 250000 transitions are generated. fake_transitions=250000 real_transitions=87813
2022-04-06 21:02.47 [info     ] COMBO_20220406210237: epoch=1 step=343 epoch=1 metrics={'time_sample_batch': 0.00029832142087530465, 'time_algorithm_update': 0.022318651655325043, 'critic_loss': 8.361403107295578, 'actor_loss': -0.34950176004114025, 'temp_loss': 4.2451235399996925, 'temp': 0.9834807690656567, 'time_step': 0.022685573677974958, 'td_error': 0.7849345799815419, 'init_value': 1.2416236400604248, 'ave_value': 1.2467925996318105} step=343
2022-04-06 21:02.47 [info     ] Model parameters are saved to d3rlpy_logs/COMBO_20220406210237/model_343.pt


Epoch 2/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 21:02.56 [info     ] COMBO_20220406210237: epoch=2 step=686 epoch=2 metrics={'time_sample_batch': 0.0004691702283853692, 'time_algorithm_update': 0.022114493756530584, 'critic_loss': 10.902209612787987, 'actor_loss': -1.2672697422651265, 'temp_loss': 4.124696317984133, 'temp': 0.9538840698431255, 'time_step': 0.022652481457234818, 'td_error': 0.38186449550558743, 'init_value': 1.7204447984695435, 'ave_value': 1.7246237275531804} step=686
2022-04-06 21:02.56 [info     ] Model parameters are saved to d3rlpy_logs/COMBO_20220406210237/model_686.pt


Epoch 3/5:   0%|          | 0/686 [00:00<?, ?it/s]

2022-04-06 21:03.05 [debug    ] 250000 transitions are generated. fake_transitions=500000 real_transitions=87813
2022-04-06 21:03.14 [info     ] COMBO_20220406210237: epoch=3 step=1372 epoch=3 metrics={'time_sample_batch': 0.0004773946267175257, 'time_algorithm_update': 0.022580020629282247, 'critic_loss': 9.356939674466414, 'actor_loss': -2.4688512112934458, 'temp_loss': 3.9773411139107306, 'temp': 0.9068102789689778, 'time_step': 0.02313340335823704, 'td_error': 0.5500945781026063, 'init_value': 3.9303884506225586, 'ave_value': 3.9425782593386556} step=1372
2022-04-06 21:03.14 [info     ] Model parameters are saved to d3rlpy_logs/COMBO_20220406210237/model_1372.pt


Epoch 4/5:   0%|          | 0/686 [00:00<?, ?it/s]

2022-04-06 21:03.30 [debug    ] 250000 transitions are generated. fake_transitions=750000 real_transitions=87813
2022-04-06 21:03.32 [info     ] COMBO_20220406210237: epoch=4 step=2058 epoch=4 metrics={'time_sample_batch': 0.00047442760133882306, 'time_algorithm_update': 0.022584208594119234, 'critic_loss': 9.187148015631184, 'actor_loss': -4.4536585303854315, 'temp_loss': 3.6122775547010906, 'temp': 0.8502891756181467, 'time_step': 0.023136194871396433, 'td_error': 1.0253053341514484, 'init_value': 5.48727560043335, 'ave_value': 5.514245598049522} step=2058
2022-04-06 21:03.32 [info     ] Model parameters are saved to d3rlpy_logs/COMBO_20220406210237/model_2058.pt


Epoch 5/5:   0%|          | 0/686 [00:00<?, ?it/s]

2022-04-06 21:03.48 [info     ] COMBO_20220406210237: epoch=5 step=2744 epoch=5 metrics={'time_sample_batch': 0.0004726339012123753, 'time_algorithm_update': 0.021931777195054656, 'critic_loss': 9.289198818429218, 'actor_loss': -6.1472682966782815, 'temp_loss': 3.331260523017572, 'temp': 0.7984906840776216, 'time_step': 0.022479728081483536, 'td_error': 1.6410055861786605, 'init_value': 6.912630081176758, 'ave_value': 6.957811238080075} step=2744
2022-04-06 21:03.48 [info     ] Model parameters are saved to d3rlpy_logs/COMBO_20220406210237/model_2744.pt


[(1,
  {'time_sample_batch': 0.00029832142087530465,
   'time_algorithm_update': 0.022318651655325043,
   'critic_loss': 8.361403107295578,
   'actor_loss': -0.34950176004114025,
   'temp_loss': 4.2451235399996925,
   'temp': 0.9834807690656567,
   'time_step': 0.022685573677974958,
   'td_error': 0.7849345799815419,
   'init_value': 1.2416236400604248,
   'ave_value': 1.2467925996318105}),
 (2,
  {'time_sample_batch': 0.0004691702283853692,
   'time_algorithm_update': 0.022114493756530584,
   'critic_loss': 10.902209612787987,
   'actor_loss': -1.2672697422651265,
   'temp_loss': 4.124696317984133,
   'temp': 0.9538840698431255,
   'time_step': 0.022652481457234818,
   'td_error': 0.38186449550558743,
   'init_value': 1.7204447984695435,
   'ave_value': 1.7246237275531804}),
 (3,
  {'time_sample_batch': 0.0004773946267175257,
   'time_algorithm_update': 0.022580020629282247,
   'critic_loss': 9.356939674466414,
   'actor_loss': -2.4688512112934458,
   'temp_loss': 3.9773411139107306,


## Off-Policy Evaluation

We do get some metrics on a test set of initial state value and average value. However, these estimates (using the critic's Q-function) of model performance are biased. They're useful for validation during training, but not much else. Instead, we fit a Q-function to the data (or a separate dataset, as I've done here) separately and evaluate the model's performance on it.

Feel free to change the chunks and number of steps.

In [52]:
from d3rlpy.ope import FQE
# metrics to evaluate with
from d3rlpy.metrics.scorer import soft_opc_scorer

ope_dataset = get_dataset([2,4,6,8], path="collected_data/rl_det_small.txt") #change if you'd prefer different chunks
ope_train_episodes, ope_test_episodes = train_test_split(ope_dataset, test_size=0.2)

fqe = FQE(algo=model, use_gpu=False) #change this if you have one!
fqe.fit(ope_train_episodes, eval_episodes=ope_test_episodes, 
        tensorboard_dir='runs',
        n_epochs=50, n_steps_per_epoch=1000, #change if overfitting/underfitting
        scorers={
           'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer,
           'soft_opc': soft_opc_scorer(return_threshold=0)
        })

[ 0.00000000e+00  7.95731469e+08 -1.03891077e-02 -1.41999953e-02
 -2.10001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -1.78778459e-03 -1.34615461e-02  4.84073546e-02]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.24891077e-02 -1.35999953e-02
 -4.20001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -6.23311010e-02 -1.64283998e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08  7.01089229e-03 -4.19999531e-03
  7.39998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.21623335e-01 -2.86362315e-02 -8.00043364e-02]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.03989108e-01 -1.37999953e-02
  7.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.76352555e-01 -3.26280816e-01  6.00000000e-01]
Read chunk # 9 out of 10000
torch.Size([111080, 6])
2022-04-06 21:03.49 [debug    ] RoundIterator is selected.
2022-04-06 21:03.49 [info     ] Directory is created at d3rlpy_logs/FQE_2022040621034

Epoch 1/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.50 [info     ] FQE_20220406210349: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.00012873598546264927, 'time_algorithm_update': 0.001486791293550461, 'loss': 0.00026482629515046024, 'time_step': 0.0016694283431103127, 'init_value': -0.18876513838768005, 'ave_value': -0.18954894687402812, 'soft_opc': nan} step=878




2022-04-06 21:03.50 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_878.pt


Epoch 2/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.52 [info     ] FQE_20220406210349: epoch=2 step=1756 epoch=2 metrics={'time_sample_batch': 0.0001276546838886374, 'time_algorithm_update': 0.0015851612243130973, 'loss': 0.0007146051946667641, 'time_step': 0.0017693012884917726, 'init_value': -0.2967626452445984, 'ave_value': -0.2972891433198647, 'soft_opc': nan} step=1756




2022-04-06 21:03.52 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_1756.pt


Epoch 3/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.54 [info     ] FQE_20220406210349: epoch=3 step=2634 epoch=3 metrics={'time_sample_batch': 0.00012643407852068578, 'time_algorithm_update': 0.0016615939303248237, 'loss': 0.0012968413317400534, 'time_step': 0.0018410397553498218, 'init_value': -0.4171195924282074, 'ave_value': -0.4174483638273156, 'soft_opc': nan} step=2634




2022-04-06 21:03.54 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_2634.pt


Epoch 4/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.56 [info     ] FQE_20220406210349: epoch=4 step=3512 epoch=4 metrics={'time_sample_batch': 0.00012486073313650075, 'time_algorithm_update': 0.0015624658393425387, 'loss': 0.00205733014416489, 'time_step': 0.0017407282609874404, 'init_value': -0.5100998282432556, 'ave_value': -0.510274434613972, 'soft_opc': nan} step=3512




2022-04-06 21:03.56 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_3512.pt


Epoch 5/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.57 [info     ] FQE_20220406210349: epoch=5 step=4390 epoch=5 metrics={'time_sample_batch': 0.00012519880959547733, 'time_algorithm_update': 0.0015716449546379488, 'loss': 0.002931711669505371, 'time_step': 0.0017511206498721738, 'init_value': -0.5442318916320801, 'ave_value': -0.54426777774551, 'soft_opc': nan} step=4390




2022-04-06 21:03.57 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_4390.pt


Epoch 6/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:03.59 [info     ] FQE_20220406210349: epoch=6 step=5268 epoch=6 metrics={'time_sample_batch': 0.00010462339483795514, 'time_algorithm_update': 0.0012948238768175685, 'loss': 0.0034336098094818867, 'time_step': 0.0014427292048252254, 'init_value': -0.6346944570541382, 'ave_value': -0.6344719484050879, 'soft_opc': nan} step=5268




2022-04-06 21:03.59 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_5268.pt


Epoch 7/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.00 [info     ] FQE_20220406210349: epoch=7 step=6146 epoch=7 metrics={'time_sample_batch': 0.0001052142818859335, 'time_algorithm_update': 0.0013060376964560403, 'loss': 0.004343272875984831, 'time_step': 0.0014549596977668363, 'init_value': -0.6856957674026489, 'ave_value': -0.6852442257623663, 'soft_opc': nan} step=6146




2022-04-06 21:04.00 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_6146.pt


Epoch 8/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.02 [info     ] FQE_20220406210349: epoch=8 step=7024 epoch=8 metrics={'time_sample_batch': 0.00011213222234287132, 'time_algorithm_update': 0.0013899072699231818, 'loss': 0.004766666127117812, 'time_step': 0.0015504140245615756, 'init_value': -0.6967993974685669, 'ave_value': -0.6960851119286763, 'soft_opc': nan} step=7024




2022-04-06 21:04.02 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_7024.pt


Epoch 9/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.04 [info     ] FQE_20220406210349: epoch=9 step=7902 epoch=9 metrics={'time_sample_batch': 0.0001123114435982324, 'time_algorithm_update': 0.0013972271004678992, 'loss': 0.005560340047141796, 'time_step': 0.0015585987334370884, 'init_value': -0.7748829126358032, 'ave_value': -0.7739987973056799, 'soft_opc': nan} step=7902




2022-04-06 21:04.04 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_7902.pt


Epoch 10/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.05 [info     ] FQE_20220406210349: epoch=10 step=8780 epoch=10 metrics={'time_sample_batch': 0.00010230465195988197, 'time_algorithm_update': 0.0012734074798966323, 'loss': 0.006610026745240086, 'time_step': 0.0014191515624930485, 'init_value': -0.855553925037384, 'ave_value': -0.8546864779165496, 'soft_opc': nan} step=8780




2022-04-06 21:04.05 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_8780.pt


Epoch 11/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.07 [info     ] FQE_20220406210349: epoch=11 step=9658 epoch=11 metrics={'time_sample_batch': 0.00010815378202121187, 'time_algorithm_update': 0.0013825230826549487, 'loss': 0.007521865196765925, 'time_step': 0.0015361852146226888, 'init_value': -0.9016329646110535, 'ave_value': -0.9007947301499604, 'soft_opc': nan} step=9658




2022-04-06 21:04.07 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_9658.pt


Epoch 12/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.08 [info     ] FQE_20220406210349: epoch=12 step=10536 epoch=12 metrics={'time_sample_batch': 0.00011017436590172978, 'time_algorithm_update': 0.0013994964217270698, 'loss': 0.007986642295353433, 'time_step': 0.0015582859108822765, 'init_value': -0.9097276329994202, 'ave_value': -0.9092903990414435, 'soft_opc': nan} step=10536




2022-04-06 21:04.08 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_10536.pt


Epoch 13/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.10 [info     ] FQE_20220406210349: epoch=13 step=11414 epoch=13 metrics={'time_sample_batch': 0.00010970513206951168, 'time_algorithm_update': 0.0013593560199259625, 'loss': 0.008906486512400813, 'time_step': 0.001517310772506957, 'init_value': -0.9684196710586548, 'ave_value': -0.9676022156131447, 'soft_opc': nan} step=11414




2022-04-06 21:04.10 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_11414.pt


Epoch 14/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.11 [info     ] FQE_20220406210349: epoch=14 step=12292 epoch=14 metrics={'time_sample_batch': 0.00010758217483555265, 'time_algorithm_update': 0.0013617309731068536, 'loss': 0.009710270782493364, 'time_step': 0.0015159834490278588, 'init_value': -1.0077133178710938, 'ave_value': -1.0070498156748415, 'soft_opc': nan} step=12292




2022-04-06 21:04.11 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_12292.pt


Epoch 15/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.13 [info     ] FQE_20220406210349: epoch=15 step=13170 epoch=15 metrics={'time_sample_batch': 0.00010824475038567545, 'time_algorithm_update': 0.0013646742749051245, 'loss': 0.009926598658212594, 'time_step': 0.0015155834597715757, 'init_value': -1.006379246711731, 'ave_value': -1.005781907768965, 'soft_opc': nan} step=13170




2022-04-06 21:04.13 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_13170.pt


Epoch 16/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.14 [info     ] FQE_20220406210349: epoch=16 step=14048 epoch=16 metrics={'time_sample_batch': 0.00010847800956500147, 'time_algorithm_update': 0.0013628671272469001, 'loss': 0.00981940318077432, 'time_step': 0.001519343847565879, 'init_value': -0.9781472086906433, 'ave_value': -0.9776438263553681, 'soft_opc': nan} step=14048




2022-04-06 21:04.14 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_14048.pt


Epoch 17/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.16 [info     ] FQE_20220406210349: epoch=17 step=14926 epoch=17 metrics={'time_sample_batch': 0.0001134782825895757, 'time_algorithm_update': 0.0013964358114707442, 'loss': 0.00962598658516424, 'time_step': 0.001561621870158202, 'init_value': -0.978163480758667, 'ave_value': -0.9776887024135247, 'soft_opc': nan} step=14926




2022-04-06 21:04.16 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_14926.pt


Epoch 18/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.18 [info     ] FQE_20220406210349: epoch=18 step=15804 epoch=18 metrics={'time_sample_batch': 0.00011673196301644918, 'time_algorithm_update': 0.0014647734192346387, 'loss': 0.009549127089320706, 'time_step': 0.0016288700299273862, 'init_value': -0.9850871562957764, 'ave_value': -0.9845384580423668, 'soft_opc': nan} step=15804




2022-04-06 21:04.18 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_15804.pt


Epoch 19/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.19 [info     ] FQE_20220406210349: epoch=19 step=16682 epoch=19 metrics={'time_sample_batch': 0.00011418593500089537, 'time_algorithm_update': 0.0015165990468852884, 'loss': 0.010091787978406062, 'time_step': 0.001684350413450619, 'init_value': -1.0397145748138428, 'ave_value': -1.039304403894356, 'soft_opc': nan} step=16682




2022-04-06 21:04.19 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_16682.pt


Epoch 20/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.21 [info     ] FQE_20220406210349: epoch=20 step=17560 epoch=20 metrics={'time_sample_batch': 0.00011446046937842576, 'time_algorithm_update': 0.0014208875647438413, 'loss': 0.010080873073375001, 'time_step': 0.0015857032328368862, 'init_value': -0.9988876581192017, 'ave_value': -0.9984071463293834, 'soft_opc': nan} step=17560




2022-04-06 21:04.21 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_17560.pt


Epoch 21/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.23 [info     ] FQE_20220406210349: epoch=21 step=18438 epoch=21 metrics={'time_sample_batch': 0.00011595452293448133, 'time_algorithm_update': 0.0014129972132028914, 'loss': 0.00939842583711072, 'time_step': 0.0015797357081280754, 'init_value': -0.9841761589050293, 'ave_value': -0.9837058802583422, 'soft_opc': nan} step=18438




2022-04-06 21:04.23 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_18438.pt


Epoch 22/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.24 [info     ] FQE_20220406210349: epoch=22 step=19316 epoch=22 metrics={'time_sample_batch': 0.00011206569324050243, 'time_algorithm_update': 0.001390818854399314, 'loss': 0.009182520656389139, 'time_step': 0.0015519113366859105, 'init_value': -0.966090738773346, 'ave_value': -0.9654797735613596, 'soft_opc': nan} step=19316




2022-04-06 21:04.24 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_19316.pt


Epoch 23/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.26 [info     ] FQE_20220406210349: epoch=23 step=20194 epoch=23 metrics={'time_sample_batch': 0.00012898906759900764, 'time_algorithm_update': 0.0015801378697632114, 'loss': 0.009096322330611379, 'time_step': 0.0017675785920614534, 'init_value': -0.9831241369247437, 'ave_value': -0.9826143079571422, 'soft_opc': nan} step=20194




2022-04-06 21:04.26 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_20194.pt


Epoch 24/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.28 [info     ] FQE_20220406210349: epoch=24 step=21072 epoch=24 metrics={'time_sample_batch': 0.0001323649443363546, 'time_algorithm_update': 0.0016383573514725462, 'loss': 0.009450954489992017, 'time_step': 0.0018305026317240165, 'init_value': -0.9886300563812256, 'ave_value': -0.9879731048777164, 'soft_opc': nan} step=21072




2022-04-06 21:04.28 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_21072.pt


Epoch 25/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.30 [info     ] FQE_20220406210349: epoch=25 step=21950 epoch=25 metrics={'time_sample_batch': 0.00013187615909446073, 'time_algorithm_update': 0.001649934501474159, 'loss': 0.009939520506632438, 'time_step': 0.0018405373927400976, 'init_value': -1.0323238372802734, 'ave_value': -1.0318427323544486, 'soft_opc': nan} step=21950




2022-04-06 21:04.30 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_21950.pt


Epoch 26/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.32 [info     ] FQE_20220406210349: epoch=26 step=22828 epoch=26 metrics={'time_sample_batch': 0.0001355906553854975, 'time_algorithm_update': 0.0018297995956177591, 'loss': 0.010741480848063928, 'time_step': 0.0020234185633735396, 'init_value': -1.0637402534484863, 'ave_value': -1.0630802139778222, 'soft_opc': nan} step=22828




2022-04-06 21:04.32 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_22828.pt


Epoch 27/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.34 [info     ] FQE_20220406210349: epoch=27 step=23706 epoch=27 metrics={'time_sample_batch': 0.00013176672550974783, 'time_algorithm_update': 0.0016833014260120435, 'loss': 0.010903637418354418, 'time_step': 0.001867795859491092, 'init_value': -1.0467240810394287, 'ave_value': -1.046329939619713, 'soft_opc': nan} step=23706




2022-04-06 21:04.34 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_23706.pt


Epoch 28/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.36 [info     ] FQE_20220406210349: epoch=28 step=24584 epoch=28 metrics={'time_sample_batch': 0.00012121928308439146, 'time_algorithm_update': 0.0015356168660053088, 'loss': 0.011187156007828832, 'time_step': 0.0017037842431209625, 'init_value': -1.054868221282959, 'ave_value': -1.0545387138446485, 'soft_opc': nan} step=24584




2022-04-06 21:04.36 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_24584.pt


Epoch 29/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.37 [info     ] FQE_20220406210349: epoch=29 step=25462 epoch=29 metrics={'time_sample_batch': 0.00012322357412351291, 'time_algorithm_update': 0.001563571851726002, 'loss': 0.012138191059762252, 'time_step': 0.001737281510390019, 'init_value': -1.162110686302185, 'ave_value': -1.1614025097996579, 'soft_opc': nan} step=25462




2022-04-06 21:04.37 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_25462.pt


Epoch 30/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.39 [info     ] FQE_20220406210349: epoch=30 step=26340 epoch=30 metrics={'time_sample_batch': 0.0001244496104385967, 'time_algorithm_update': 0.0015936647297852674, 'loss': 0.013023593653510664, 'time_step': 0.001769238017957683, 'init_value': -1.1353942155838013, 'ave_value': -1.1347990890676065, 'soft_opc': nan} step=26340




2022-04-06 21:04.39 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_26340.pt


Epoch 31/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.41 [info     ] FQE_20220406210349: epoch=31 step=27218 epoch=31 metrics={'time_sample_batch': 0.00012680202518888922, 'time_algorithm_update': 0.0016073670093996921, 'loss': 0.013139474333756113, 'time_step': 0.0017851493353181114, 'init_value': -1.1621512174606323, 'ave_value': -1.1614299428394406, 'soft_opc': nan} step=27218




2022-04-06 21:04.41 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_27218.pt


Epoch 32/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.43 [info     ] FQE_20220406210349: epoch=32 step=28096 epoch=32 metrics={'time_sample_batch': 0.00012188213018187087, 'time_algorithm_update': 0.0015414217338475117, 'loss': 0.013235146816295498, 'time_step': 0.0017115567431091447, 'init_value': -1.1823135614395142, 'ave_value': -1.1816039349190777, 'soft_opc': nan} step=28096




2022-04-06 21:04.43 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_28096.pt


Epoch 33/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.45 [info     ] FQE_20220406210349: epoch=33 step=28974 epoch=33 metrics={'time_sample_batch': 0.0001258708893030814, 'time_algorithm_update': 0.0016715010640832991, 'loss': 0.012874954023032356, 'time_step': 0.001847913162040276, 'init_value': -1.158544659614563, 'ave_value': -1.1573493378870934, 'soft_opc': nan} step=28974




2022-04-06 21:04.45 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_28974.pt


Epoch 34/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.46 [info     ] FQE_20220406210349: epoch=34 step=29852 epoch=34 metrics={'time_sample_batch': 0.00012091460695027762, 'time_algorithm_update': 0.0015246333185252405, 'loss': 0.013123990123036194, 'time_step': 0.0016941619625395687, 'init_value': -1.1542315483093262, 'ave_value': -1.1531439594245774, 'soft_opc': nan} step=29852




2022-04-06 21:04.46 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_29852.pt


Epoch 35/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.48 [info     ] FQE_20220406210349: epoch=35 step=30730 epoch=35 metrics={'time_sample_batch': 0.00011774564929866574, 'time_algorithm_update': 0.0014789579669543986, 'loss': 0.013129642643245072, 'time_step': 0.0016423920020970234, 'init_value': -1.1325730085372925, 'ave_value': -1.131432250133323, 'soft_opc': nan} step=30730




2022-04-06 21:04.48 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_30730.pt


Epoch 36/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.50 [info     ] FQE_20220406210349: epoch=36 step=31608 epoch=36 metrics={'time_sample_batch': 0.00012169774952673423, 'time_algorithm_update': 0.0015298559888348765, 'loss': 0.012480924252218892, 'time_step': 0.0017015784639432382, 'init_value': -1.1202141046524048, 'ave_value': -1.1191341531996484, 'soft_opc': nan} step=31608




2022-04-06 21:04.50 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_31608.pt


Epoch 37/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.51 [info     ] FQE_20220406210349: epoch=37 step=32486 epoch=37 metrics={'time_sample_batch': 0.00012041115815112694, 'time_algorithm_update': 0.001516125196748008, 'loss': 0.01232358361559698, 'time_step': 0.0016829983791620694, 'init_value': -1.0868902206420898, 'ave_value': -1.0857014214047034, 'soft_opc': nan} step=32486




2022-04-06 21:04.51 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_32486.pt


Epoch 38/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.53 [info     ] FQE_20220406210349: epoch=38 step=33364 epoch=38 metrics={'time_sample_batch': 0.00012091433540292101, 'time_algorithm_update': 0.0015333923500599786, 'loss': 0.011327263214572506, 'time_step': 0.00169823870300432, 'init_value': -1.0475090742111206, 'ave_value': -1.0464220454377822, 'soft_opc': nan} step=33364




2022-04-06 21:04.53 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_33364.pt


Epoch 39/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.55 [info     ] FQE_20220406210349: epoch=39 step=34242 epoch=39 metrics={'time_sample_batch': 0.00012360075340184102, 'time_algorithm_update': 0.00160211121031255, 'loss': 0.01130375715273428, 'time_step': 0.0017752302534759451, 'init_value': -1.092948317527771, 'ave_value': -1.0916371175915283, 'soft_opc': nan} step=34242




2022-04-06 21:04.55 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_34242.pt


Epoch 40/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.57 [info     ] FQE_20220406210349: epoch=40 step=35120 epoch=40 metrics={'time_sample_batch': 0.00012344841533478408, 'time_algorithm_update': 0.0015859783103091298, 'loss': 0.011377011873327752, 'time_step': 0.001757483275984849, 'init_value': -1.0715545415878296, 'ave_value': -1.0704115527222735, 'soft_opc': nan} step=35120




2022-04-06 21:04.57 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_35120.pt


Epoch 41/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:04.59 [info     ] FQE_20220406210349: epoch=41 step=35998 epoch=41 metrics={'time_sample_batch': 0.00012369280795573103, 'time_algorithm_update': 0.0015492805858950951, 'loss': 0.011026281666148144, 'time_step': 0.0017227512828852972, 'init_value': -1.0581353902816772, 'ave_value': -1.0571506165878202, 'soft_opc': nan} step=35998




2022-04-06 21:04.59 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_35998.pt


Epoch 42/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.00 [info     ] FQE_20220406210349: epoch=42 step=36876 epoch=42 metrics={'time_sample_batch': 0.0001169122704612367, 'time_algorithm_update': 0.0014560138446051874, 'loss': 0.01068961521715688, 'time_step': 0.0016164271869529082, 'init_value': -1.0758672952651978, 'ave_value': -1.0749184304499273, 'soft_opc': nan} step=36876




2022-04-06 21:05.00 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_36876.pt


Epoch 43/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.02 [info     ] FQE_20220406210349: epoch=43 step=37754 epoch=43 metrics={'time_sample_batch': 0.00011813369047125815, 'time_algorithm_update': 0.0014683119528385935, 'loss': 0.010334005813085266, 'time_step': 0.0016354365881048737, 'init_value': -1.0412214994430542, 'ave_value': -1.0401297405523446, 'soft_opc': nan} step=37754




2022-04-06 21:05.02 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_37754.pt


Epoch 44/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.04 [info     ] FQE_20220406210349: epoch=44 step=38632 epoch=44 metrics={'time_sample_batch': 0.00011736086669435263, 'time_algorithm_update': 0.0014564369153867822, 'loss': 0.011266781011510792, 'time_step': 0.0016197946457222002, 'init_value': -1.0928630828857422, 'ave_value': -1.0916006532452778, 'soft_opc': nan} step=38632




2022-04-06 21:05.04 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_38632.pt


Epoch 45/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.05 [info     ] FQE_20220406210349: epoch=45 step=39510 epoch=45 metrics={'time_sample_batch': 0.00011893747064681695, 'time_algorithm_update': 0.0015501693603932722, 'loss': 0.011230586664487108, 'time_step': 0.001717420265180375, 'init_value': -1.052027940750122, 'ave_value': -1.0508492030073413, 'soft_opc': nan} step=39510




2022-04-06 21:05.05 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_39510.pt


Epoch 46/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.07 [info     ] FQE_20220406210349: epoch=46 step=40388 epoch=46 metrics={'time_sample_batch': 0.000117670702228242, 'time_algorithm_update': 0.0014867375271738527, 'loss': 0.010897536025797913, 'time_step': 0.0016488135539860823, 'init_value': -1.0336993932724, 'ave_value': -1.0326984709369784, 'soft_opc': nan} step=40388




2022-04-06 21:05.07 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_40388.pt


Epoch 47/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.09 [info     ] FQE_20220406210349: epoch=47 step=41266 epoch=47 metrics={'time_sample_batch': 0.00012109274201621227, 'time_algorithm_update': 0.0015309698760916813, 'loss': 0.01082330633747592, 'time_step': 0.001701280304945683, 'init_value': -1.0921905040740967, 'ave_value': -1.0911232546581362, 'soft_opc': nan} step=41266




2022-04-06 21:05.09 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_41266.pt


Epoch 48/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.11 [info     ] FQE_20220406210349: epoch=48 step=42144 epoch=48 metrics={'time_sample_batch': 0.00012102295434556409, 'time_algorithm_update': 0.0015146490653174885, 'loss': 0.011661128615346497, 'time_step': 0.0016821440911781815, 'init_value': -1.1154160499572754, 'ave_value': -1.1143687740315906, 'soft_opc': nan} step=42144




2022-04-06 21:05.11 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_42144.pt


Epoch 49/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.12 [info     ] FQE_20220406210349: epoch=49 step=43022 epoch=49 metrics={'time_sample_batch': 0.00012097298963194828, 'time_algorithm_update': 0.0015310236424682896, 'loss': 0.012446001351321132, 'time_step': 0.0016994193909108503, 'init_value': -1.1369554996490479, 'ave_value': -1.1360788647342421, 'soft_opc': nan} step=43022




2022-04-06 21:05.12 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_43022.pt


Epoch 50/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:05.14 [info     ] FQE_20220406210349: epoch=50 step=43900 epoch=50 metrics={'time_sample_batch': 0.00012210941531935152, 'time_algorithm_update': 0.0016487456671469304, 'loss': 0.012167671869002937, 'time_step': 0.0018187525060562447, 'init_value': -1.1216630935668945, 'ave_value': -1.120769039818994, 'soft_opc': nan} step=43900




2022-04-06 21:05.14 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210349/model_43900.pt


[(1,
  {'time_sample_batch': 0.00012873598546264927,
   'time_algorithm_update': 0.001486791293550461,
   'loss': 0.00026482629515046024,
   'time_step': 0.0016694283431103127,
   'init_value': -0.18876513838768005,
   'ave_value': -0.18954894687402812,
   'soft_opc': nan}),
 (2,
  {'time_sample_batch': 0.0001276546838886374,
   'time_algorithm_update': 0.0015851612243130973,
   'loss': 0.0007146051946667641,
   'time_step': 0.0017693012884917726,
   'init_value': -0.2967626452445984,
   'ave_value': -0.2972891433198647,
   'soft_opc': nan}),
 (3,
  {'time_sample_batch': 0.00012643407852068578,
   'time_algorithm_update': 0.0016615939303248237,
   'loss': 0.0012968413317400534,
   'time_step': 0.0018410397553498218,
   'init_value': -0.4171195924282074,
   'ave_value': -0.4174483638273156,
   'soft_opc': nan}),
 (4,
  {'time_sample_batch': 0.00012486073313650075,
   'time_algorithm_update': 0.0015624658393425387,
   'loss': 0.00205733014416489,
   'time_step': 0.0017407282609874404,
  

In [55]:
from d3rlpy.ope import FQE
# metrics to evaluate with
from d3rlpy.metrics.scorer import soft_opc_scorer

ope_dataset = get_dataset([2,4,6,8], path="collected_data/rl_stoch_small.txt") #change if you'd prefer different chunks
ope_train_episodes, ope_test_episodes = train_test_split(ope_dataset, test_size=0.2)

fqe = FQE(algo=model, use_gpu=False) #change this if you have one!
fqe.fit(ope_train_episodes, eval_episodes=ope_test_episodes, 
        tensorboard_dir='runs',
        n_epochs=50, n_steps_per_epoch=1000, #change if overfitting/underfitting
        scorers={
           'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer,
           'soft_opc': soft_opc_scorer(return_threshold=0)
        })

[ 0.00000000e+00  7.95731469e+08  1.39310892e-01  1.82000047e-02
 -1.00013420e-04  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -7.50230117e-02  3.69851546e-01 -6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.15389108e-01  1.64000047e-02
 -8.80001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -3.08831172e-01 -2.48178665e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.28589108e-01  1.20000047e-02
  1.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  7.75212759e-03 -3.52719043e-01  6.00000000e-01]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.03989108e-01  7.00000469e-03
 -8.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.65974295e-01 -2.19295880e-01  6.00000000e-01]
Read chunk # 9 out of 10000
torch.Size([111080, 6])
2022-04-06 21:07.38 [debug    ] RoundIterator is selected.
2022-04-06 21:07.38 [info     ] Directory is created at d3rlpy_logs/FQE_2022040621073

Epoch 1/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.40 [info     ] FQE_20220406210738: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.00011441077621216655, 'time_algorithm_update': 0.001423091714637426, 'loss': 0.0012160921468460657, 'time_step': 0.0015893791694032847, 'init_value': -0.48166346549987793, 'ave_value': -0.48177142936358197, 'soft_opc': nan} step=878




2022-04-06 21:07.40 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_878.pt


Epoch 2/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.42 [info     ] FQE_20220406210738: epoch=2 step=1756 epoch=2 metrics={'time_sample_batch': 0.00011742630960729508, 'time_algorithm_update': 0.0014501519518180968, 'loss': 0.004580605509955626, 'time_step': 0.001620275827638109, 'init_value': -0.8132042288780212, 'ave_value': -0.813346434212975, 'soft_opc': nan} step=1756




2022-04-06 21:07.42 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_1756.pt


Epoch 3/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.43 [info     ] FQE_20220406210738: epoch=3 step=2634 epoch=3 metrics={'time_sample_batch': 0.00010611663375194088, 'time_algorithm_update': 0.0013551312861518599, 'loss': 0.010369343410954709, 'time_step': 0.001508662532293715, 'init_value': -1.1891367435455322, 'ave_value': -1.1893279653407203, 'soft_opc': nan} step=2634




2022-04-06 21:07.43 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_2634.pt


Epoch 4/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.45 [info     ] FQE_20220406210738: epoch=4 step=3512 epoch=4 metrics={'time_sample_batch': 0.00010223975214165272, 'time_algorithm_update': 0.0012770095555820335, 'loss': 0.016514954606140823, 'time_step': 0.0014186293769262919, 'init_value': -1.3240128755569458, 'ave_value': -1.3241843364374488, 'soft_opc': nan} step=3512




2022-04-06 21:07.45 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_3512.pt


Epoch 5/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.46 [info     ] FQE_20220406210738: epoch=5 step=4390 epoch=5 metrics={'time_sample_batch': 0.00011217512682521533, 'time_algorithm_update': 0.0013975203716130355, 'loss': 0.021056382262657214, 'time_step': 0.0015564160357846757, 'init_value': -1.4839987754821777, 'ave_value': -1.4841913196435468, 'soft_opc': nan} step=4390




2022-04-06 21:07.46 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_4390.pt


Epoch 6/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.48 [info     ] FQE_20220406210738: epoch=6 step=5268 epoch=6 metrics={'time_sample_batch': 0.00010780728759418044, 'time_algorithm_update': 0.0013895089099510384, 'loss': 0.02457656273003696, 'time_step': 0.0015430227770620707, 'init_value': -1.5111372470855713, 'ave_value': -1.511319507989146, 'soft_opc': nan} step=5268




2022-04-06 21:07.48 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_5268.pt


Epoch 7/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.49 [info     ] FQE_20220406210738: epoch=7 step=6146 epoch=7 metrics={'time_sample_batch': 0.00011219630751903073, 'time_algorithm_update': 0.0013773406013540906, 'loss': 0.0270455762783429, 'time_step': 0.0015381408986849775, 'init_value': -1.638981580734253, 'ave_value': -1.6392109259674825, 'soft_opc': nan} step=6146




2022-04-06 21:07.49 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_6146.pt


Epoch 8/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.51 [info     ] FQE_20220406210738: epoch=8 step=7024 epoch=8 metrics={'time_sample_batch': 0.00011371914509488672, 'time_algorithm_update': 0.0013882858606568773, 'loss': 0.03107505308242808, 'time_step': 0.0015489881293920286, 'init_value': -1.7358195781707764, 'ave_value': -1.7360410706074756, 'soft_opc': nan} step=7024




2022-04-06 21:07.51 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_7024.pt


Epoch 9/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.52 [info     ] FQE_20220406210738: epoch=9 step=7902 epoch=9 metrics={'time_sample_batch': 0.0001123820459109504, 'time_algorithm_update': 0.0013704881038100953, 'loss': 0.033631655646278805, 'time_step': 0.0015326002464207539, 'init_value': -1.8400720357894897, 'ave_value': -1.8403201358022534, 'soft_opc': nan} step=7902




2022-04-06 21:07.52 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_7902.pt


Epoch 10/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.54 [info     ] FQE_20220406210738: epoch=10 step=8780 epoch=10 metrics={'time_sample_batch': 0.00011328276849281815, 'time_algorithm_update': 0.001355024568040713, 'loss': 0.03863802308645608, 'time_step': 0.0015188390410299454, 'init_value': -1.955252766609192, 'ave_value': -1.9554760425900855, 'soft_opc': nan} step=8780




2022-04-06 21:07.54 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_8780.pt


Epoch 11/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.56 [info     ] FQE_20220406210738: epoch=11 step=9658 epoch=11 metrics={'time_sample_batch': 0.00010978822556063363, 'time_algorithm_update': 0.0013943576595506256, 'loss': 0.04161569715207885, 'time_step': 0.0015515971563943155, 'init_value': -1.9574847221374512, 'ave_value': -1.9577400242768805, 'soft_opc': nan} step=9658




2022-04-06 21:07.56 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_9658.pt


Epoch 12/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.57 [info     ] FQE_20220406210738: epoch=12 step=10536 epoch=12 metrics={'time_sample_batch': 0.00011433637223645604, 'time_algorithm_update': 0.0014855679727089433, 'loss': 0.04204821414470751, 'time_step': 0.0016498649853508673, 'init_value': -2.0717697143554688, 'ave_value': -2.072038046663674, 'soft_opc': nan} step=10536




2022-04-06 21:07.57 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_10536.pt


Epoch 13/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:07.59 [info     ] FQE_20220406210738: epoch=13 step=11414 epoch=13 metrics={'time_sample_batch': 0.0001103019931593354, 'time_algorithm_update': 0.00134489232152361, 'loss': 0.0431493878667982, 'time_step': 0.001504068494114626, 'init_value': -1.9922022819519043, 'ave_value': -1.9924882210569386, 'soft_opc': nan} step=11414




2022-04-06 21:07.59 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_11414.pt


Epoch 14/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.00 [info     ] FQE_20220406210738: epoch=14 step=12292 epoch=14 metrics={'time_sample_batch': 0.00011931790849342433, 'time_algorithm_update': 0.0014310920428578023, 'loss': 0.044116638565871835, 'time_step': 0.0016057477725122404, 'init_value': -2.0568687915802, 'ave_value': -2.0571716389654124, 'soft_opc': nan} step=12292




2022-04-06 21:08.00 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_12292.pt


Epoch 15/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.02 [info     ] FQE_20220406210738: epoch=15 step=13170 epoch=15 metrics={'time_sample_batch': 0.00011035575953594373, 'time_algorithm_update': 0.001312759036626794, 'loss': 0.04706477951954278, 'time_step': 0.0014730751650208493, 'init_value': -2.131348133087158, 'ave_value': -2.13165755924088, 'soft_opc': nan} step=13170




2022-04-06 21:08.02 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_13170.pt


Epoch 16/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.04 [info     ] FQE_20220406210738: epoch=16 step=14048 epoch=16 metrics={'time_sample_batch': 0.00011658695672802067, 'time_algorithm_update': 0.0014514624393610856, 'loss': 0.04689963183631094, 'time_step': 0.0016213943311999763, 'init_value': -2.1570379734039307, 'ave_value': -2.1573657027868474, 'soft_opc': nan} step=14048




2022-04-06 21:08.04 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_14048.pt


Epoch 17/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.05 [info     ] FQE_20220406210738: epoch=17 step=14926 epoch=17 metrics={'time_sample_batch': 0.00011361405626787954, 'time_algorithm_update': 0.001349180325831802, 'loss': 0.04980500104289119, 'time_step': 0.0015144644131149952, 'init_value': -2.1130576133728027, 'ave_value': -2.113394008915229, 'soft_opc': nan} step=14926




2022-04-06 21:08.05 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_14926.pt


Epoch 18/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.07 [info     ] FQE_20220406210738: epoch=18 step=15804 epoch=18 metrics={'time_sample_batch': 0.0001136686372865577, 'time_algorithm_update': 0.0013884631810807421, 'loss': 0.047973340375674335, 'time_step': 0.001553461600544784, 'init_value': -2.096942901611328, 'ave_value': -2.097263014731796, 'soft_opc': nan} step=15804




2022-04-06 21:08.07 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_15804.pt


Epoch 19/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.08 [info     ] FQE_20220406210738: epoch=19 step=16682 epoch=19 metrics={'time_sample_batch': 0.00010638600872969573, 'time_algorithm_update': 0.001343217145880697, 'loss': 0.04587532281870887, 'time_step': 0.001497316740639932, 'init_value': -2.045405626296997, 'ave_value': -2.0457155897969725, 'soft_opc': nan} step=16682




2022-04-06 21:08.08 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_16682.pt


Epoch 20/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.10 [info     ] FQE_20220406210738: epoch=20 step=17560 epoch=20 metrics={'time_sample_batch': 0.00010427391339000103, 'time_algorithm_update': 0.0013430762128026177, 'loss': 0.04700227192249231, 'time_step': 0.0014925182273013173, 'init_value': -2.10949444770813, 'ave_value': -2.1098272289633453, 'soft_opc': nan} step=17560




2022-04-06 21:08.10 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_17560.pt


Epoch 21/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.11 [info     ] FQE_20220406210738: epoch=21 step=18438 epoch=21 metrics={'time_sample_batch': 0.00010523220401146961, 'time_algorithm_update': 0.001311143601402335, 'loss': 0.04952541519700377, 'time_step': 0.001462481287997513, 'init_value': -2.2521586418151855, 'ave_value': -2.2525230894789843, 'soft_opc': nan} step=18438




2022-04-06 21:08.11 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_18438.pt


Epoch 22/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.13 [info     ] FQE_20220406210738: epoch=22 step=19316 epoch=22 metrics={'time_sample_batch': 0.00010259656536823525, 'time_algorithm_update': 0.0013295724343058732, 'loss': 0.052713522681173394, 'time_step': 0.0014780056503747754, 'init_value': -2.261894941329956, 'ave_value': -2.26227576449043, 'soft_opc': nan} step=19316




2022-04-06 21:08.13 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_19316.pt


Epoch 23/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.14 [info     ] FQE_20220406210738: epoch=23 step=20194 epoch=23 metrics={'time_sample_batch': 0.00010387419568107448, 'time_algorithm_update': 0.0013150424783485081, 'loss': 0.05562426671948533, 'time_step': 0.0014645741034748886, 'init_value': -2.2431588172912598, 'ave_value': -2.2435078118198097, 'soft_opc': nan} step=20194




2022-04-06 21:08.14 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_20194.pt


Epoch 24/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.16 [info     ] FQE_20220406210738: epoch=24 step=21072 epoch=24 metrics={'time_sample_batch': 0.00010560015067967304, 'time_algorithm_update': 0.0013548401873855764, 'loss': 0.055636593598686564, 'time_step': 0.0015070014771133458, 'init_value': -2.291595935821533, 'ave_value': -2.2919542392904884, 'soft_opc': nan} step=21072




2022-04-06 21:08.16 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_21072.pt


Epoch 25/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.17 [info     ] FQE_20220406210738: epoch=25 step=21950 epoch=25 metrics={'time_sample_batch': 0.00010742956522113913, 'time_algorithm_update': 0.0013616448925948088, 'loss': 0.05705685995718529, 'time_step': 0.0015151870006309283, 'init_value': -2.289217233657837, 'ave_value': -2.2895522486036626, 'soft_opc': nan} step=21950




2022-04-06 21:08.17 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_21950.pt


Epoch 26/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.19 [info     ] FQE_20220406210738: epoch=26 step=22828 epoch=26 metrics={'time_sample_batch': 0.00010597841614742757, 'time_algorithm_update': 0.0013507908732038425, 'loss': 0.05373092050016865, 'time_step': 0.0015030610534216113, 'init_value': -2.2778213024139404, 'ave_value': -2.278166347970758, 'soft_opc': nan} step=22828




2022-04-06 21:08.19 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_22828.pt


Epoch 27/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.21 [info     ] FQE_20220406210738: epoch=27 step=23706 epoch=27 metrics={'time_sample_batch': 0.00010707030806834713, 'time_algorithm_update': 0.0013497163603137458, 'loss': 0.05490457079743181, 'time_step': 0.00150424988774884, 'init_value': -2.276827812194824, 'ave_value': -2.277165536118703, 'soft_opc': nan} step=23706




2022-04-06 21:08.21 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_23706.pt


Epoch 28/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.22 [info     ] FQE_20220406210738: epoch=28 step=24584 epoch=28 metrics={'time_sample_batch': 0.00010456501215628448, 'time_algorithm_update': 0.0013566896964314315, 'loss': 0.05571037207755373, 'time_step': 0.0015068075923007278, 'init_value': -2.3336145877838135, 'ave_value': -2.3339579961986416, 'soft_opc': nan} step=24584




2022-04-06 21:08.22 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_24584.pt


Epoch 29/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.24 [info     ] FQE_20220406210738: epoch=29 step=25462 epoch=29 metrics={'time_sample_batch': 0.00010177540616185356, 'time_algorithm_update': 0.0012858924127113847, 'loss': 0.060448611191643985, 'time_step': 0.0014320188339859044, 'init_value': -2.4159975051879883, 'ave_value': -2.416374315336227, 'soft_opc': nan} step=25462




2022-04-06 21:08.24 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_25462.pt


Epoch 30/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.25 [info     ] FQE_20220406210738: epoch=30 step=26340 epoch=30 metrics={'time_sample_batch': 0.00010422177629753235, 'time_algorithm_update': 0.0013066391738509263, 'loss': 0.06361026641257955, 'time_step': 0.0014565925120221183, 'init_value': -2.415923595428467, 'ave_value': -2.4162769447705847, 'soft_opc': nan} step=26340




2022-04-06 21:08.25 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_26340.pt


Epoch 31/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.27 [info     ] FQE_20220406210738: epoch=31 step=27218 epoch=31 metrics={'time_sample_batch': 0.00010630318678593038, 'time_algorithm_update': 0.0013834596494878887, 'loss': 0.06538178606461854, 'time_step': 0.0015367310248094702, 'init_value': -2.465179681777954, 'ave_value': -2.4655199818525397, 'soft_opc': nan} step=27218




2022-04-06 21:08.27 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_27218.pt


Epoch 32/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.28 [info     ] FQE_20220406210738: epoch=32 step=28096 epoch=32 metrics={'time_sample_batch': 0.00010201002307796262, 'time_algorithm_update': 0.0012140200730065278, 'loss': 0.0663549472095991, 'time_step': 0.0013615368167468791, 'init_value': -2.4906489849090576, 'ave_value': -2.4910368365956783, 'soft_opc': nan} step=28096




2022-04-06 21:08.28 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_28096.pt


Epoch 33/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.29 [info     ] FQE_20220406210738: epoch=33 step=28974 epoch=33 metrics={'time_sample_batch': 0.000101431084113675, 'time_algorithm_update': 0.0012694874222566434, 'loss': 0.06671217458582705, 'time_step': 0.0014155595340598418, 'init_value': -2.524453639984131, 'ave_value': -2.524863601227669, 'soft_opc': nan} step=28974




2022-04-06 21:08.29 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_28974.pt


Epoch 34/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.31 [info     ] FQE_20220406210738: epoch=34 step=29852 epoch=34 metrics={'time_sample_batch': 0.00010597488603179166, 'time_algorithm_update': 0.0013537173390660035, 'loss': 0.06453512905011316, 'time_step': 0.0015060187472297824, 'init_value': -2.4035935401916504, 'ave_value': -2.4039949064383923, 'soft_opc': nan} step=29852




2022-04-06 21:08.31 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_29852.pt


Epoch 35/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.33 [info     ] FQE_20220406210738: epoch=35 step=30730 epoch=35 metrics={'time_sample_batch': 0.00010949413977342748, 'time_algorithm_update': 0.00138328422989552, 'loss': 0.0609761889832841, 'time_step': 0.0015412235042771881, 'init_value': -2.4128823280334473, 'ave_value': -2.4132125466534173, 'soft_opc': nan} step=30730




2022-04-06 21:08.33 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_30730.pt


Epoch 36/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.34 [info     ] FQE_20220406210738: epoch=36 step=31608 epoch=36 metrics={'time_sample_batch': 0.00012033403870185035, 'time_algorithm_update': 0.0015147066333570892, 'loss': 0.06328373947955307, 'time_step': 0.0016907263453837678, 'init_value': -2.3309714794158936, 'ave_value': -2.3313020162080873, 'soft_opc': nan} step=31608




2022-04-06 21:08.34 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_31608.pt


Epoch 37/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.36 [info     ] FQE_20220406210738: epoch=37 step=32486 epoch=37 metrics={'time_sample_batch': 0.00011358988855314146, 'time_algorithm_update': 0.0014304585228748364, 'loss': 0.0592149725890234, 'time_step': 0.0015990611904031324, 'init_value': -2.2976889610290527, 'ave_value': -2.297997024491596, 'soft_opc': nan} step=32486




2022-04-06 21:08.36 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_32486.pt


Epoch 38/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.38 [info     ] FQE_20220406210738: epoch=38 step=33364 epoch=38 metrics={'time_sample_batch': 0.00011191199743666247, 'time_algorithm_update': 0.00139888082386964, 'loss': 0.05836308227111314, 'time_step': 0.0015636427255860765, 'init_value': -2.306408166885376, 'ave_value': -2.30673457320015, 'soft_opc': nan} step=33364




2022-04-06 21:08.38 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_33364.pt


Epoch 39/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.39 [info     ] FQE_20220406210738: epoch=39 step=34242 epoch=39 metrics={'time_sample_batch': 0.0001117224573817503, 'time_algorithm_update': 0.0013509119833248894, 'loss': 0.05845025653407759, 'time_step': 0.001513544682218165, 'init_value': -2.3161301612854004, 'ave_value': -2.3164143079799246, 'soft_opc': nan} step=34242




2022-04-06 21:08.39 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_34242.pt


Epoch 40/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.41 [info     ] FQE_20220406210738: epoch=40 step=35120 epoch=40 metrics={'time_sample_batch': 0.00011011652631477234, 'time_algorithm_update': 0.0013012454287066274, 'loss': 0.06322086862179557, 'time_step': 0.0014624047116429497, 'init_value': -2.340385675430298, 'ave_value': -2.340653276380845, 'soft_opc': nan} step=35120




2022-04-06 21:08.41 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_35120.pt


Epoch 41/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.42 [info     ] FQE_20220406210738: epoch=41 step=35998 epoch=41 metrics={'time_sample_batch': 0.00010735733362428148, 'time_algorithm_update': 0.00128861603269816, 'loss': 0.060837121938646405, 'time_step': 0.0014490299181405938, 'init_value': -2.294942617416382, 'ave_value': -2.295185546401554, 'soft_opc': nan} step=35998




2022-04-06 21:08.42 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_35998.pt


Epoch 42/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.44 [info     ] FQE_20220406210738: epoch=42 step=36876 epoch=42 metrics={'time_sample_batch': 0.0001071995646100924, 'time_algorithm_update': 0.0013039918586713578, 'loss': 0.05587460597967463, 'time_step': 0.0014607026528117325, 'init_value': -2.182204246520996, 'ave_value': -2.1824405178587662, 'soft_opc': nan} step=36876




2022-04-06 21:08.44 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_36876.pt


Epoch 43/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.45 [info     ] FQE_20220406210738: epoch=43 step=37754 epoch=43 metrics={'time_sample_batch': 0.00011310001712182117, 'time_algorithm_update': 0.0013861878857797261, 'loss': 0.05388600273942253, 'time_step': 0.001552787348458327, 'init_value': -2.2282960414886475, 'ave_value': -2.2285497464115065, 'soft_opc': nan} step=37754




2022-04-06 21:08.45 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_37754.pt


Epoch 44/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.47 [info     ] FQE_20220406210738: epoch=44 step=38632 epoch=44 metrics={'time_sample_batch': 0.00010920467029128368, 'time_algorithm_update': 0.0012863673490380915, 'loss': 0.05693548109269787, 'time_step': 0.0014463008672066864, 'init_value': -2.2982282638549805, 'ave_value': -2.2984854481642176, 'soft_opc': nan} step=38632




2022-04-06 21:08.47 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_38632.pt


Epoch 45/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.48 [info     ] FQE_20220406210738: epoch=45 step=39510 epoch=45 metrics={'time_sample_batch': 0.00010946562730098368, 'time_algorithm_update': 0.001297217838313422, 'loss': 0.05836378771983745, 'time_step': 0.001458410521574607, 'init_value': -2.2752180099487305, 'ave_value': -2.2754564463522584, 'soft_opc': nan} step=39510




2022-04-06 21:08.48 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_39510.pt


Epoch 46/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.50 [info     ] FQE_20220406210738: epoch=46 step=40388 epoch=46 metrics={'time_sample_batch': 0.00010861215395916567, 'time_algorithm_update': 0.0013121027066658733, 'loss': 0.057938751499081065, 'time_step': 0.001471980286079007, 'init_value': -2.230545997619629, 'ave_value': -2.230792660452282, 'soft_opc': nan} step=40388




2022-04-06 21:08.50 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_40388.pt


Epoch 47/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.51 [info     ] FQE_20220406210738: epoch=47 step=41266 epoch=47 metrics={'time_sample_batch': 0.00010930704364472479, 'time_algorithm_update': 0.0013023905439094423, 'loss': 0.056052607877875686, 'time_step': 0.0014628432606238711, 'init_value': -2.2286648750305176, 'ave_value': -2.2289251997164476, 'soft_opc': nan} step=41266




2022-04-06 21:08.51 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_41266.pt


Epoch 48/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.53 [info     ] FQE_20220406210738: epoch=48 step=42144 epoch=48 metrics={'time_sample_batch': 0.00010815052345293258, 'time_algorithm_update': 0.0012553479513980805, 'loss': 0.05380717061232806, 'time_step': 0.0014122548027299259, 'init_value': -2.174395799636841, 'ave_value': -2.174636870950667, 'soft_opc': nan} step=42144




2022-04-06 21:08.53 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_42144.pt


Epoch 49/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.54 [info     ] FQE_20220406210738: epoch=49 step=43022 epoch=49 metrics={'time_sample_batch': 0.00010769350925176181, 'time_algorithm_update': 0.0012785994653549716, 'loss': 0.05550557912277064, 'time_step': 0.0014369514917186832, 'init_value': -2.1512370109558105, 'ave_value': -2.151479514908595, 'soft_opc': nan} step=43022




2022-04-06 21:08.54 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_43022.pt


Epoch 50/50:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-06 21:08.56 [info     ] FQE_20220406210738: epoch=50 step=43900 epoch=50 metrics={'time_sample_batch': 0.00011088554842868535, 'time_algorithm_update': 0.0013027755980611119, 'loss': 0.05379281517193776, 'time_step': 0.0014647383896256362, 'init_value': -2.247028350830078, 'ave_value': -2.247287266403565, 'soft_opc': nan} step=43900




2022-04-06 21:08.56 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406210738/model_43900.pt


[(1,
  {'time_sample_batch': 0.00011441077621216655,
   'time_algorithm_update': 0.001423091714637426,
   'loss': 0.0012160921468460657,
   'time_step': 0.0015893791694032847,
   'init_value': -0.48166346549987793,
   'ave_value': -0.48177142936358197,
   'soft_opc': nan}),
 (2,
  {'time_sample_batch': 0.00011742630960729508,
   'time_algorithm_update': 0.0014501519518180968,
   'loss': 0.004580605509955626,
   'time_step': 0.001620275827638109,
   'init_value': -0.8132042288780212,
   'ave_value': -0.813346434212975,
   'soft_opc': nan}),
 (3,
  {'time_sample_batch': 0.00010611663375194088,
   'time_algorithm_update': 0.0013551312861518599,
   'loss': 0.010369343410954709,
   'time_step': 0.001508662532293715,
   'init_value': -1.1891367435455322,
   'ave_value': -1.1893279653407203,
   'soft_opc': nan}),
 (4,
  {'time_sample_batch': 0.00010223975214165272,
   'time_algorithm_update': 0.0012770095555820335,
   'loss': 0.016514954606140823,
   'time_step': 0.0014186293769262919,
   'in