# Sample Workflow for d3rlpy Experiments

In [1]:
!pip install d3rlpy



In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import math
import subprocess
import os
import d3rlpy
# plt.style.use('matplotlibrc')

# from Python.data_sampler import *

## Building an MDPDataset

We first read in a large batch of samples from the file. As `d3rlpy` wants it in the form (observations, actions, rewards, terminal flags), we go ahead and do that. Here's a helper function to get a dataset from a list of chunks of your choosing.

In [3]:
import numpy as np
import torch
import random
import pandas

class DataSampler:
    def __init__(self, path_to_data="../collected_data/rl_deterministic.txt"):
        self.path_to_data = path_to_data
        self.chunk_length = 27771
        self.max_chunk = 1e4
        self.skip_header = 6
        self.skip_footer = 1
        self.num_chunks_read = 0
        self.data = None
        
    def use_file(self, filename):
        self.path_to_data = filename
        
    def use_chunk(self, num):
        self.num_chunks_read = num
        
    def read_chunk(self):
        if self.num_chunks_read >= self.max_chunk:
            return None
        
        with open(self.path_to_data, "r") as input:
            # input.seek(max(11664000*self.num_chunks_read - 10000, 0))
            
            count = 0
            line = input.readline()
            while line[0:11] != "Value F10.7":
                line = input.readline()
                count += 1

            for i in range(self.skip_header-1):
                input.readline()

            # print("chkpt1")
            # input()
              
            self.data = pandas.read_csv(input, header=None, skiprows=0, 
                                        usecols=range(11), dtype=np.float64, 
                                        nrows=self.chunk_length).to_numpy()
            
            self.num_chunks_read += 1
            print(self.data[0])
            
            print("Read chunk #", self.num_chunks_read, "out of", int(self.max_chunk))
        
    def get_batch(self, batch_size):
        if batch_size > self.chunk_length-1:
            batch_size = self.chunk_length-1
        
        # print("+++++++++++++++")
        # input()

        #batch = random.sample(range(self.chunk_length-1), batch_size)
        batch = [i for i in range(batch_size)]
        # print(self.data[batch])
        
        state_batch = torch.Tensor(self.data[batch,2:8])
        next_state_batch = torch.Tensor(self.data[[x+1 for x in batch],2:8])
        action_batch = torch.Tensor(self.data[batch,8:11])
        
        

        # Reward is negative of the cost
        # Cost is the wheel momentum after the action is applied
        # Should we instead make the reward the change in momentum between steps?
        reward_batch = torch.Tensor(np.vstack([y for y in -np.linalg.norm(self.data[[x+1 for x in batch],2:4], axis=1)]))
        
        return state_batch, action_batch, reward_batch, next_state_batch

In [4]:
def get_dataset(chunks : list, batch_size=30000, 
                path="collected_data/rl_det_small.txt") -> d3rlpy.dataset.MDPDataset :
    random.seed(0)
    samples = DataSampler(path_to_data=path)
    states = []
    actions = []
    rewards = []
    next_states = []
    for chunk in chunks:
        samples.use_chunk(chunk)
        samples.read_chunk()
        [statesChunk, actionsChunk, rewardsChunk, nextStatesChunk] = samples.get_batch(batch_size)
        states.append(statesChunk)
        actions.append(actionsChunk)
        rewards.append(rewardsChunk)
        next_states.append(nextStatesChunk)
    states = torch.cat(states)
    actions = torch.cat(actions)
    rewards = torch.cat(rewards)
    next_states = torch.cat(next_states)
    terminals = np.zeros(len(states))
    terminals[::100] = 1 #episode length 100, change if necessary
    print(states.shape)
    dataset = d3rlpy.dataset.MDPDataset(states.numpy(), 
                                        actions.numpy(), 
                                        rewards.numpy(), terminals)
    return dataset, states.numpy(), actions.numpy(), rewards.numpy()

We can build the dataset from there, just like this, and split into train and test sets.

In [5]:
dataset, states, actions, rewards = get_dataset([3,5,7,9], path="rl_stoch_small.txt")

[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01]
Read chunk # 4 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01]
Read chunk # 6 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01]
Read chunk # 8 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01]
Read chunk # 10 out of 10000
torch.Size([111080, 6])


In [6]:
print("The behavior policy value statistics are:")
dataset.compute_stats()['return']

The behavior policy value statistics are:


{'histogram': (array([ 12,  11,  11,  10,  10,   9,   9,   9,   8,   9,   8,  51,  69,
          57,  61, 293, 173, 157, 143,   1]),
  array([-10.137507 ,  -9.630632 ,  -9.123756 ,  -8.616881 ,  -8.110006 ,
          -7.6031303,  -7.0962553,  -6.58938  ,  -6.0825043,  -5.575629 ,
          -5.0687537,  -4.561878 ,  -4.055003 ,  -3.5481277,  -3.0412521,
          -2.5343769,  -2.0275016,  -1.5206261,  -1.0137508,  -0.5068754,
           0.       ], dtype=float32)),
 'max': 0.0,
 'mean': -2.6170745,
 'min': -10.137507,
 'std': 1.9108772}

In [7]:
from sklearn.model_selection import train_test_split
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

## Setting up an Algorithm

In [8]:
from d3rlpy.algos import CQL
from d3rlpy.models.encoders import VectorEncoderFactory

from d3rlpy.preprocessing import MinMaxActionScaler
import random
action_scaler = MinMaxActionScaler(minimum=-0.6, maximum=0.6)

from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer

from d3rlpy.ope import FQE
# metrics to evaluate with
from d3rlpy.metrics.scorer import soft_opc_scorer
import pickle as pkl
from statistics import harmonic_mean as hm

## Perform a random search on hyper params. 

As good fqe on both stochastic and deterministic data is needed for good performance during the evaluation step in the simulator, we take the harmonic mean of both fqes to understand the performance on each dataset. The harmonic mean is large only when each number whose mean is being taken is large. Therefore, a large harmonic mean corresponds to large numbers whose mean is being taken.


In [15]:
random.uniform(1e-5, 1e-2)

0.002596578335426704

In [18]:
random.seed()
random.uniform(1e-5, 1e-2)

0.00250011718580462

In [20]:

# The number of times we would wanna do a random search for hyper-params. In 
# every random search, we randomly pick a new set of hyper-params.
num_search_iterations = 4
largest_fqe = -np.inf

for i in range(num_search_iterations):

    random.seed()

    actor_lr_this_iter = random.uniform(1e-5, 1e-2)
    critic_lr_this_iter = random.uniform(1e-5, 1e-2)
    temp_lr_this_iter = random.uniform(1e-5, 1e-4)
    n_steps_this_iter = random.choice([1, 3, 5, 7])

    actor_encoder = VectorEncoderFactory(hidden_units=[12, 24, 36, 24, 12],
                                          activation='relu', use_batch_norm=True, dropout_rate=0.2)
    critic_encoder = VectorEncoderFactory(hidden_units=[12, 24, 24, 12],
                                          activation='relu', use_batch_norm=True, dropout_rate=0.2)

    print("search iteration: ", i)
    print("using hyper params: ", [actor_lr_this_iter, critic_lr_this_iter, 
                                   temp_lr_this_iter, n_steps_this_iter])

    model = CQL(q_func_factory='qr', #qr -> quantile regression q function, but you don't have to use this
                reward_scaler='standard',
                actor_encoder_factory = actor_encoder,
                critic_encoder_factory = critic_encoder,
                action_scaler=action_scaler,
                actor_learning_rate=actor_lr_this_iter, 
                critic_learning_rate=critic_lr_this_iter,
                temp_learning_rate=temp_lr_this_iter,
                n_steps=n_steps_this_iter, 
                use_gpu=False) #change it to true if you have one
    model.build_with_dataset(dataset)

    model.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=1, 
        tensorboard_dir='runs',
        scorers={
            'td_error': td_error_scorer,
            'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer
        })
    
    ope_dataset, states_ope, actions_ope, rewards_ope = get_dataset([2,4,6,8], 
                                                                    path="rl_stoch_small.txt") #change if you'd prefer different chunks
    ope_train_episodes, ope_test_episodes = train_test_split(ope_dataset, test_size=0.2)

    fqe = FQE(algo=model, action_scaler = action_scaler, use_gpu=False) #change this if you have one!
    history_stoch = fqe.fit(ope_train_episodes,
        eval_episodes=ope_test_episodes,
        tensorboard_dir='runs',
        n_epochs=1, n_steps_per_epoch=1000, #change if overfitting/underfitting
        scorers={
           'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer,
           'soft_opc': soft_opc_scorer(return_threshold=0)
        })
    
    ope_dataset, states_ope, actions_ope, rewards_ope = get_dataset([2,4,6,8], 
                                                                    path="rl_det_small.txt") #change if you'd prefer different chunks
    ope_train_episodes, ope_test_episodes = train_test_split(ope_dataset, test_size=0.2)

    fqe = FQE(algo=model, action_scaler = action_scaler, use_gpu=False) #change this if you have one!
    history_det = fqe.fit(ope_train_episodes,
        eval_episodes=ope_test_episodes,
        tensorboard_dir='runs',
        n_epochs=1, n_steps_per_epoch=1000, #change if overfitting/underfitting
        scorers={
           'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer,
           'soft_opc': soft_opc_scorer(return_threshold=0)
        })
    
    if(hm([history_stoch[-1][1]["ave_value"] + 50, 
           history_det[-1][1]["ave_value"] + 50]) \
                                                          > largest_fqe):
        largest_fqe = hm([history_stoch[-1][1]["ave_value"] + 50, 
                         history_det[-1][1]["ave_value"] + 50])

        # Save the hyper-params
        hyperparams = [actor_lr_this_iter, critic_lr_this_iter, 
                       temp_lr_this_iter, n_steps_this_iter]

        with open("hyperparams.pkl", "wb") as f:
            print("most optimal hyper params at this point: ", hyperparams)
            pkl.dump(hyperparams, f)

        # Save model and policy
        model.save_model("model.pt")
        model.save_policy("policy.pt")


search iteration:  4
using hyper params:  [0.00837147710783231, 0.004792173874852329, 6.42695771320783e-05, 3]
2022-04-17 15:14.57 [debug    ] RoundIterator is selected.
2022-04-17 15:14.57 [info     ] Directory is created at d3rlpy_logs/CQL_20220417151457
2022-04-17 15:14.57 [debug    ] Fitting action scaler...       action_scaler=min_max
2022-04-17 15:14.57 [debug    ] Fitting reward scaler...       reward_scaler=standard
2022-04-17 15:14.57 [info     ] Parameters are saved to d3rlpy_logs/CQL_20220417151457/params.json params={'action_scaler': {'type': 'min_max', 'params': {'minimum': array(-0.6), 'maximum': array(0.6)}}, 'actor_encoder_factory': {'type': 'vector', 'params': {'hidden_units': [12, 24, 36, 24, 12], 'activation': 'relu', 'use_batch_norm': True, 'dropout_rate': 0.2, 'use_dense': False}}, 'actor_learning_rate': 0.00837147710783231, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_learning_rate':

Epoch 1/1:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-17 15:15.49 [info     ] CQL_20220417151457: epoch=1 step=343 epoch=1 metrics={'time_sample_batch': 0.0012760356980927136, 'time_algorithm_update': 0.13685920287151726, 'temp_loss': 4.767822511689656, 'temp': 0.9888826052579518, 'alpha_loss': -18.729393233363204, 'alpha': 1.0176793625333913, 'critic_loss': 53.26442982086982, 'actor_loss': 1.0867714058097355, 'time_step': 0.13837417708193944, 'td_error': 0.7151478993149399, 'init_value': -3.7319533824920654, 'ave_value': -3.7427080396282535} step=343
2022-04-17 15:15.49 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220417151457/model_343.pt
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000

Epoch 1/1:   0%|          | 0/878 [00:00<?, ?it/s]

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


2022-04-17 15:15.57 [info     ] FQE_20220417151550: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.00044516224524426297, 'time_algorithm_update': 0.00571145473688773, 'loss': 0.00023512639282207372, 'time_step': 0.0063249923645228075, 'init_value': -0.22288371622562408, 'ave_value': -0.22289248926149696, 'soft_opc': nan} step=878




2022-04-17 15:15.57 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220417151550/model_878.pt
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 9 out of 10000
torch.Size([111080, 6])
2022-04-17 15:15.58 [debug    ] RoundI

Epoch 1/1:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-17 15:16.05 [info     ] FQE_20220417151558: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.0004264667528365359, 'time_algorithm_update': 0.005580979219058651, 'loss': 0.00034580096185282216, 'time_step': 0.006191527653392192, 'init_value': -0.24633125960826874, 'ave_value': -0.24635850570283627, 'soft_opc': nan} step=878




2022-04-17 15:16.05 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220417151558/model_878.pt
most optimal hyper params at this point:  [0.00837147710783231, 0.004792173874852329, 6.42695771320783e-05, 3]
search iteration:  4
using hyper params:  [0.009720418816804455, 0.00828843675347236, 2.4062391869302247e-05, 7]
2022-04-17 15:16.05 [debug    ] RoundIterator is selected.
2022-04-17 15:16.05 [info     ] Directory is created at d3rlpy_logs/CQL_20220417151605
2022-04-17 15:16.05 [debug    ] Fitting action scaler...       action_scaler=min_max
2022-04-17 15:16.05 [debug    ] Fitting reward scaler...       reward_scaler=standard
2022-04-17 15:16.05 [info     ] Parameters are saved to d3rlpy_logs/CQL_20220417151605/params.json params={'action_scaler': {'type': 'min_max', 'params': {'minimum': array(-0.6), 'maximum': array(0.6)}}, 'actor_encoder_factory': {'type': 'vector', 'params': {'hidden_units': [12, 24, 36, 24, 12], 'activation': 'relu', 'use_batch_norm': True, 'dropout_r

  self._minimum, dtype=torch.float32, device=action.device
  self._maximum, dtype=torch.float32, device=action.device


Epoch 1/1:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-17 15:17.00 [info     ] CQL_20220417151605: epoch=1 step=343 epoch=1 metrics={'time_sample_batch': 0.0016251924086590202, 'time_algorithm_update': 0.14510571435311098, 'temp_loss': 4.6845456942177375, 'temp': 0.9958833676733011, 'alpha_loss': -19.15002280699616, 'alpha': 1.0178638467288226, 'critic_loss': 121.8733302080249, 'actor_loss': 5.664353396939032, 'time_step': 0.1469663614434334, 'td_error': 1.9428105171455898, 'init_value': -9.899381637573242, 'ave_value': -9.919781594894802} step=343
2022-04-17 15:17.00 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220417151605/model_343.pt
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01

Epoch 1/1:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-17 15:17.08 [info     ] FQE_20220417151701: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.0004326648212511067, 'time_algorithm_update': 0.005641782473866108, 'loss': 0.00041492437765644306, 'time_step': 0.0062430855353492265, 'init_value': -0.24756956100463867, 'ave_value': -0.24717183136107637, 'soft_opc': nan} step=878




2022-04-17 15:17.08 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220417151701/model_878.pt
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 9 out of 10000
torch.Size([111080, 6])
2022-04-17 15:17.10 [debug    ] RoundI

Epoch 1/1:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-17 15:17.17 [info     ] FQE_20220417151710: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.000431174297810687, 'time_algorithm_update': 0.005608255064840468, 'loss': 0.00038815983922800703, 'time_step': 0.00620900634209496, 'init_value': -0.23698747158050537, 'ave_value': -0.2369814963343351, 'soft_opc': nan} step=878




2022-04-17 15:17.17 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220417151710/model_878.pt
search iteration:  4
using hyper params:  [0.004934167995982009, 0.0023272126682368083, 1.6749440998091484e-05, 5]
2022-04-17 15:17.17 [debug    ] RoundIterator is selected.
2022-04-17 15:17.17 [info     ] Directory is created at d3rlpy_logs/CQL_20220417151717
2022-04-17 15:17.17 [debug    ] Fitting action scaler...       action_scaler=min_max
2022-04-17 15:17.17 [debug    ] Fitting reward scaler...       reward_scaler=standard
2022-04-17 15:17.17 [info     ] Parameters are saved to d3rlpy_logs/CQL_20220417151717/params.json params={'action_scaler': {'type': 'min_max', 'params': {'minimum': array(-0.6), 'maximum': array(0.6)}}, 'actor_encoder_factory': {'type': 'vector', 'params': {'hidden_units': [12, 24, 36, 24, 12], 'activation': 'relu', 'use_batch_norm': True, 'dropout_rate': 0.2, 'use_dense': False}}, 'actor_learning_rate': 0.004934167995982009, 'actor_optim_factory': {'optim_

Epoch 1/1:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-17 15:18.11 [info     ] CQL_20220417151717: epoch=1 step=343 epoch=1 metrics={'time_sample_batch': 0.0018524178263049779, 'time_algorithm_update': 0.1451234741044114, 'temp_loss': 4.627502086211224, 'temp': 0.9970470825020148, 'alpha_loss': -18.035741961732203, 'alpha': 1.0176410848823303, 'critic_loss': 102.01967665335874, 'actor_loss': 1.565337285160497, 'time_step': 0.14725813087151976, 'td_error': 1.5950741135733075, 'init_value': -5.616433143615723, 'ave_value': -5.627752033616719} step=343
2022-04-17 15:18.11 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220417151717/model_343.pt
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-0

Epoch 1/1:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-17 15:18.20 [info     ] FQE_20220417151813: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.0005017513566245251, 'time_algorithm_update': 0.005714580518509641, 'loss': 0.00047962556897649587, 'time_step': 0.006422512080511905, 'init_value': 0.1391717940568924, 'ave_value': 0.1394928266916237, 'soft_opc': nan} step=878




2022-04-17 15:18.20 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220417151813/model_878.pt
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 9 out of 10000
torch.Size([111080, 6])
2022-04-17 15:18.22 [debug    ] RoundI

Epoch 1/1:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-17 15:18.29 [info     ] FQE_20220417151822: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.0004842528449647247, 'time_algorithm_update': 0.005786695350003948, 'loss': 0.00041754882081767306, 'time_step': 0.006462698373816279, 'init_value': -0.3383084237575531, 'ave_value': -0.3380438636023373, 'soft_opc': nan} step=878




2022-04-17 15:18.29 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220417151822/model_878.pt
most optimal hyper params at this point:  [0.004934167995982009, 0.0023272126682368083, 1.6749440998091484e-05, 5]
search iteration:  4
using hyper params:  [0.008463004104341984, 0.009747557379566708, 1.6155817654322474e-05, 3]
2022-04-17 15:18.29 [debug    ] RoundIterator is selected.
2022-04-17 15:18.29 [info     ] Directory is created at d3rlpy_logs/CQL_20220417151829
2022-04-17 15:18.29 [debug    ] Fitting action scaler...       action_scaler=min_max
2022-04-17 15:18.29 [debug    ] Fitting reward scaler...       reward_scaler=standard
2022-04-17 15:18.30 [info     ] Parameters are saved to d3rlpy_logs/CQL_20220417151829/params.json params={'action_scaler': {'type': 'min_max', 'params': {'minimum': array(-0.6), 'maximum': array(0.6)}}, 'actor_encoder_factory': {'type': 'vector', 'params': {'hidden_units': [12, 24, 36, 24, 12], 'activation': 'relu', 'use_batch_norm': True, 'drop

Epoch 1/1:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-17 15:19.22 [info     ] CQL_20220417151829: epoch=1 step=343 epoch=1 metrics={'time_sample_batch': 0.001719154352349373, 'time_algorithm_update': 0.13904099978788947, 'temp_loss': 4.898196720173338, 'temp': 0.9971782641577651, 'alpha_loss': -18.981402194187176, 'alpha': 1.0176388487523915, 'critic_loss': 51.72357648176632, 'actor_loss': 1.210087854339152, 'time_step': 0.14103433203071616, 'td_error': 0.7659492960300608, 'init_value': -3.80944561958313, 'ave_value': -3.8201003497982877} step=343
2022-04-17 15:19.22 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220417151829/model_343.pt
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.02189108e-01 -1.57999953e-02
 -9.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.03154890e-01 -2.08087043e-01  6.00000000e-01

Epoch 1/1:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-17 15:19.31 [info     ] FQE_20220417151923: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.0005013744488935536, 'time_algorithm_update': 0.006022902718980655, 'loss': 0.0009315161375120323, 'time_step': 0.006725782685507946, 'init_value': -0.3687238395214081, 'ave_value': -0.3682915723937499, 'soft_opc': nan} step=878




2022-04-17 15:19.31 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220417151923/model_878.pt
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 3 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -7.69891077e-02  4.00000469e-03
  5.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.50295370e-01 -2.41931634e-01  6.00000000e-01]
Read chunk # 9 out of 10000
torch.Size([111080, 6])
2022-04-17 15:19.32 [debug    ] RoundI

Epoch 1/1:   0%|          | 0/878 [00:00<?, ?it/s]



2022-04-17 15:19.40 [info     ] FQE_20220417151932: epoch=1 step=878 epoch=1 metrics={'time_sample_batch': 0.0004805505683047353, 'time_algorithm_update': 0.005766336086942286, 'loss': 0.0011081282490178516, 'time_step': 0.006441688754835541, 'init_value': -0.45709580183029175, 'ave_value': -0.4571470281292888, 'soft_opc': nan} step=878




2022-04-17 15:19.40 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220417151932/model_878.pt


## Reading hyper params from file

In [21]:
with open("hyperparams.pkl", "rb") as f:
    data = pkl.load(f)

print(data)

[0.004934167995982009, 0.0023272126682368083, 1.6749440998091484e-05, 5]
