# Sample Workflow for d3rlpy Experiments

In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import math
import subprocess
import os
import d3rlpy
plt.style.use('matplotlibrc')

from Python.data_sampler import *

## Building an MDPDataset

We first read in a large batch of samples from the file. As `d3rlpy` wants it in the form (observations, actions, rewards, terminal flags), we go ahead and do that. Here's a helper function to get a dataset from a list of chunks of your choosing.

In [99]:
def get_dataset(chunks : list, batch_size=30000, 
                path="collected_data/rl_det_small.txt") -> d3rlpy.dataset.MDPDataset :
    random.seed(0)
    samples = DataSampler(path_to_data=path)
    states = []
    actions = []
    rewards = []
    next_states = []
    for chunk in chunks:
        samples.use_chunk(chunk)
        samples.read_chunk()
        [statesChunk, actionsChunk, rewardsChunk, nextStatesChunk] = samples.get_batch(batch_size)
        states.append(statesChunk)
        actions.append(actionsChunk)
        rewards.append(rewardsChunk)
        next_states.append(nextStatesChunk)
    states = torch.cat(states)
    actions = torch.cat(actions)
    rewards = torch.cat(rewards)
    next_states = torch.cat(next_states)
    terminals = np.zeros(len(states))
    terminals[::100] = 1 #episode length 100, change if necessary
    print(states.shape)
    dataset = d3rlpy.dataset.MDPDataset(states.numpy(), 
                                        actions.numpy(), 
                                        rewards.numpy(), terminals)
    return dataset

We can build the dataset from there, just like this, and split into train and test sets.

In [100]:
dataset = get_dataset([3,5,7,9])

[ 0.00000000e+00  7.95731469e+08 -8.17891077e-02 -1.19999531e-03
  7.39998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.09713430e-01 -2.63658359e-01  6.00000000e-01]
Read chunk # 4 out of 10000
[ 0.00000000e+00  7.95731469e+08  1.24610892e-01  2.40000469e-03
 -7.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.20016566e-01  3.79282423e-01 -6.00000000e-01]
Read chunk # 6 out of 10000
[ 0.00000000e+00  7.95731469e+08 -9.01891077e-02  1.08000047e-02
  3.99986580e-04  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -3.17973088e-02 -2.40776052e-01  6.00000000e-01]
Read chunk # 8 out of 10000
[ 0.00000000e+00  7.95731469e+08  6.91108923e-02 -5.99999531e-03
 -6.00001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -1.42355434e-01  2.22081792e-01 -6.00000000e-01]
Read chunk # 10 out of 10000
torch.Size([111080, 6])


In [117]:
print("The behavior policy value statistics are:")
dataset.compute_stats()['return']

The behavior policy value statistics are:


{'mean': -4.1227446,
 'std': 2.4676569,
 'min': -12.578855,
 'max': 0.0,
 'histogram': (array([ 26,   9,   7,   7,   8,   7,  10,  13,  27,  54,  56,  73, 109,
          84, 186, 148, 124,  83,  67,  13]),
  array([-12.578855 , -11.949912 , -11.320969 , -10.692026 , -10.063084 ,
          -9.434141 ,  -8.805199 ,  -8.176255 ,  -7.5473127,  -6.9183702,
          -6.2894273,  -5.6604843,  -5.031542 ,  -4.4025993,  -3.7736564,
          -3.1447136,  -2.515771 ,  -1.8868282,  -1.2578855,  -0.6289427,
           0.       ], dtype=float32))}

In [101]:
from sklearn.model_selection import train_test_split
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

## Setting up an Algorithm

In [103]:
from d3rlpy.algos import CQL
model = CQL(q_func_factory='qr', #quantile regression q function, but you don't have to use this
            use_gpu=False) #change it to true if you have one
model.build_with_dataset(dataset)

In [104]:
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer

# calculate metrics with test dataset
ave_error_init = average_value_estimation_scorer(model, test_episodes)
print(ave_error_init)

-0.037882303384637175


In [105]:
model.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=5,
        scorers={
            'td_error': td_error_scorer,
            'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer
        })

2022-04-06 18:44.47 [debug    ] RoundIterator is selected.
2022-04-06 18:44.47 [info     ] Directory is created at d3rlpy_logs/CQL_20220406184447
2022-04-06 18:44.47 [info     ] Parameters are saved to d3rlpy_logs/CQL_20220406184447/params.json params={'action_scaler': None, 'actor_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'actor_learning_rate': 0.0001, 'actor_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_learning_rate': 0.0001, 'alpha_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.999), 'eps': 1e-08, 'weight_decay': 0, 'amsgrad': False}, 'alpha_threshold': 10.0, 'batch_size': 256, 'conservative_weight': 5.0, 'critic_encoder_factory': {'type': 'default', 'params': {'activation': 'relu', 'use_batch_norm': False, 'dropout_rate': None}}, 'critic_learning_rate': 0.0003, 'critic_optim_factory': {'optim_cls': 'Adam', 'betas': (0.9, 0.

Epoch 1/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 18:45.19 [info     ] CQL_20220406184447: epoch=1 step=343 epoch=1 metrics={'time_sample_batch': 0.00034665783362207886, 'time_algorithm_update': 0.09301192836928299, 'temp_loss': 4.434645806387632, 'temp': 0.9835434387446145, 'alpha_loss': -10.887004488411172, 'alpha': 1.0149100758243927, 'critic_loss': 11.504611487291298, 'actor_loss': -0.4734545949420216, 'time_step': 0.09343911398951583, 'td_error': 0.23003137005684537, 'init_value': 0.358516663312912, 'ave_value': 0.3585621578675918} step=343
2022-04-06 18:45.19 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406184447/model_343.pt


Epoch 2/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 18:45.51 [info     ] CQL_20220406184447: epoch=2 step=686 epoch=2 metrics={'time_sample_batch': 0.00030645267608909495, 'time_algorithm_update': 0.09020866944560504, 'temp_loss': 2.2606965619690564, 'temp': 0.9586732479643196, 'alpha_loss': 2.6586992984132585, 'alpha': 1.0215142763738383, 'critic_loss': -1.0089621576901429, 'actor_loss': 1.6619744485043229, 'time_step': 0.09059124368272788, 'td_error': 0.2244357153328237, 'init_value': 1.1041191816329956, 'ave_value': 1.1022191792748874} step=686
2022-04-06 18:45.51 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406184447/model_686.pt


Epoch 3/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 18:46.25 [info     ] CQL_20220406184447: epoch=3 step=1029 epoch=3 metrics={'time_sample_batch': 0.00031283228459928194, 'time_algorithm_update': 0.0978050558629606, 'temp_loss': 1.3170170980361737, 'temp': 0.9418875839202814, 'alpha_loss': 7.240036223442145, 'alpha': 0.9993701052387671, 'critic_loss': -4.852770173514897, 'actor_loss': 2.0584373998920005, 'time_step': 0.09819418626345977, 'td_error': 0.15648950364463302, 'init_value': 1.2321447134017944, 'ave_value': 1.228177076818056} step=1029
2022-04-06 18:46.25 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406184447/model_1029.pt


Epoch 4/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 18:46.58 [info     ] CQL_20220406184447: epoch=4 step=1372 epoch=4 metrics={'time_sample_batch': 0.0003068141269961877, 'time_algorithm_update': 0.09243900157272295, 'temp_loss': 0.7861016806291075, 'temp': 0.9292568977998228, 'alpha_loss': 9.583377957691605, 'alpha': 0.9631524921853758, 'critic_loss': -6.50581574926571, 'actor_loss': 2.4692244161322234, 'time_step': 0.09282089391880759, 'td_error': 0.11061606113975374, 'init_value': 1.2937169075012207, 'ave_value': 1.2894880254542218} step=1372
2022-04-06 18:46.58 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406184447/model_1372.pt


Epoch 5/5:   0%|          | 0/343 [00:00<?, ?it/s]

2022-04-06 18:47.32 [info     ] CQL_20220406184447: epoch=5 step=1715 epoch=5 metrics={'time_sample_batch': 0.0003132173688349154, 'time_algorithm_update': 0.09711221842307044, 'temp_loss': 0.4085727234102199, 'temp': 0.9204262795670735, 'alpha_loss': 10.972351727610775, 'alpha': 0.924280771708697, 'critic_loss': -7.285912782040699, 'actor_loss': 2.907983114698538, 'time_step': 0.09750191602345458, 'td_error': 0.1216746191484739, 'init_value': 0.8908321857452393, 'ave_value': 0.8893791525494973} step=1715
2022-04-06 18:47.32 [info     ] Model parameters are saved to d3rlpy_logs/CQL_20220406184447/model_1715.pt


[(1,
  {'time_sample_batch': 0.00034665783362207886,
   'time_algorithm_update': 0.09301192836928299,
   'temp_loss': 4.434645806387632,
   'temp': 0.9835434387446145,
   'alpha_loss': -10.887004488411172,
   'alpha': 1.0149100758243927,
   'critic_loss': 11.504611487291298,
   'actor_loss': -0.4734545949420216,
   'time_step': 0.09343911398951583,
   'td_error': 0.23003137005684537,
   'init_value': 0.358516663312912,
   'ave_value': 0.3585621578675918}),
 (2,
  {'time_sample_batch': 0.00030645267608909495,
   'time_algorithm_update': 0.09020866944560504,
   'temp_loss': 2.2606965619690564,
   'temp': 0.9586732479643196,
   'alpha_loss': 2.6586992984132585,
   'alpha': 1.0215142763738383,
   'critic_loss': -1.0089621576901429,
   'actor_loss': 1.6619744485043229,
   'time_step': 0.09059124368272788,
   'td_error': 0.2244357153328237,
   'init_value': 1.1041191816329956,
   'ave_value': 1.1022191792748874}),
 (3,
  {'time_sample_batch': 0.00031283228459928194,
   'time_algorithm_update

## Off-Policy Evaluation

We do get some metrics on a test set of initial state value and average value. However, these estimates (using the critic's Q-function) of model performance are biased. They're useful for validation during training, but not much else. Instead, we fit a Q-function to the data (or a separate dataset, as I've done here) separately and evaluate the model's performance on it.

Feel free to change the chunks and number of steps.

In [107]:
from d3rlpy.ope import FQE
# metrics to evaluate with
from d3rlpy.metrics.scorer import soft_opc_scorer


ope_dataset = get_dataset([4,6,8,10]) #change if you'd prefer different chunks
ope_train_episodes, ope_test_episodes = train_test_split(ope_dataset, test_size=0.2)

fqe = FQE(algo=model, use_gpu=False) #change this if you have one!
fqe.fit(ope_train_episodes, eval_episodes=ope_test_episodes,
        n_steps=10000, #change if overfitting/underfitting
        scorers={
           'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer,
           'soft_opc': soft_opc_scorer(return_threshold=0)
        })

[ 0.00000000e+00  7.95731469e+08 -7.24891077e-02 -1.35999953e-02
 -4.20001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -6.23311010e-02 -1.64283998e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08  7.01089229e-03 -4.19999531e-03
  7.39998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.21623335e-01 -2.86362315e-02 -8.00043364e-02]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.03989108e-01 -1.37999953e-02
  7.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.76352555e-01 -3.26280816e-01  6.00000000e-01]
Read chunk # 9 out of 10000
[ 0.00000000e+00  7.95731469e+08  9.46108923e-02 -1.43999953e-02
 -1.50001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.56237335e-02  2.60569314e-01 -6.00000000e-01]
Read chunk # 11 out of 10000
torch.Size([111080, 6])
2022-04-06 18:50.25 [debug    ] RandomIterator is selected.
2022-04-06 18:50.25 [info     ] Directory is created at d3rlpy_logs/FQE_20220406185

Epoch 1/1:   0%|          | 0/10000 [00:00<?, ?it/s]

2022-04-06 18:50.45 [info     ] FQE_20220406185025: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.00022144982814788818, 'time_algorithm_update': 0.0016423542022705078, 'loss': 0.008170671206247244, 'time_step': 0.0019714040517807008, 'init_value': -1.2672375440597534, 'ave_value': -1.2678414611500632, 'soft_opc': nan} step=10000
2022-04-06 18:50.45 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406185025/model_10000.pt


[(1,
  {'time_sample_batch': 0.00022144982814788818,
   'time_algorithm_update': 0.0016423542022705078,
   'loss': 0.008170671206247244,
   'time_step': 0.0019714040517807008,
   'init_value': -1.2672375440597534,
   'ave_value': -1.2678414611500632,
   'soft_opc': nan})]