# Sample Workflow for d3rlpy Experiments

In [73]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import math
import subprocess
import os
import d3rlpy
plt.style.use('matplotlibrc')

from Python.data_sampler import *

## Building an MDPDataset

We first read in a large batch of samples from the file. As `d3rlpy` wants it in the form (observations, actions, rewards, terminal flags), we go ahead and do that. Here's a helper function to get a dataset from a list of chunks of your choosing.

In [74]:
def get_dataset(chunks, batch_size=30000, 
                path="collected_data/rl_det_small.txt"):
    random.seed(0)
    samples = DataSampler(path_to_data=path)
    states = []
    actions = []
    rewards = []
    next_states = []
    for chunk in chunks:
        samples.use_chunk(chunk)
        samples.read_chunk()
        [statesChunk, actionsChunk, rewardsChunk, nextStatesChunk] = samples.get_batch(batch_size)
        states.append(statesChunk)
        actions.append(actionsChunk)
        rewards.append(rewardsChunk)
        next_states.append(nextStatesChunk)
    states = torch.cat(states)
    actions = torch.cat(actions)
    rewards = torch.cat(rewards)
    next_states = torch.cat(next_states)
    terminals = np.zeros(len(states))
    terminals[::100] = 1 #episode length 100, change if necessary
    print(states.shape)
    dataset = d3rlpy.dataset.MDPDataset(states.numpy(), 
                                        actions.numpy(), 
                                        rewards.numpy(), terminals)
    return dataset

We can build the dataset from there, just like this, and split into train and test sets.

In [75]:
dataset = get_dataset([3,5,7,9])

[ 0.00000000e+00  7.95731469e+08 -8.17891077e-02 -1.19999531e-03
  7.39998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.09713430e-01 -2.63658359e-01  6.00000000e-01]
Read chunk # 4 out of 10000
[ 0.00000000e+00  7.95731469e+08  1.24610892e-01  2.40000469e-03
 -7.60001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -2.20016566e-01  3.79282423e-01 -6.00000000e-01]
Read chunk # 6 out of 10000
[ 0.00000000e+00  7.95731469e+08 -9.01891077e-02  1.08000047e-02
  3.99986580e-04  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -3.17973088e-02 -2.40776052e-01  6.00000000e-01]
Read chunk # 8 out of 10000
[ 0.00000000e+00  7.95731469e+08  6.91108923e-02 -5.99999531e-03
 -6.00001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -1.42355434e-01  2.22081792e-01 -6.00000000e-01]
Read chunk # 10 out of 10000
torch.Size([111080, 6])


In [76]:
from sklearn.model_selection import train_test_split
train_episodes, test_episodes = train_test_split(dataset, test_size=0.2)

## Setting up an Algorithm

In [77]:
from d3rlpy.algos import CQL
model = CQL(q_func_factory='qr',
            use_gpu=False) #change it to true if you have one
model.build_with_dataset(dataset)

In [82]:
from d3rlpy.metrics.scorer import td_error_scorer
from d3rlpy.metrics.scorer import average_value_estimation_scorer
from d3rlpy.metrics.scorer import initial_state_value_estimation_scorer

# calculate metrics with test dataset
ave_error_init = average_value_estimation_scorer(model, test_episodes)
print(ave_error_init)

-0.005670299844455813


In [None]:
model.fit(train_episodes,
        eval_episodes=test_episodes,
        n_epochs=5,
        scorers={
            'td_error': td_error_scorer,
            'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer
        })

## Off-Policy Evaluation

In [92]:
from d3rlpy.ope import FQE
# metrics to evaluate with
from d3rlpy.metrics.scorer import soft_opc_scorer


ope_dataset = get_dataset([4,6,8,10])
ope_train_episodes, ope_test_episodes = train_test_split(ope_dataset, test_size=0.2)

fqe = FQE(algo=model, use_gpu=False) #change this if you have one!
fqe.fit(ope_train_episodes, eval_episodes=ope_test_episodes,
        n_steps=10000,
        scorers={
           'init_value': initial_state_value_estimation_scorer,
            'ave_value': average_value_estimation_scorer,
           'soft_opc': soft_opc_scorer(return_threshold=600)
        })

[ 0.00000000e+00  7.95731469e+08 -7.24891077e-02 -1.35999953e-02
 -4.20001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
 -6.23311010e-02 -1.64283998e-01  6.00000000e-01]
Read chunk # 5 out of 10000
[ 0.00000000e+00  7.95731469e+08  7.01089229e-03 -4.19999531e-03
  7.39998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.21623335e-01 -2.86362315e-02 -8.00043364e-02]
Read chunk # 7 out of 10000
[ 0.00000000e+00  7.95731469e+08 -1.03989108e-01 -1.37999953e-02
  7.99998658e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  2.76352555e-01 -3.26280816e-01  6.00000000e-01]
Read chunk # 9 out of 10000
[ 0.00000000e+00  7.95731469e+08  9.46108923e-02 -1.43999953e-02
 -1.50001342e-03  0.00000000e+00 -5.33423489e+00 -1.57091618e+00
  1.56237335e-02  2.60569314e-01 -6.00000000e-01]
Read chunk # 11 out of 10000
torch.Size([111080, 6])
2022-04-06 18:35.50 [debug    ] RandomIterator is selected.
2022-04-06 18:35.50 [info     ] Directory is created at d3rlpy_logs/FQE_20220406183

Epoch 1/1:   0%|          | 0/10000 [00:00<?, ?it/s]

2022-04-06 18:36.09 [info     ] FQE_20220406183550: epoch=1 step=10000 epoch=1 metrics={'time_sample_batch': 0.0002603496313095093, 'time_algorithm_update': 0.001567529821395874, 'loss': 0.012148606970629952, 'time_step': 0.0019286755800247191, 'init_value': -1.5798275470733643, 'ave_value': -1.5798906309479264, 'soft_opc': nan} step=10000
2022-04-06 18:36.09 [info     ] Model parameters are saved to d3rlpy_logs/FQE_20220406183550/model_10000.pt


[(1,
  {'time_sample_batch': 0.0002603496313095093,
   'time_algorithm_update': 0.001567529821395874,
   'loss': 0.012148606970629952,
   'time_step': 0.0019286755800247191,
   'init_value': -1.5798275470733643,
   'ave_value': -1.5798906309479264,
   'soft_opc': nan})]