### prepare


In [None]:
import sys
sys.path.append('../')
sys.path.append('../../')

In [None]:
import time
from typing import List
from pathlib import Path
import pickle

import hydra
from omegaconf import DictConfig

import gym
from gym.spaces import Box

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
import torch
import seaborn as sns


from d3rlpy.algos import SAC
from d3rlpy.algos import DoubleDQN as DDQN
from d3rlpy.algos import CQL
from d3rlpy.algos import IQL
from d3rlpy.algos import BCQ
from d3rlpy.algos import DiscreteCQL
from d3rlpy.algos import DiscreteBCQ
from d3rlpy.online.explorers import LinearDecayEpsilonGreedy, ConstantEpsilonGreedy
from d3rlpy.models.encoders import VectorEncoderFactory
from d3rlpy.models.q_functions import MeanQFunctionFactory
from d3rlpy.online.buffers import ReplayBuffer

from scope_rl.dataset import SyntheticDataset
from scope_rl.policy import BaseHead
from scope_rl.policy import ContinuousGaussianHead as GaussianHead
from scope_rl.policy import DiscreteEpsilonGreedyHead as EpsilonGreedyHead
from scope_rl.policy import DiscreteSoftmaxHead as SoftmaxHead
from scope_rl.policy import OffPolicyLearning

from scope_rl.ope.online import visualize_on_policy_policy_value
from scope_rl.ope.online import calc_on_policy_policy_value

from scope_rl.utils import MinMaxActionScaler
from scope_rl.utils import OldGymAPIWrapper
from scope_rl.types import LoggedDataset

from experiments.utils import torch_seed, format_runtime

from basicgym import BasicEnv

from tutorial.function import train_behavior_policy
from tutorial.function import obtain_logged_dataset
from tutorial.function import train_candidate_policies
# from experiments.main import off_policy_evaluation
from tutorial.function import off_policy_evaluation

### function

In [None]:
def bias_fig(
    bias_result_df,
    estimators,
    ESTIMATORS,
    x_scales,
    x_label,
    yscale_log = False,
    xscale_log = False,
):
    """Draw one bias curve per estimator against the swept variable and save it as a PNG.

    Parameters
    ----------
    bias_result_df : DataFrame
        Long-format results; must contain the columns "est", "bias" and a
        column whose name equals `x_label`.
    estimators : list of str
        Values of the "est" column to plot, in plotting order.
    ESTIMATORS : list of str
        Legend labels, paired positionally with `estimators`.
    x_scales : list of numbers
        X tick positions. Only rows with min(x_scales) <= x <= max(x_scales)
        are plotted.
    x_label : str
        Name of the x column; also used in the axis label and the file name.
    yscale_log, xscale_log : bool
        Switch the corresponding axis to a logarithmic scale.

    Notes
    -----
    Reads the notebook-level global `log_dir` and writes
    `bias_result_fig_{x_label}.png` under `log_dir + "results/fig"`.
    """
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(10, 7), tight_layout=True)

    # Boolean mask instead of a string-built `query`: same rows, but it does not
    # break when estimator keys or the column name contain quotes or spaces.
    in_range = bias_result_df[x_label].between(min(x_scales), max(x_scales))
    plot_df = bias_result_df[bias_result_df["est"].isin(estimators) & in_range]

    # The legend is attached positionally, so the line order must be pinned to
    # the caller's estimator order rather than the order of appearance in the
    # data. Keys with no rows are dropped so labels cannot shift onto the wrong line.
    # (A dict comprehension is used on purpose: it keeps working even if the
    # name `dict` has been rebound elsewhere in the notebook.)
    legend_label = {key: label for key, label in zip(estimators, ESTIMATORS)}
    available = set(plot_df["est"])
    hue_order = [key for key in estimators if key in available]

    sns.lineplot(
        linewidth=5,
        dashes=False,
        legend=False,
        x=x_label,
        y="bias",
        hue="est",
        hue_order=hue_order,
        ax=ax,
        data=plot_df,
        # NOTE(review): newer seaborn releases spell this `errorbar=None`;
        # confirm against the pinned seaborn version before changing it.
        ci=None,
    )
    # title and legend
    ax.legend(
        [legend_label.get(key, key) for key in hue_order],
        loc="upper right",
        fontsize=25,
    )
    # yaxis
    if yscale_log:
        ax.set_yscale("log")
    ax.set_ylabel("bias", fontsize=25)
    ax.tick_params(axis="y", labelsize=15)
    ax.yaxis.set_label_coords(-0.08, 0.5)
    # xaxis
    if xscale_log:
        ax.set_xscale("log")
    ax.set_xlabel(f"number of {x_label}", fontsize=25)
    ax.set_xticks(x_scales)
    ax.set_xticklabels(x_scales, fontsize=15)
    ax.xaxis.set_label_coords(0.5, -0.1)

    # `log_dir` is the notebook-level global set in the experiment cell.
    path_ = Path(log_dir + "results/fig")
    path_.mkdir(exist_ok=True, parents=True)
    save_path = Path(path_ / f"bias_result_fig_{x_label}.png")
    fig.tight_layout()
    fig.savefig(save_path, dpi=300, bbox_inches="tight")

In [None]:
def variance_fig(
    variance_result_df,
    estimators,
    ESTIMATORS,
    x_scales,
    x_label,
    yscale_log = False,
    xscale_log = False,
):
    """Draw one variance curve per estimator against the swept variable and save it as a PNG.

    Parameters
    ----------
    variance_result_df : DataFrame
        Long-format results; must contain the columns "est", "variance" and a
        column whose name equals `x_label`.
    estimators : list of str
        Values of the "est" column to plot, in plotting order.
    ESTIMATORS : list of str
        Legend labels, paired positionally with `estimators`.
    x_scales : list of numbers
        X tick positions. Only rows with min(x_scales) <= x <= max(x_scales)
        are plotted.
    x_label : str
        Name of the x column; also used in the axis label and the file name.
    yscale_log, xscale_log : bool
        Switch the corresponding axis to a logarithmic scale.

    Notes
    -----
    Reads the notebook-level global `log_dir` and writes
    `variance_result_fig_{x_label}.png` under `log_dir + "results/fig"`.
    """
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(10, 7), tight_layout=True)

    # Boolean mask instead of a string-built `query`: same rows, but it does not
    # break when estimator keys or the column name contain quotes or spaces.
    in_range = variance_result_df[x_label].between(min(x_scales), max(x_scales))
    plot_df = variance_result_df[variance_result_df["est"].isin(estimators) & in_range]

    # The legend is attached positionally, so the line order must be pinned to
    # the caller's estimator order rather than the order of appearance in the
    # data. Keys with no rows are dropped so labels cannot shift onto the wrong line.
    # (A dict comprehension is used on purpose: it keeps working even if the
    # name `dict` has been rebound elsewhere in the notebook.)
    legend_label = {key: label for key, label in zip(estimators, ESTIMATORS)}
    available = set(plot_df["est"])
    hue_order = [key for key in estimators if key in available]

    sns.lineplot(
        linewidth=5,
        dashes=False,
        legend=False,
        x=x_label,
        y="variance",
        hue="est",
        hue_order=hue_order,
        ax=ax,
        data=plot_df,
        # NOTE(review): newer seaborn releases spell this `errorbar=None`;
        # confirm against the pinned seaborn version before changing it.
        ci=None,
    )
    # title and legend
    ax.legend(
        [legend_label.get(key, key) for key in hue_order],
        loc="upper right",
        fontsize=25,
    )
    # yaxis
    if yscale_log:
        ax.set_yscale("log")
    ax.set_ylabel("variance", fontsize=25)
    ax.tick_params(axis="y", labelsize=15)
    ax.yaxis.set_label_coords(-0.08, 0.5)
    # xaxis
    if xscale_log:
        ax.set_xscale("log")
    ax.set_xlabel(f"number of {x_label}", fontsize=25)
    ax.set_xticks(x_scales)
    ax.set_xticklabels(x_scales, fontsize=15)
    ax.xaxis.set_label_coords(0.5, -0.1)

    # `log_dir` is the notebook-level global set in the experiment cell.
    path_ = Path(log_dir + "results/fig")
    path_.mkdir(exist_ok=True, parents=True)
    save_path = Path(path_ / f"variance_result_fig_{x_label}.png")
    fig.tight_layout()
    fig.savefig(save_path, dpi=300, bbox_inches="tight")

In [None]:
def mse_fig(
    mse_result_df,
    estimators,
    ESTIMATORS,
    x_scales,
    x_label,
    yscale_log = False,
    xscale_log = False,
):
    """Draw one MSE curve per estimator against the swept variable and save it as a PNG.

    Parameters
    ----------
    mse_result_df : DataFrame
        Long-format results; must contain the columns "est", "mse" and a
        column whose name equals `x_label`.
    estimators : list of str
        Values of the "est" column to plot, in plotting order.
    ESTIMATORS : list of str
        Legend labels, paired positionally with `estimators`.
    x_scales : list of numbers
        X tick positions. Only rows with min(x_scales) <= x <= max(x_scales)
        are plotted.
    x_label : str
        Name of the x column; also used in the axis label and the file name.
    yscale_log, xscale_log : bool
        Switch the corresponding axis to a logarithmic scale.

    Notes
    -----
    Reads the notebook-level global `log_dir` and writes
    `mse_result_fig_{x_label}.png` under `log_dir + "results/fig"`.
    """
    plt.style.use('ggplot')
    fig, ax = plt.subplots(figsize=(10, 7), tight_layout=True)

    # Boolean mask instead of a string-built `query`: same rows, but it does not
    # break when estimator keys or the column name contain quotes or spaces.
    in_range = mse_result_df[x_label].between(min(x_scales), max(x_scales))
    plot_df = mse_result_df[mse_result_df["est"].isin(estimators) & in_range]

    # The legend is attached positionally, so the line order must be pinned to
    # the caller's estimator order rather than the order of appearance in the
    # data. Keys with no rows are dropped so labels cannot shift onto the wrong line.
    # (A dict comprehension is used on purpose: it keeps working even if the
    # name `dict` has been rebound elsewhere in the notebook.)
    legend_label = {key: label for key, label in zip(estimators, ESTIMATORS)}
    available = set(plot_df["est"])
    hue_order = [key for key in estimators if key in available]

    sns.lineplot(
        linewidth=5,
        dashes=False,
        legend=False,
        x=x_label,
        y="mse",
        hue="est",
        hue_order=hue_order,
        ax=ax,
        data=plot_df,
        # NOTE(review): newer seaborn releases spell this `errorbar=None`;
        # confirm against the pinned seaborn version before changing it.
        ci=None,
    )
    # title and legend
    ax.legend(
        [legend_label.get(key, key) for key in hue_order],
        loc="upper right",
        fontsize=25,
    )
    # yaxis
    if yscale_log:
        ax.set_yscale("log")
    ax.set_ylabel("mse", fontsize=25)
    ax.tick_params(axis="y", labelsize=15)
    ax.yaxis.set_label_coords(-0.08, 0.5)
    # xaxis
    if xscale_log:
        ax.set_xscale("log")
    ax.set_xlabel(f"number of {x_label}", fontsize=25)
    ax.set_xticks(x_scales)
    ax.set_xticklabels(x_scales, fontsize=15)
    ax.xaxis.set_label_coords(0.5, -0.1)

    # `log_dir` is the notebook-level global set in the experiment cell.
    path_ = Path(log_dir + "results/fig")
    path_.mkdir(exist_ok=True, parents=True)
    save_path = Path(path_ / f"mse_result_fig_{x_label}.png")
    fig.tight_layout()
    fig.savefig(save_path, dpi=300, bbox_inches="tight")

### main

In [20]:
# Experiment driver: sweep one environment/dataset parameter, run the
# behavior-policy -> logged-data -> candidate-policy -> OPE pipeline for every
# value, and collect per-estimator bias / variance / MSE into long-format frames.

# env = gym.make(env_name)
#discrete
env_name="BasicEnv-discrete-v0"
action_type='discrete'
behavior_policy_name="ddqn_softmax_0.0"
candidate_policy_name="cql_b1_eps_0.0"
# behavior_policy_name="ddqn_softmax_1.0"
# candidate_policy_name="cql_b1_eps_0.1"

#continuous
# env_name="BasicEnv-continuous-v0"
# action_type='continuous'
# behavior_policy_name="sac_gauss_1.0"
# candidate_policy_name="cql_b1_gauss_0.0"

# behavior_tau=1.0
# candidate_epsilons=[0.1]
behavior_tau=0.0
candidate_epsilons=[0.0]
behavior_sigma=0.0
candidate_sigmas=[0.0]

base_random_state=12345
log_dir="../tutorial/logs/"
device="cuda:0" if torch.cuda.is_available() else "cpu"
step_per_trajectory_list =  [5, 10, 20, 40, 60, 80, 100]
step_per_trajectory = 10
n_trajectories_list=[200, 400, 800, 1600, 3200, 4800, 6400, 8000]
n_trajectories=10000
n_actions_list = [2, 4, 6, 10, 12, 14]
n_actions=5
# n_actions=5
n_random_state=10

# Re-created on every run of this cell so a re-run does not append duplicates.
bias_df_list = []
variance_df_list = []
mse_df_list = []

# Exactly one sweep is active at a time: the matching `variable_name`, `for`
# header and `variable=` line below must be un-commented together.
# variable_name = 'n_trajectories'
variable_name = 'n_actions'
# variable_name = 'step_per_trajectory'

# The loop variable deliberately rebinds the default assigned above.
# for n_trajectories in n_trajectories_list:
for n_actions in n_actions_list:
# for step_per_trajectory in step_per_trajectory_list:

    # variable=n_trajectories
    variable=n_actions
    # variable=step_per_trajectory

    env = BasicEnv(
        action_type=action_type, 
        n_actions=n_actions,
        random_state=base_random_state, 
        step_per_episode=step_per_trajectory,
    )

    behavior_policy = train_behavior_policy(
        env_name=env_name,
        env=env,
        behavior_sigma=behavior_sigma,
        behavior_tau=behavior_tau,
        device=device,
        base_random_state=base_random_state,
        log_dir=log_dir,
        variable=variable,
        variable_name=variable_name,
    )

    train_logged_dataset, test_logged_dataset = obtain_logged_dataset(
        env_name=env_name,
        env=env,
        behavior_policy=behavior_policy,
        n_trajectories=n_trajectories,
        n_random_state=n_random_state,
        base_random_state=base_random_state,
        log_dir=log_dir,
        variable=variable,
        variable_name=variable_name,
    )

    candidate_policies = train_candidate_policies(
        env_name=env_name,
        env=env,
        n_trajectories=n_trajectories,
        train_logged_dataset=train_logged_dataset,
        candidate_sigmas=candidate_sigmas,
        candidate_epsilons=candidate_epsilons,
        device=device,
        base_random_state=base_random_state,
        log_dir=log_dir,
        variable=variable,
        variable_name=variable_name,
    )

    input_dict, policy_value_dict = off_policy_evaluation(
        env_name=env_name,
        env=env,
        n_trajectories=n_trajectories,
        test_logged_dataset=test_logged_dataset,
        candidate_policies=candidate_policies,
        device=device,
        base_random_state=base_random_state,
        log_dir=log_dir,
        variable=variable,
        variable_name=variable_name,
    )

    # Only the keys (evaluation-policy names) of the first dataset are used.
    input_dict_ = input_dict.get(
        behavior_policy_name=behavior_policy_name,
        dataset_id=0,
    )
    eval_policy_names = list(input_dict_.keys())

    # One frame per evaluation policy: rows are dataset ids, columns are the
    # entries of policy_value_dict[...][dataset_id][eval_policy].
    # Named `estimate_df_dict` (not `dict`) so the builtin is not shadowed for
    # the rest of the kernel session; built with a single concat per policy
    # instead of growing a frame row by row.
    estimate_df_dict = {
        eval_policy: pd.concat(
            [
                DataFrame(
                    policy_value_dict[behavior_policy_name][dataset_id_][eval_policy],
                    index=[dataset_id_],
                )
                for dataset_id_ in range(n_random_state)
            ]
        )
        for eval_policy in eval_policy_names
    }

    bias_dict = {}
    variance_dict = {}
    mse_dict = {}
    for eval_policy in eval_policy_names:
        estimate_mean = estimate_df_dict[eval_policy].mean(axis=0)
        # Bias is measured against the mean of the 'on_policy' column.
        bias_dict[eval_policy] = abs(estimate_mean - estimate_mean['on_policy'])
        # pandas' default sample variance (ddof=1) across datasets.
        variance_dict[eval_policy] = estimate_df_dict[eval_policy].var(axis=0)
        mse_dict[eval_policy] = bias_dict[eval_policy]**2 + variance_dict[eval_policy]

    # Reshape each metric of the selected candidate policy into long format
    # (columns: "est", <metric>, <variable_name>) and stash it for aggregation.
    for metric_name, metric_dict, metric_df_list in (
        ("bias", bias_dict, bias_df_list),
        ("variance", variance_dict, variance_df_list),
        ("mse", mse_dict, mse_df_list),
    ):
        metric_df = (
            DataFrame(DataFrame(metric_dict[candidate_policy_name]).stack())
            .reset_index(0)
            .rename(columns={"level_0": "est", 0: metric_name})
        )
        metric_df[variable_name] = variable
        metric_df_list.append(metric_df)

# aggregate all results 
bias_result_df = pd.concat(bias_df_list).reset_index(level=0)
variance_result_df = pd.concat(variance_df_list).reset_index(level=0)
mse_result_df = pd.concat(mse_df_list).reset_index(level=0)

# NOTE(review): the frames are written under "logs/results/df" relative to the
# current working directory, whereas the figures are written under `log_dir`;
# confirm that this difference is intended.
path_ = Path("logs") / "results" / "df"
path_.mkdir(exist_ok=True, parents=True)
path_bias = Path(path_ / f"bias_result_df_{variable_name}.pkl")
path_variance = Path(path_ / f"variance_result_df_{variable_name}.pkl")
path_mse = Path(path_ / f"mse_result_df_{variable_name}.pkl")

with open(path_bias, "wb") as f:
    pickle.dump(bias_result_df, f)
with open(path_variance, "wb") as f:
    pickle.dump(variance_result_df, f)
with open(path_mse, "wb") as f:
    pickle.dump(mse_result_df, f)


[obtain_trajectories]:   0%|          | 0/10000 [00:00<?, ?it/s]

[obtain_datasets: dataset_id]:   0%|          | 0/10 [00:00<?, ?it/s]

[obtain_trajectories]:   0%|          | 0/10000 [00:00<?, ?it/s]

[obtain_trajectories]:   0%|          | 0/10000 [00:00<?, ?it/s]

[obtain_trajectories]:   0%|          | 0/10000 [00:00<?, ?it/s]

[obtain_trajectories]:   0%|          | 0/10000 [00:00<?, ?it/s]

[obtain_trajectories]:   0%|          | 0/10000 [00:00<?, ?it/s]

[obtain_trajectories]:   0%|          | 0/10000 [00:00<?, ?it/s]

In [None]:
# abs(dict['cql_b1_eps_0.1'].mean(axis=0) - dict['cql_b1_eps_0.1'].mean(axis=0)['on_policy'])
# (dict['cql_b1_eps_0.1'].sub(dict['cql_b1_eps_0.1']['on_policy'], axis=0))

In [None]:
# Side-by-side table: identifying columns first, then bias / variance / mse.
metric_columns = [
    bias_result_df['bias'],
    variance_result_df['variance'],
    mse_result_df['mse'],
]
result = pd.concat(
    [bias_result_df.drop(columns='bias'), *metric_columns],
    axis=1,
)

In [None]:
# Show the combined bias / variance / mse table as the cell output.
result

In [None]:
# Each pair is (legend label, key used in the "est" column of the result frames),
# grouped by estimator family. The public lists below are derived from the pairs
# so a label and its key cannot drift apart.
_basic_pairs = [
    ("DM", "dm"),
    ("TIS", "tis"),
    ("PDIS", "pdis"),
    ("DR", "dr"),
    ("SNTIS", "sntis"),
    ("SNPDIS", "snpdis"),
    ("SNDR", "sndr"),
]
_state_marginal_pairs = [
    ("SMIS", "sm_is"),
    ("SMDR", "sm_dr"),
    ("SMSNIS", "sm_snis"),
    ("SMSNDR", "sm_sndr"),
]
_state_action_marginal_pairs = [
    ("SAMIS", "sam_is"),
    ("SAMDR", "sam_dr"),
    ("SAMSNIS", "sam_snis"),
    ("SAMSNDR", "sam_sndr"),
]
_drl_pairs = [("DRL", "drl")]

basic_estimators, basic_estimators_name = map(list, zip(*_basic_pairs))
state_marginal_estimators, state_marginal_estimators_name = map(
    list, zip(*_state_marginal_pairs)
)
state_action_marginal_estimators, state_action_marginal_estimators_name = map(
    list, zip(*_state_action_marginal_pairs)
)
drl_estimators, drl_estimators_name = map(list, zip(*_drl_pairs))

all_estimators = [
    *basic_estimators,
    *state_marginal_estimators,
    *state_action_marginal_estimators,
    *drl_estimators,
]
all_estimators_name = [
    *basic_estimators_name,
    *state_marginal_estimators_name,
    *state_action_marginal_estimators_name,
    *drl_estimators_name,
]

In [None]:
# Select which sweep to plot: keep exactly one (x_scales, x_label) pair active.

# n_trajectories
# x_scales=[200, 400, 800, 1600, 3200,4800, 6400, 8000]
# x_label='n_trajectories'

# step_per_trajectory
# x_scales=[5, 10, 20, 40, 60, 80, 100]
# x_label='step_per_trajectory'

# n_actions
# (This header used to be a bare `n_actions` expression, which raises
# NameError when the cell is run before the experiment cell.)
# NOTE(review): the experiment cell sweeps n_actions over [2, 4, 6, 10, 12, 14],
# so the tick at 8 has no data and the run with 2 actions falls outside these
# bounds - confirm which grid is intended.
x_scales=[4, 6, 8, 10, 12, 14]
x_label='n_actions'

In [None]:
# Bias of the basic estimators over the selected sweep (log-scaled x axis).
bias_fig(
    bias_result_df=bias_result_df,
    estimators=basic_estimators_name,
    ESTIMATORS=basic_estimators,
    x_label=x_label,
    x_scales=x_scales,
    # yscale_log=True,
    xscale_log=True,
)

In [None]:
# Variance of the basic estimators over the selected sweep (linear axes).
variance_fig(
    variance_result_df=variance_result_df,
    estimators=basic_estimators_name,
    ESTIMATORS=basic_estimators,
    x_label=x_label,
    x_scales=x_scales,
    # yscale_log=True,
    # xscale_log=True,
)

In [None]:
# MSE of the basic estimators over the selected sweep (linear axes).
mse_fig(
    mse_result_df=mse_result_df,
    estimators=basic_estimators_name,
    ESTIMATORS=basic_estimators,
    x_label=x_label,
    x_scales=x_scales,
    # xscale_log=True,
)
