In [1]:
# Auto-reload imported modules before each cell runs, so edits to local modules
# (e.g. utils) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

import matplotlib.pyplot as plt

import cvxpy as cp
import time

from utils import reward_direct_policy_evaluation, cost_direct_policy_evaluation

The defaults go here 

In [3]:
# Seed for NumPy's global RNG; it is also formatted into the per-seed import/output folder paths.
SEED = 0

## User params

In [4]:
# Coefficient pair: handed to the reward-shaping operator as `coeffs` and formatted
# into the file names of the pre-computed solutions.
LAMBDA_R, LAMBDA_C = 1.0, 1.0
COEFFS = LAMBDA_R, LAMBDA_C

# Epsilon value formatted into the file name of the pre-computed `cpi_*` solution.
EPS = 2.5

In [5]:
# ----------- Default MDP parameters -------------
# nS regular states and nA actions; the two absorbing states are indexed right after
# the regular ones, so they are derived from nS instead of being hard-coded.
nS, nA = 750, 25
DEATH_STATE = nS         # absorbing death state (index 750)
SURVIVAL_STATE = nS + 1  # absorbing survival state (index 751)

# Discount factor
gamma = 0.99

# ----- User args
# (state, action) pairs whose total transition count is below this are zeroed out when filtering
FREQUENCY_THRESHOLD = 100.0
# Cost value written into the positive entries of the cost matrix
COST_FOR_RARE_DECISION = 10.0


# -------- Folder Paths -------------
# Placeholder root folder: point this at the real data directory before running.
basepath = '/enter/path/here'

# Path variables
# Per-seed folder the training trajectories and the MDP matrices/counts are read from.
IMPORT_PATH = f'{basepath}/m_hat/{SEED}'
# Per-seed, per-setting folder that C_mat, pi_baseline and the pre-computed solutions are read from.
OUTPUT_PATH = f'{basepath}/output/{SEED}/freq_{FREQUENCY_THRESHOLD}_cost_{COST_FOR_RARE_DECISION}'

In [6]:
# Seed NumPy's global random number generator so NumPy-based randomness is reproducible.
np.random.seed(SEED)

### Estimate the filtered MDP stats

In [7]:
# Load the training trajectories for this seed. The context manager closes the file
# handle right away instead of leaving it to the garbage collector.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(f'{IMPORT_PATH}/trajDr_tr.pkl', 'rb') as traj_file:
    traj_tr = pickle.load(traj_file)
print('Effective sample size of train set', len(traj_tr))

Effective sample size of train set 14667


In [8]:
# Load the defaults for this seed: the unfiltered transition model with its reward
# matrix, and the raw transition counts. Every file is opened in a context manager
# so its handle is closed as soon as it has been read.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open(f"{IMPORT_PATH}/MDP_mat.p", "rb") as mdp_file:
    unflitered_P_mat, R_mat = pickle.load(mdp_file)
with open(f"{IMPORT_PATH}/MDP_counts.p", "rb") as counts_file:
    orig_counts_mat = pickle.load(counts_file)

# Use the original cost matrix for unfiltered actions
with open(f"{OUTPUT_PATH}/C_mat.p", "rb") as cost_file:
    C_mat = pickle.load(cost_file)
with open(f"{OUTPUT_PATH}/pi_baseline.p", "rb") as baseline_file:
    pi_baseline = pickle.load(baseline_file)

In [9]:

# NOTE(review): this re-binds the constant defined earlier as the int 10 (it was the
# float 10.0 when OUTPUT_PATH was built); the numeric value itself is unchanged.
COST_FOR_RARE_DECISION = 10
# Overwrite every strictly positive entry of C_mat, in place, with COST_FOR_RARE_DECISION
# (entries that are zero or negative are left untouched).

C_mat[C_mat>0] = COST_FOR_RARE_DECISION

In [10]:
# --------- NOTE: New step of Filtering here ---------

# Work on a private copy so the loaded raw counts stay untouched.
trans_counts_mat = np.array(orig_counts_mat, copy=True)

# Zero out every (state, action) row whose total count is below FREQUENCY_THRESHOLD.
sa_visits = trans_counts_mat.sum(axis=-1)
low_freq_idx = sa_visits < FREQUENCY_THRESHOLD
trans_counts_mat[low_freq_idx] = 0


# Absorbing states: they must carry no counts at this point ...
absorbing_states = (DEATH_STATE, SURVIVAL_STATE)
for absorbing in absorbing_states:
    assert trans_counts_mat[absorbing, :, :].sum() == 0
# ... and each one is then made to loop back onto itself under every action.
for absorbing in absorbing_states:
    trans_counts_mat[absorbing, :, absorbing] = 1

# Note: *Not in original paper* send any unobserved actions to death
sa_visits = trans_counts_mat.sum(axis=-1)
no_tx_idx = sa_visits == 0
trans_counts_mat[no_tx_idx, DEATH_STATE] = 1

# Build the probabilistic MDP model: divide each (state, action) row by its total
# so the counts become transition probabilities.
row_totals = trans_counts_mat.sum(axis=-1, keepdims=True)
P_mat = trans_counts_mat / row_totals
assert np.allclose(1, P_mat.sum(axis=-1))

### Find the PI solutions for the filtered and unfiltered P_mat

In [11]:
from utils import (
    bounded_successive_approximation,
    default_termination,
    make_policy_iteration_operator,
)

# Uniform starting policy over the nS regular states plus the two absorbing ones.
random_policy = np.full((nS + 2, nA), 1 / nA)
# Policy-iteration operator built on the filtered transition model.
pi_operator = make_policy_iteration_operator(P=P_mat, R=R_mat, discount=gamma)

start_time = time.time()
filtered_pi_sol = bounded_successive_approximation(
    random_policy,
    operator=pi_operator,
    termination_condition=default_termination,
    max_limit=10,
)
elapsed_seconds = time.time() - start_time

# Render the wall-clock duration as HH:MM:SS.
time_elapsed = time.strftime("%H:%M:%S", time.gmtime(elapsed_seconds))
print(f"Regular PI completed in {time_elapsed}")

Regular PI completed in 00:00:21


In [12]:
from utils import make_reward_shaping_policy_iteration_operator

# Reward-shaping policy iteration on the *unfiltered* transition model, using the
# cost matrix and the configured coefficient pair.
rs_operator = make_reward_shaping_policy_iteration_operator(
    P=unflitered_P_mat, R=R_mat, C=C_mat, discount=gamma, coeffs=COEFFS,
)

start_time = time.time()
pi_rs = bounded_successive_approximation(
    random_policy,
    operator=rs_operator,
    termination_condition=default_termination,
    max_limit=10,
)

time_elapsed = time.strftime("%H:%M:%S", time.gmtime(time.time() - start_time))
# Label this run explicitly so its timing is not confused with the regular PI run.
print(f"Reward-shaping PI completed in {time_elapsed}")

Regular PI completed in 00:00:51


### Load the pre-computed solutions

In [13]:
def load_sopt_policy(coeffs, eps, output_path=None):
    """Load a pre-computed ``cpi_*`` solution from disk.

    Args:
        coeffs: Indexable whose entries 0 and 1 are formatted into the file name.
        eps: Value formatted into the file name after the two coefficients.
        output_path: Folder to read from. Defaults to the notebook-level
            ``OUTPUT_PATH`` when omitted.

    Returns:
        The object unpickled from ``cpi_{coeffs[0]}_{coeffs[1]}_{eps}.p``.

    Raises:
        FileNotFoundError: if the solution file does not exist.
    """
    if output_path is None:
        output_path = OUTPUT_PATH
    sol_name = f'cpi_{coeffs[0]}_{coeffs[1]}_{eps}.p'
    # The context manager closes the file handle even if unpickling fails.
    # NOTE: pickle can execute arbitrary code while loading; only open trusted files.
    with open(f'{output_path}/{sol_name}', 'rb') as sol_file:
        return pickle.load(sol_file)


def load_rs_policy(coeffs, output_path=None):
    """Load a pre-computed ``rs_*`` solution from disk.

    Args:
        coeffs: Indexable whose entries 0 and 1 are formatted into the file name.
        output_path: Folder to read from. Defaults to the notebook-level
            ``OUTPUT_PATH`` when omitted.

    Returns:
        The object unpickled from ``rs_{coeffs[0]}_{coeffs[1]}.p``.

    Raises:
        FileNotFoundError: if the solution file does not exist.
    """
    if output_path is None:
        output_path = OUTPUT_PATH
    sol_name = f'rs_{coeffs[0]}_{coeffs[1]}.p'
    # The context manager closes the file handle even if unpickling fails.
    # NOTE: pickle can execute arbitrary code while loading; only open trusted files.
    with open(f'{output_path}/{sol_name}', 'rb') as sol_file:
        return pickle.load(sol_file)

In [17]:
# Raw count per (state, action) pair: the unfiltered transition counts summed over the last (next-state) axis.
orig_count_sa = orig_counts_mat.sum(axis=-1)

In [18]:
# Raw count per state: the (state, action) counts summed over the action axis.
orig_state_count = orig_count_sa.sum(-1)

## Empirical analysis for Unfiltered PI

In [19]:
from eval_utils import Evaluator

# Evaluator configured with the baseline policy, the cost matrix and 10 bootstrap rounds.
evaluator = Evaluator(
    gamma=gamma,
    pi_baseline=pi_baseline,
    C_mat=C_mat,
    cost_for_rare_decision=COST_FOR_RARE_DECISION,
    n_bootstrap=10,
)

# NOTE(review): the test trajectories are read from the current working directory,
# whereas the training trajectories are read from IMPORT_PATH -- confirm this is intended.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
# The context manager closes the file handle as soon as the data has been read.
with open('./trajDr_te.pkl', 'rb') as traj_file:
    traj_te = pickle.load(traj_file)
test_trajectories = evaluator.preprocess_trajecteories(traj_te)

N_test = len(test_trajectories)
print('Effective sample size of test set', N_test)

Effective sample size of test set 3605


In [20]:
# Mean statistics of the pre-processed test trajectories; entries 0 and 1 are printed as R and C.
test_mean_stats = evaluator.get_mean_stats(test_trajectories)
print(f'R: {test_mean_stats[0]}, C: {test_mean_stats[1]}')

R: 69.53381010466066, C: 53.26411127039579


In [21]:
from utils import reward_direct_policy_evaluation, cost_direct_policy_evaluation

# ------- Note what P_mat to use?
# The active line builds the model-based quantity on the *unfiltered* transition model.
# Alternative kept for reference (filtered model):
# R_sa = np.einsum('sat,sat -> sa', R_mat, P_mat)
# Per-(state, action) reward: R_mat weighted element-wise by the transition
# probabilities and summed over the last axis.
R_sa = np.einsum('sat,sat -> sa', R_mat, unflitered_P_mat)


def do_ope(pi_solution, mode='DR'):
    """Run off-policy evaluation of ``pi_solution`` on the test trajectories and print the means.

    The reward and cost value functions of the policy are computed on the unfiltered
    model (``unflitered_P_mat``), turned into Q-tables, and handed to the notebook-level
    ``evaluator`` together with ``test_trajectories``.

    Args:
        pi_solution: Policy to evaluate; passed through to the policy-evaluation
            helpers and to the evaluator as ``pi_e``.
        mode: ``'DR'`` to use ``evaluator.doubly_robust_ope`` or ``'WDR'`` to use
            ``evaluator.weighted_doubly_robust_ope``.

    Raises:
        ValueError: if ``mode`` is neither ``'DR'`` nor ``'WDR'``.
    """
    # Validate first so an unknown mode fails immediately with a clear error, rather
    # than with an unbound-variable error after the policy evaluation has already run.
    if mode not in ('DR', 'WDR'):
        raise ValueError(f"Unknown OPE mode {mode!r}; expected 'DR' or 'WDR'")

    # Reward value function of the policy, and the matching Q-table.
    rV_sol = reward_direct_policy_evaluation(unflitered_P_mat, R_mat, gamma, pi_solution)
    rQ_sol = R_sa + gamma * np.einsum('sat,t -> sa', unflitered_P_mat, rV_sol)

    # Cost value function of the policy, and the matching Q-table.
    cV_sol = cost_direct_policy_evaluation(unflitered_P_mat, C_mat, gamma, pi_solution)
    cQ_sol = C_mat + gamma * np.einsum('sat,t -> sa', unflitered_P_mat, cV_sol)

    if mode == 'DR':
        estimator = evaluator.doubly_robust_ope
    else:
        estimator = evaluator.weighted_doubly_robust_ope
    # Only the first returned element (the mean stats) is reported.
    dr_mean_stats, _, _ = estimator(test_trajectories, pi_e=pi_solution,
                                    rQ_e=rQ_sol, cQ_e=cQ_sol)

    print('-- Mean stats ---')
    print(f'R: {dr_mean_stats[0]:0.2f}, C: {dr_mean_stats[1]:0.2f}')

In [22]:
# Reference point: doubly-robust evaluation of the baseline policy.
do_ope(pi_baseline, mode='DR')

100%|██████████| 10/10 [00:00<00:00, 346.30it/s]

-- Mean stats ---
R: 65.95, C: 59.63





In [23]:
# Policy solved on the filtered model: this shows that filtering can help reduce the cost,
# but it can also hurt the reward performance.
do_ope(filtered_pi_sol, mode='DR')

100%|██████████| 10/10 [00:00<00:00, 389.86it/s]

-- Mean stats ---
R: 11.26, C: 18.40





In [25]:
# Load the pre-computed `cpi_*` solution for the configured COEFFS and EPS, then evaluate it.
pi_sopt = load_sopt_policy(COEFFS, EPS)
do_ope(pi_sopt, mode='DR')

100%|██████████| 10/10 [00:00<00:00, 98.04it/s]

-- Mean stats ---
R: 86.75, C: 24.66



