In [1]:
import gym
from datetime import datetime

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass, DRPolicySinkhorn

# Discrete State Space - KL DRPO 
### 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [2]:
# Train a KL-based distributionally robust policy (DRPolicyKL) on a discrete
# Gym environment, logging per-batch statistics through `logger`.
env_name = 'Taxi-v3'
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): the meaning of (1, 10) is defined in value.py -- presumably the
# observation dimension and a hidden-layer size multiplier; confirm there.
val_func = NNValueFunction(1, 10)
gamma = 0.9  # discount factor forwarded to the return/advantage/frequency helpers
lam = 1  # lambda argument forwarded to add_gae
total_eps = 1000  # training stops once at least this many episodes were collected
batch_eps = 60  # batch size argument forwarded to run_policy on every iteration
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
try:
    # `eps` grows by the number of trajectories actually returned, so the final
    # count may overshoot `total_eps` when it is not a multiple of the batch size.
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        # update the policy first, then refit the value function on the same batch
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even if a training step raises or the cell is
    # interrupted, so the log is not left open.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 60, Mean Return = -755.4, Mean Discounted Return = -39.9 *****
ExplainedVarNew: -4.75e-10
ExplainedVarOld: -4.64e-07
ValFuncLoss: 709


***** Episode 120, Mean Return = -483.6, Mean Discounted Return = -25.3 *****
ExplainedVarNew: -7.83e-11
ExplainedVarOld: -7.43e-10
ValFuncLoss: 110


***** Episode 180, Mean Return = -262.7, Mean Discounted Return = -16.6 *****
ExplainedVarNew: -2.64e-10
ExplainedVarOld: -1.35e-10
ValFuncLoss: 81.4


***** Episode 240, Mean Return = -213.2, Mean Discounted Return = -13.5 *****
ExplainedVarNew: -2.66e-08
ExplainedVarOld: -1.6e-09
ValFuncLoss: 12.4


***** Episode 300, Mean Return = -218.1, Mean Discounted Return = -14.4 *****
ExplainedVarNew: -3.44e-05
ExplainedVarOld: -6.68e-09
ValFuncLoss: 12.9


***** Episode 360, Mean Return = -231.9, Mean Discounted Return = -14.9 *****
ExplainedVarNew: -0.0121
ExplainedVarOld: -8.15e-06
ValFuncLoss: 65.5


***** Episode 420, Mean Return = -236.5, Mea

# Discrete State Space - Sinkhorn DRPO 
### 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [4]:
# Train a Sinkhorn-based distributionally robust policy (DRPolicySinkhorn) on a
# discrete Gym environment, logging per-batch statistics through `logger`.
env_name = 'Taxi-v3'
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicySinkhorn(sta_num, act_num)
# NOTE(review): the meaning of (1, 10) is defined in value.py -- presumably the
# observation dimension and a hidden-layer size multiplier; confirm there.
val_func = NNValueFunction(1, 10)
gamma = 0.9  # discount factor forwarded to the return/advantage/frequency helpers
lam = 1  # lambda argument forwarded to add_gae
total_eps = 1000  # training stops once at least this many episodes were collected
batch_eps = 60  # batch size argument forwarded to run_policy on every iteration
logger = Logger(logname=env_name + '_DR-Sinkhorn_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
try:
    # `eps` grows by the number of trajectories actually returned, so the final
    # count may overshoot `total_eps` when it is not a multiple of the batch size.
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        # update the policy first, then refit the value function on the same batch
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even if a training step raises or the cell is
    # interrupted, so the log is not left open.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 60, Mean Return = -784.0, Mean Discounted Return = -38.7 *****
ExplainedVarNew: -1.04e-09
ExplainedVarOld: -8.37e-07
ValFuncLoss: 749


***** Episode 120, Mean Return = -715.2, Mean Discounted Return = -38.4 *****
ExplainedVarNew: -2.3e-10
ExplainedVarOld: -3.63e-09
ValFuncLoss: 226


***** Episode 180, Mean Return = -679.5, Mean Discounted Return = -35.9 *****
ExplainedVarNew: -6.07e-11
ExplainedVarOld: -1.04e-10
ValFuncLoss: 169


***** Episode 240, Mean Return = -664.4, Mean Discounted Return = -38.3 *****
ExplainedVarNew: -1.47e-10
ExplainedVarOld: -1.07e-10
ValFuncLoss: 164


***** Episode 300, Mean Return = -617.3, Mean Discounted Return = -36.4 *****
ExplainedVarNew: -1.48e-10
ExplainedVarOld: -4.94e-11
ValFuncLoss: 195


***** Episode 360, Mean Return = -584.1, Mean Discounted Return = -34.4 *****
ExplainedVarNew: -2.84e-06
ExplainedVarOld: -2.98e-10
ValFuncLoss: 215


***** Episode 420, Mean Return = -532.3, Mean 

# Discrete State Space - Wasserstein DRPO 
### 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [None]:
# Train a Wasserstein-based distributionally robust policy (DRPolicyWass) on a
# discrete Gym environment, logging per-batch statistics through `logger`.
env_name = 'NChain-v0'
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the meaning of (1, 10) is defined in value.py -- presumably the
# observation dimension and a hidden-layer size multiplier; confirm there.
val_func = NNValueFunction(1, 10)
gamma = 0.9  # discount factor forwarded to the return/advantage/frequency helpers
lam = 1  # lambda argument forwarded to add_gae
total_eps = 200  # training stops once at least this many episodes were collected
batch_eps = 1  # batch size argument forwarded to run_policy on every iteration
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
try:
    # `eps` grows by the number of trajectories actually returned by run_policy.
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        # update the policy first, then refit the value function on the same batch
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even if a training step raises or the cell is
    # interrupted, so the log is not left open.
    logger.close()

### Policy Statistics

In [None]:
# Roll out `policy` on `env` for a fixed number of evaluation episodes and print
# the per-episode averages of illegal actions, successful drop-offs and length.
#
# This cell reads `env` and `policy` from whichever training cell was executed
# last, so re-run the desired training cell before evaluating.
# NOTE(review): "illegal action" / "success dropoff" look Taxi-specific, while
# the training cell directly above uses 'NChain-v0' -- confirm episode_stats is
# meaningful for the environment currently bound to `env`.
n_eval_eps = 1000  # single source of truth for the loop bound and the divisors

total_length = 0
total_success_dropoff = 0
total_illegal_action = 0
for i in range(n_eval_eps):
    illegal_action, success_dropoff, eps_length = episode_stats(env, policy)
    total_illegal_action += illegal_action
    total_success_dropoff += success_dropoff
    total_length += eps_length
    # separator printed once per evaluated episode
    print('------------------------')
# averages over all evaluation episodes, in the order: illegal actions,
# successful drop-offs, episode length
print(total_illegal_action / n_eval_eps)
print(total_success_dropoff / n_eval_eps)
print(total_length / n_eval_eps)