In [2]:
import gym
from datetime import datetime

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass

# Discrete State Space - KL DRPO 
### Environments: 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [3]:
# Train a KL-constrained distributionally robust policy (DRPolicyKL) on a
# discrete-state gym environment, logging statistics after every batch.

# --- Experiment configuration ---
env_name = 'Taxi-v3'
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n       # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): the meaning of the positional arguments (1, 10) is defined by
# NNValueFunction's constructor, which is not visible here -- confirm.
val_func = NNValueFunction(1, 10)
gamma = 0.9       # discount factor passed to the return, GAE and frequency helpers
lam = 1           # lambda passed to add_gae
total_eps = 5000  # training stops once at least this many episodes were collected
batch_eps = 60    # batch size handed to run_policy
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

# --- Training loop ---
eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        # The counter advances by whole batches, so the final count may
        # overshoot total_eps.
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        policy.update(observes, actions, advantages, disc_freqs, env_name)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the kernel is interrupted,
    # so the log is not left open after a partial run.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
optimal beta is: 10.759811894736535
Instructions for updating:
Use tf.cast instead.
***** Episode 60, Mean Return = -771.7, Mean Discounted Return = -39.9 *****
ExplainedVarNew: -1.96e-09
ExplainedVarOld: -7.42e-07
ValFuncLoss: 698


optimal beta is: 10.969617453913648
***** Episode 120, Mean Return = -743.3, Mean Discounted Return = -38.0 *****
ExplainedVarNew: -1.17e-10
ExplainedVarOld: -1.71e-09
ValFuncLoss: 192


optimal beta is: 10.362389125335996
***** Episode 180, Mean Return = -701.6, Mean Discounted Return = -37.4 *****
ExplainedVarNew: -7.51e-11
ExplainedVarOld: -1.35e-10
ValFuncLoss: 116


optimal beta is: 10.80828070634243
***** Episode 240, Mean Return = -677.1, Mean Discounted Return = -34.7 *****
ExplainedVarNew: -6.3e-10
ExplainedVarOld: -3.21e-10
ValFuncLoss: 119


optimal beta is: 10.807662990490586
*

optimal beta is: 6.49715097948477
***** Episode 2460, Mean Return = -60.2, Mean Discounted Return = -10.7 *****
ExplainedVarNew: -0.0124
ExplainedVarOld: -0.0107
ValFuncLoss: 117


optimal beta is: 6.343450389757259
***** Episode 2520, Mean Return = -61.0, Mean Discounted Return = -11.4 *****
ExplainedVarNew: -0.0233
ExplainedVarOld: -0.0224
ValFuncLoss: 121


optimal beta is: 6.69513876269245
***** Episode 2580, Mean Return = -49.6, Mean Discounted Return = -10.4 *****
ExplainedVarNew: -0.0319
ExplainedVarOld: -0.0176
ValFuncLoss: 135


optimal beta is: 6.681701387126764
***** Episode 2640, Mean Return = -50.5, Mean Discounted Return = -12.0 *****
ExplainedVarNew: -0.0322
ExplainedVarOld: -0.0218
ValFuncLoss: 113


optimal beta is: 5.981690027084864
***** Episode 2700, Mean Return = -47.0, Mean Discounted Return = -11.6 *****
ExplainedVarNew: -0.0412
ExplainedVarOld: -0.0615
ValFuncLoss: 128


optimal beta is: 5.28836279321935
***** Episode 2760, Mean Return = -29.4, Mean Discounted R

In [None]:
# Roll out the current policy for 1000 evaluation episodes and print the
# per-episode averages of the three statistics returned by episode_stats.
# NOTE(review): the meaning of each returned value is inferred from the
# variable names only -- confirm against episode_stats.
total_illegal_action = total_success_dropoff = total_length = 0
for i in range(1000):
    illegal_action, success_dropoff, eps_length = episode_stats(env, policy)
    total_length = total_length + eps_length
    total_success_dropoff = total_success_dropoff + success_dropoff
    total_illegal_action = total_illegal_action + illegal_action
    # separator line emitted once per evaluated episode
    print('------------------------')
# Averages, reported in the order: illegal actions, successful drop-offs, length.
for running_total in (total_illegal_action, total_success_dropoff, total_length):
    print(running_total / 1000)

# Discrete State Space - Wasserstein DRPO 
### Environments: 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [3]:
# Train a Wasserstein-constrained distributionally robust policy (DRPolicyWass)
# on a discrete-state gym environment, logging statistics after every batch.

# --- Experiment configuration ---
env_name = 'Taxi-v3'
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n       # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the meaning of the positional arguments (1, 10) is defined by
# NNValueFunction's constructor, which is not visible here -- confirm.
val_func = NNValueFunction(1, 10)
gamma = 0.9       # discount factor passed to the return, GAE and frequency helpers
lam = 1           # lambda passed to add_gae
total_eps = 5000  # training stops once at least this many episodes were collected
batch_eps = 60    # batch size handed to run_policy
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

# --- Training loop ---
eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        # The counter advances by whole batches, so the final count may
        # overshoot total_eps.
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        # Unlike the KL policy, the Wasserstein update also receives the
        # running episode count.
        policy.update(observes, actions, advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the kernel is interrupted,
    # so the log is not left open after a partial run.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
optimal beta is: 3.3667469155807015
Instructions for updating:
Use tf.cast instead.
***** Episode 60, Mean Return = -751.1, Mean Discounted Return = -36.8 *****
ExplainedVarNew: -9.55e-10
ExplainedVarOld: -1.88e-05
ValFuncLoss: 707


optimal beta is: 3.1700309639963336
***** Episode 120, Mean Return = -751.2, Mean Discounted Return = -38.3 *****
ExplainedVarNew: -1.12e-10
ExplainedVarOld: -1.9e-09
ValFuncLoss: 225


optimal beta is: 2.956200697425149
***** Episode 180, Mean Return = -722.1, Mean Discounted Return = -38.2 *****
ExplainedVarNew: -9.88e-11
ExplainedVarOld: -2.37e-10
ValFuncLoss: 155


optimal beta is: 3.0488891966421092
***** Episode 240, Mean Return = -728.0, Mean Discounted Return = -40.0 *****
ExplainedVarNew: -1.25e-10
ExplainedVarOld: -1.43e-10
ValFuncLoss: 131


optimal beta is: 3.108267867238275
**

***** Episode 2820, Mean Return = -166.4, Mean Discounted Return = -19.8 *****
ExplainedVarNew: -0.0514
ExplainedVarOld: -0.0577
ValFuncLoss: 314


***** Episode 2880, Mean Return = -118.7, Mean Discounted Return = -19.3 *****
ExplainedVarNew: -0.0593
ExplainedVarOld: -0.0838
ValFuncLoss: 364


***** Episode 2940, Mean Return = -150.7, Mean Discounted Return = -20.2 *****
ExplainedVarNew: -0.0558
ExplainedVarOld: -0.0755
ValFuncLoss: 252


***** Episode 3000, Mean Return = -105.0, Mean Discounted Return = -17.3 *****
ExplainedVarNew: -0.0562
ExplainedVarOld: -0.0683
ValFuncLoss: 348


***** Episode 3060, Mean Return = -138.1, Mean Discounted Return = -17.1 *****
ExplainedVarNew: -0.0596
ExplainedVarOld: -0.0494
ValFuncLoss: 322


***** Episode 3120, Mean Return = -112.1, Mean Discounted Return = -17.3 *****
ExplainedVarNew: -0.0762
ExplainedVarOld: -0.0619
ValFuncLoss: 342


***** Episode 3180, Mean Return = -131.8, Mean Discounted Return = -18.9 *****
ExplainedVarNew: -0.0812
Explaine

In [None]:
# Roll out the current policy for 1000 evaluation episodes and print the
# per-episode averages of the three statistics returned by episode_stats.
# NOTE(review): the meaning of each returned value is inferred from the
# variable names only -- confirm against episode_stats.
total_illegal_action = total_success_dropoff = total_length = 0
for i in range(1000):
    illegal_action, success_dropoff, eps_length = episode_stats(env, policy)
    total_length = total_length + eps_length
    total_success_dropoff = total_success_dropoff + success_dropoff
    total_illegal_action = total_illegal_action + illegal_action
    # separator line emitted once per evaluated episode
    print('------------------------')
# Averages, reported in the order: illegal actions, successful drop-offs, length.
for running_total in (total_illegal_action, total_success_dropoff, total_length):
    print(running_total / 1000)