In [3]:
import gym
import pybullet
import pybullet_envs
from gym import wrappers
from datetime import datetime

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass

# Discrete State Space - KL DR TRPO Policy
### Discrete-state Gym environments that can be used for `env_name`: 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [13]:
# Train a KL-constrained DR TRPO policy (DRPolicyKL) on a discrete Gym environment.
# Each iteration collects a batch of episodes, derives returns and advantages,
# updates the policy, and then refits the value function on the same batch.
env_name = 'Taxi-v3'
# NOTE(review): opens a headless PyBullet client on every run of this cell and never
# disconnects it; it looks unnecessary for a non-PyBullet env such as Taxi-v3 — confirm
# before removing.
pybullet.connect(pybullet.DIRECT)
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete state space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
# presumably (obs_dim=1, hid1_mult=10); the run log reports "h1: 10" — confirm in value.py
val_func = NNValueFunction(1, 10)
gamma = 0.9  # discount factor used for returns, advantages and state frequencies
lam = 1  # lambda passed to add_gae
total_eps = 5000  # stop once at least this many episodes have been collected
batch_eps = 60  # batch size handed to run_policy
# NOTE(review): datetime.utcnow() is deprecated since Python 3.12; kept as-is so the
# log name format does not change.
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

# The try/finally guarantees the logger is closed even if training raises or the
# kernel is interrupted part-way through the run.
try:
    eps = 0
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        policy.update(observes, actions, advantages, disc_freqs)
        # fit the value function after the policy update, on the same batch
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 60, Mean Return = -765.9, Mean Discounted Return = -39.9 *****
ExplainedVarNew: -8.31e-09
ExplainedVarOld: -4.01e-06
ValFuncLoss: 676


***** Episode 120, Mean Return = -499.3, Mean Discounted Return = -29.1 *****
ExplainedVarNew: -8.27e-10
ExplainedVarOld: -8.8e-09
ValFuncLoss: 108


***** Episode 180, Mean Return = -305.3, Mean Discounted Return = -20.1 *****
ExplainedVarNew: -1.35e-08
ExplainedVarOld: -3.09e-09
ValFuncLoss: 103


***** Episode 240, Mean Return = -254.0, Mean Discounted Return = -15.9 *****
ExplainedVarNew: -0.00981
ExplainedVarOld: -2.01e-08
ValFuncLoss: 49


***** Episode 300, Mean Return = -182.4, Mean Discounted Return = -12.0 *****
ExplainedVarNew: -0.00545
ExplainedVarOld: -0.0155
ValFuncLoss: 29.4


***** Episode 360, Mean Return = -171.2, Mean Discounted Return = -13.6 *****
ExplainedVarNew: -0.0015
ExplainedVarOld: -0.0041
ValFuncLoss: 26.6


***** Episode 420, Mean Return = -142.7, Mean Discoun

***** Episode 3420, Mean Return = -27.8, Mean Discounted Return = -6.0 *****
ExplainedVarNew: -0.0228
ExplainedVarOld: -0.0205
ValFuncLoss: 197


***** Episode 3480, Mean Return = -5.6, Mean Discounted Return = -5.7 *****
ExplainedVarNew: -0.049
ExplainedVarOld: -0.048
ValFuncLoss: 150


***** Episode 3540, Mean Return = -15.2, Mean Discounted Return = -5.8 *****
ExplainedVarNew: -0.0276
ExplainedVarOld: -0.0351
ValFuncLoss: 208


***** Episode 3600, Mean Return = -21.0, Mean Discounted Return = -4.6 *****
ExplainedVarNew: -0.0297
ExplainedVarOld: -0.0238
ValFuncLoss: 221


***** Episode 3660, Mean Return = -7.1, Mean Discounted Return = -3.8 *****
ExplainedVarNew: -0.0413
ExplainedVarOld: -0.0487
ValFuncLoss: 194


***** Episode 3720, Mean Return = -6.6, Mean Discounted Return = -4.8 *****
ExplainedVarNew: -0.0386
ExplainedVarOld: -0.049
ValFuncLoss: 179


***** Episode 3780, Mean Return = -16.1, Mean Discounted Return = -8.0 *****
ExplainedVarNew: -0.0577
ExplainedVarOld: -0.0408
Val

# Discrete State Space - Wasserstein DR TRPO Policy
### Discrete-state Gym environments that can be used for `env_name`: 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [14]:
# Train a Wasserstein-constrained DR TRPO policy (DRPolicyWass) on a discrete Gym
# environment. Each iteration collects a batch of episodes, derives returns and
# advantages, updates the policy, and then refits the value function on the same batch.
env_name = 'Taxi-v3'
# NOTE(review): opens a headless PyBullet client on every run of this cell and never
# disconnects it; it looks unnecessary for a non-PyBullet env such as Taxi-v3 — confirm
# before removing.
pybullet.connect(pybullet.DIRECT)
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete state space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
# presumably (obs_dim=1, hid1_mult=10); the run log reports "h1: 10" — confirm in value.py
val_func = NNValueFunction(1, 10)
gamma = 0.9  # discount factor used for returns, advantages and state frequencies
lam = 1  # lambda passed to add_gae
total_eps = 5000  # stop once at least this many episodes have been collected
batch_eps = 60  # batch size handed to run_policy
# NOTE(review): datetime.utcnow() is deprecated since Python 3.12; kept as-is so the
# log name format does not change.
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

# The try/finally guarantees the logger is closed even if training raises or the
# kernel is interrupted part-way through the run.
try:
    eps = 0
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages, disc_freqs)
        # fit the value function after the policy update, on the same batch
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 60, Mean Return = -768.6, Mean Discounted Return = -38.4 *****
ExplainedVarNew: -6.23e-09
ExplainedVarOld: -6.45e-06
ValFuncLoss: 760


***** Episode 120, Mean Return = -704.8, Mean Discounted Return = -36.1 *****
ExplainedVarNew: -6.48e-10
ExplainedVarOld: -1.41e-08
ValFuncLoss: 239


***** Episode 180, Mean Return = -675.8, Mean Discounted Return = -37.2 *****
ExplainedVarNew: -5.75e-11
ExplainedVarOld: -1.2e-10
ValFuncLoss: 183


***** Episode 240, Mean Return = -607.8, Mean Discounted Return = -34.9 *****
ExplainedVarNew: -1.21e-10
ExplainedVarOld: -1.28e-10
ValFuncLoss: 177


***** Episode 300, Mean Return = -599.6, Mean Discounted Return = -34.7 *****
ExplainedVarNew: -2.62e-10
ExplainedVarOld: -1.97e-10
ValFuncLoss: 190


***** Episode 360, Mean Return = -542.6, Mean Discounted Return = -37.4 *****
ExplainedVarNew: -1.05e-10
ExplainedVarOld: -7.28e-11
ValFuncLoss: 193


***** Episode 420, Mean Return = -529.2, Mean 

***** Episode 3420, Mean Return = -76.2, Mean Discounted Return = -11.2 *****
ExplainedVarNew: -0.0538
ExplainedVarOld: -0.0518
ValFuncLoss: 95.6


***** Episode 3480, Mean Return = -138.2, Mean Discounted Return = -13.3 *****
ExplainedVarNew: -0.0319
ExplainedVarOld: -0.0325
ValFuncLoss: 105


***** Episode 3540, Mean Return = -98.6, Mean Discounted Return = -11.2 *****
ExplainedVarNew: -0.0474
ExplainedVarOld: -0.0559
ValFuncLoss: 91.5


***** Episode 3600, Mean Return = -76.9, Mean Discounted Return = -10.5 *****
ExplainedVarNew: -0.0599
ExplainedVarOld: -0.0559
ValFuncLoss: 80.5


***** Episode 3660, Mean Return = -68.5, Mean Discounted Return = -9.4 *****
ExplainedVarNew: -0.073
ExplainedVarOld: -0.0875
ValFuncLoss: 71.7


***** Episode 3720, Mean Return = -87.8, Mean Discounted Return = -10.5 *****
ExplainedVarNew: -0.0546
ExplainedVarOld: -0.0523
ValFuncLoss: 91.1


***** Episode 3780, Mean Return = -49.1, Mean Discounted Return = -8.9 *****
ExplainedVarNew: -0.0458
ExplainedVar