In [3]:
import gym
import pybullet
import pybullet_envs
from gym import wrappers
from datetime import datetime

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass

# Discrete State Space - KL DR TRPO Policy
### Candidate `env_name` values (discrete-state Gym environments): 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [13]:
# --- Configuration: DR TRPO with the KL-based policy on a discrete environment ---
env_name = 'Taxi-v3'
gamma = 0.9        # discount factor passed to the return, GAE and state-frequency helpers
lam = 1            # lambda argument forwarded to add_gae
total_eps = 5000   # training stops once at least this many episodes were collected
batch_eps = 60     # batch size handed to run_policy for every policy update

# Keep the client id so the connection can be released when the cell ends;
# otherwise every re-run of this cell leaks one more physics client.
physics_client = pybullet.connect(pybullet.DIRECT)
try:
    env = gym.make(env_name)
    sta_num = env.observation_space.n
    act_num = env.action_space.n
    policy = DRPolicyKL(sta_num, act_num)
    # NOTE(review): arguments are presumably (obs_dim, hid1_mult) -- confirm in value.py
    val_func = NNValueFunction(1, 10)
    logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

    try:
        eps = 0
        while eps < total_eps:
            trajectories = run_policy(env, policy, batch_eps, logger)
            eps += len(trajectories)
            # add estimated values to episodes
            add_value(trajectories, val_func)
            # calculate discounted sum of rewards
            add_disc_sum_rew(trajectories, gamma, logger)
            # calculate advantage
            add_gae(trajectories, gamma, lam)
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
            disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
            policy.update(observes, actions, advantages, disc_freqs)
            val_func.fit(observes, disc_sum_rew, logger)
            # write logger results to file and stdout
            logger.write(display=True)
    finally:
        # Close the log even if training raises or the kernel is interrupted.
        logger.close()
finally:
    pybullet.disconnect(physicsClientId=physics_client)

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 60, Mean Return = -765.9, Mean Discounted Return = -39.9 *****
ExplainedVarNew: -8.31e-09
ExplainedVarOld: -4.01e-06
ValFuncLoss: 676


***** Episode 120, Mean Return = -499.3, Mean Discounted Return = -29.1 *****
ExplainedVarNew: -8.27e-10
ExplainedVarOld: -8.8e-09
ValFuncLoss: 108


***** Episode 180, Mean Return = -305.3, Mean Discounted Return = -20.1 *****
ExplainedVarNew: -1.35e-08
ExplainedVarOld: -3.09e-09
ValFuncLoss: 103


***** Episode 240, Mean Return = -254.0, Mean Discounted Return = -15.9 *****
ExplainedVarNew: -0.00981
ExplainedVarOld: -2.01e-08
ValFuncLoss: 49


***** Episode 300, Mean Return = -182.4, Mean Discounted Return = -12.0 *****
ExplainedVarNew: -0.00545
ExplainedVarOld: -0.0155
ValFuncLoss: 29.4


***** Episode 360, Mean Return = -171.2, Mean Discounted Return = -13.6 *****
ExplainedVarNew: -0.0015
ExplainedVarOld: -0.0041
ValFuncLoss: 26.6


***** Episode 420, Mean Return = -142.7, Mean Discoun

***** Episode 3420, Mean Return = -27.8, Mean Discounted Return = -6.0 *****
ExplainedVarNew: -0.0228
ExplainedVarOld: -0.0205
ValFuncLoss: 197


***** Episode 3480, Mean Return = -5.6, Mean Discounted Return = -5.7 *****
ExplainedVarNew: -0.049
ExplainedVarOld: -0.048
ValFuncLoss: 150


***** Episode 3540, Mean Return = -15.2, Mean Discounted Return = -5.8 *****
ExplainedVarNew: -0.0276
ExplainedVarOld: -0.0351
ValFuncLoss: 208


***** Episode 3600, Mean Return = -21.0, Mean Discounted Return = -4.6 *****
ExplainedVarNew: -0.0297
ExplainedVarOld: -0.0238
ValFuncLoss: 221


***** Episode 3660, Mean Return = -7.1, Mean Discounted Return = -3.8 *****
ExplainedVarNew: -0.0413
ExplainedVarOld: -0.0487
ValFuncLoss: 194


***** Episode 3720, Mean Return = -6.6, Mean Discounted Return = -4.8 *****
ExplainedVarNew: -0.0386
ExplainedVarOld: -0.049
ValFuncLoss: 179


***** Episode 3780, Mean Return = -16.1, Mean Discounted Return = -8.0 *****
ExplainedVarNew: -0.0577
ExplainedVarOld: -0.0408
Val

# Discrete State Space - Wasserstein DR TRPO Policy
### Candidate `env_name` values (discrete-state Gym environments): 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'

In [None]:
# --- Configuration: DR TRPO with the Wasserstein-based policy on a discrete environment ---
env_name = 'Taxi-v3'
gamma = 0.9        # discount factor passed to the return, GAE and state-frequency helpers
lam = 1            # lambda argument forwarded to add_gae
total_eps = 5000   # training stops once at least this many episodes were collected
batch_eps = 60     # batch size handed to run_policy for every policy update

# Keep the client id so the connection can be released when the cell ends;
# otherwise every re-run of this cell leaks one more physics client.
physics_client = pybullet.connect(pybullet.DIRECT)
try:
    env = gym.make(env_name)
    sta_num = env.observation_space.n
    act_num = env.action_space.n
    policy = DRPolicyWass(sta_num, act_num)
    # NOTE(review): arguments are presumably (obs_dim, hid1_mult) -- confirm in value.py
    val_func = NNValueFunction(1, 10)
    logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

    try:
        eps = 0
        while eps < total_eps:
            trajectories = run_policy(env, policy, batch_eps, logger)
            eps += len(trajectories)
            # add estimated values to episodes
            add_value(trajectories, val_func)
            # calculate discounted sum of rewards
            add_disc_sum_rew(trajectories, gamma, logger)
            # calculate advantage
            add_gae(trajectories, gamma, lam)
            # concatenate all episodes into single NumPy arrays
            observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
            disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
            log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
            policy.update(observes, actions, advantages, disc_freqs)
            val_func.fit(observes, disc_sum_rew, logger)
            # write logger results to file and stdout
            logger.write(display=True)
    finally:
        # Close the log even if training raises or the kernel is interrupted.
        logger.close()
finally:
    pybullet.disconnect(physicsClientId=physics_client)

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 60, Mean Return = -793.4, Mean Discounted Return = -39.0 *****
ExplainedVarNew: -8.92e-10
ExplainedVarOld: -5.32e-08
ValFuncLoss: 742


***** Episode 120, Mean Return = -803.1, Mean Discounted Return = -38.2 *****
ExplainedVarNew: -6.52e-12
ExplainedVarOld: -9.67e-11
ValFuncLoss: 392


***** Episode 180, Mean Return = -825.6, Mean Discounted Return = -38.8 *****
ExplainedVarNew: -5.6e-12
ExplainedVarOld: -1.29e-11
ValFuncLoss: 243


***** Episode 240, Mean Return = -797.5, Mean Discounted Return = -38.6 *****
ExplainedVarNew: -9.44e-12
ExplainedVarOld: -9.33e-12
ValFuncLoss: 297


***** Episode 300, Mean Return = -689.1, Mean Discounted Return = -35.1 *****
ExplainedVarNew: -2.54e-11
ExplainedVarOld: -7.79e-12
ValFuncLoss: 185


***** Episode 360, Mean Return = -659.3, Mean Discounted Return = -36.4 *****
ExplainedVarNew: -0.00166
ExplainedVarOld: -9.46e-12
ValFuncLoss: 168


***** Episode 420, Mean Return = -648.7, Mean D

***** Episode 3360, Mean Return = -92.0, Mean Discounted Return = -9.3 *****
ExplainedVarNew: -0.135
ExplainedVarOld: -0.097
ValFuncLoss: 75.2


***** Episode 3420, Mean Return = -154.6, Mean Discounted Return = -13.9 *****
ExplainedVarNew: -0.0674
ExplainedVarOld: -0.302
ValFuncLoss: 72.9


***** Episode 3480, Mean Return = -111.7, Mean Discounted Return = -10.2 *****
ExplainedVarNew: -0.109
ExplainedVarOld: -0.128
ValFuncLoss: 57.7


***** Episode 3540, Mean Return = -137.2, Mean Discounted Return = -11.6 *****
ExplainedVarNew: -0.14
ExplainedVarOld: -0.147
ValFuncLoss: 58.4


***** Episode 3600, Mean Return = -134.1, Mean Discounted Return = -10.7 *****
ExplainedVarNew: -0.158
ExplainedVarOld: -0.152
ValFuncLoss: 47.4


***** Episode 3660, Mean Return = -101.0, Mean Discounted Return = -10.3 *****
ExplainedVarNew: -0.106
ExplainedVarOld: -0.129
ValFuncLoss: 61.8


***** Episode 3720, Mean Return = -88.0, Mean Discounted Return = -8.4 *****
ExplainedVarNew: -0.0967
ExplainedVarOld: -