In [5]:
import gym
import pybullet
import pybullet_envs
from gym import wrappers
from datetime import datetime

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicy, DRPolicyWass
from dr_policy_cont import DRPolicyCont, DRPolicyContWass

In [6]:
#########################       Discrete State Space - KL DR Policy     #########################
# Trains a DRPolicy (KL variant) on a discrete-state gym environment, batch by batch.
# e.g. 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0'

env_name = 'NChain-v0'

# Hyperparameters
gamma = 0.8       # discount factor passed to add_disc_sum_rew / add_gae
lam = 1           # lambda passed to add_gae
total_eps = 500   # stop once at least this many episodes have been collected
batch_eps = 20    # episodes requested from run_policy per policy/value update

# Connecting unconditionally opens an additional physics server on every
# re-run of this cell; only connect when no client is active yet.
if not pybullet.isConnected():
    pybullet.connect(pybullet.DIRECT)

env = gym.make(env_name)
sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicy(sta_num, act_num)
# NOTE(review): the logged "Value Params" line (h1: 3) suggests the arguments
# are (observation dim, first-hidden-layer multiplier) -- confirm in value.py.
val_func = NNValueFunction(1, 3)
# The batch size in the log name is derived from batch_eps so the two cannot drift apart.
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps),
                now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Release the environment even when training is interrupted or fails.
    env.close()

Value Params -- h1: 3, h2: 3, h3: 5, lr: 0.00577
***** Episode 20, Mean Return = 1323.4, Mean Discounted Return = 5.2 *****
ExplainedVarNew: -0.104
ExplainedVarOld: -0.00175
ValFuncLoss: 18.5


***** Episode 40, Mean Return = 1910.4, Mean Discounted Return = 7.0 *****
ExplainedVarNew: -0.325
ExplainedVarOld: -0.0342
ValFuncLoss: 88.4


***** Episode 60, Mean Return = 2352.7, Mean Discounted Return = 6.6 *****
ExplainedVarNew: -0.445
ExplainedVarOld: -0.268
ValFuncLoss: 151


***** Episode 80, Mean Return = 2555.5, Mean Discounted Return = 8.1 *****
ExplainedVarNew: -0.516
ExplainedVarOld: -0.46
ValFuncLoss: 171


***** Episode 100, Mean Return = 2731.8, Mean Discounted Return = 6.9 *****
ExplainedVarNew: -0.481
ExplainedVarOld: -0.521
ValFuncLoss: 177


***** Episode 120, Mean Return = 2722.5, Mean Discounted Return = 8.0 *****
ExplainedVarNew: -0.452
ExplainedVarOld: -0.479
ValFuncLoss: 174


***** Episode 140, Mean Return = 2907.8, Mean Discounted Return = 9.5 *****
ExplainedVarNew: 

In [7]:
#########################       Discrete State Space - Wasserstein DR Policy     #########################
# Trains a DRPolicyWass (Wasserstein variant) on a discrete-state gym environment, batch by batch.
# e.g. 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0'

env_name = 'NChain-v0'

# Hyperparameters
gamma = 0.8       # discount factor passed to add_disc_sum_rew / add_gae
lam = 1           # lambda passed to add_gae
total_eps = 200   # stop once at least this many episodes have been collected
batch_eps = 20    # episodes requested from run_policy per policy/value update

# Connecting unconditionally opens an additional physics server on every
# re-run of this cell; only connect when no client is active yet.
if not pybullet.isConnected():
    pybullet.connect(pybullet.DIRECT)

env = gym.make(env_name)
sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the logged "Value Params" line (h1: 3) suggests the arguments
# are (observation dim, first-hidden-layer multiplier) -- confirm in value.py.
val_func = NNValueFunction(1, 3)
# The batch size in the log name is derived from batch_eps so the two cannot drift apart.
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps),
                now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Release the environment even when training is interrupted or fails.
    env.close()

Value Params -- h1: 3, h2: 3, h3: 5, lr: 0.00577
***** Episode 20, Mean Return = 1328.1, Mean Discounted Return = 5.3 *****
ExplainedVarNew: -0.164
ExplainedVarOld: -0.00479
ValFuncLoss: 19.6


***** Episode 40, Mean Return = 3666.7, Mean Discounted Return = 7.6 *****
ExplainedVarNew: -0.382
ExplainedVarOld: -0.0487
ValFuncLoss: 236


***** Episode 60, Mean Return = 3699.0, Mean Discounted Return = 9.3 *****
ExplainedVarNew: -0.362
ExplainedVarOld: -0.375
ValFuncLoss: 230


***** Episode 80, Mean Return = 3647.8, Mean Discounted Return = 10.4 *****
ExplainedVarNew: -0.378
ExplainedVarOld: -0.373
ValFuncLoss: 226


***** Episode 100, Mean Return = 3832.0, Mean Discounted Return = 8.7 *****
ExplainedVarNew: -0.373
ExplainedVarOld: -0.362
ValFuncLoss: 236


***** Episode 120, Mean Return = 3639.5, Mean Discounted Return = 9.6 *****
ExplainedVarNew: -0.413
ExplainedVarOld: -0.374
ValFuncLoss: 241


***** Episode 140, Mean Return = 3640.5, Mean Discounted Return = 8.1 *****
ExplainedVarNew:

In [None]:
#########################       Continuous State Space      #########################
# Trains a DRPolicyContWass on a gym environment with a continuous (Box)
# observation space and a discrete action space, batch by batch.
env_name = 'MountainCar-v0'

# Hyperparameters
gamma = 0.8            # discount factor passed to add_disc_sum_rew / add_gae
lam = 1                # lambda passed to add_gae
total_eps = 5000       # stop once at least this many episodes have been collected
batch_eps = 50         # episodes requested from run_policy per policy/value update
discretize_level = 2   # handed to DRPolicyContWass; presumably bins per state dimension -- TODO confirm

# Connecting unconditionally opens an additional physics server on every
# re-run of this cell; only connect when no client is active yet.
if not pybullet.isConnected():
    pybullet.connect(pybullet.DIRECT)

env = gym.make(env_name)
act_num = env.action_space.n
# Per-dimension bounds of the observation space, forwarded to the policy.
upper_bound = env.observation_space.high
lower_bound = env.observation_space.low
sta_dim = env.observation_space.shape[0]
policy = DRPolicyContWass(discretize_level, sta_dim, upper_bound, lower_bound, act_num)
# NOTE(review): the logged "Value Params" line (h1: 20) suggests the arguments
# are (observation dim, first-hidden-layer multiplier) -- confirm in value.py.
val_func = NNValueFunction(sta_dim, 10)
logger = Logger(logname=env_name, now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Release the environment even when training is interrupted or fails.
    env.close()

Value Params -- h1: 20, h2: 10, h3: 5, lr: 0.00316
***** Episode 50, Mean Return = -200.0, Mean Discounted Return = -5.0 *****
ExplainedVarNew: -0.000193
ExplainedVarOld: -0.0132
ValFuncLoss: 0.212


***** Episode 100, Mean Return = -200.0, Mean Discounted Return = -5.0 *****
ExplainedVarNew: -9.4e-05
ExplainedVarOld: -0.000136
ValFuncLoss: 0.212


***** Episode 150, Mean Return = -200.0, Mean Discounted Return = -5.0 *****
ExplainedVarNew: -4.1e-05
ExplainedVarOld: -9.72e-05
ValFuncLoss: 0.212


***** Episode 200, Mean Return = -200.0, Mean Discounted Return = -5.0 *****
ExplainedVarNew: -1.53e-05
ExplainedVarOld: -3.59e-05
ValFuncLoss: 0.212


***** Episode 250, Mean Return = -200.0, Mean Discounted Return = -5.0 *****
ExplainedVarNew: -5.71e-06
ExplainedVarOld: -1.29e-05
ValFuncLoss: 0.212


***** Episode 300, Mean Return = -200.0, Mean Discounted Return = -5.0 *****
ExplainedVarNew: -3.57e-06
ExplainedVarOld: -8.65e-06
ValFuncLoss: 0.212


***** Episode 350, Mean Return = -200.0, M