In [None]:
import gym
import pybullet
import pybullet_envs
from gym import wrappers
from datetime import datetime

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass

In [None]:
#########################       Discrete State Space - KL DR Policy     #########################
# e.g. 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'
#
# Trains a DRPolicyKL policy on a discrete-state gym environment. Episodes are
# collected in batches of `batch_eps` until at least `total_eps` episodes have
# been run; after every batch both the policy and the value function are updated.

env_name = 'Taxi-v3'
# Keep the client id so the connection opened here can be released once training
# ends; otherwise every re-run of this cell leaves another connection open.
physics_client = pybullet.connect(pybullet.DIRECT)
env = gym.make(env_name)
sta_num = env.observation_space.n   # number of discrete states
act_num = env.action_space.n        # number of discrete actions
policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): the meaning of (1, 10) is defined in value.py -- presumably the
# observation dimension and a hidden-layer size multiplier; confirm there.
val_func = NNValueFunction(1, 10)
gamma = 0.9        # discount factor
lam = 1            # passed to add_gae (presumably the GAE lambda -- confirm in train_helper)
total_eps = 3000   # stop once at least this many episodes have been run
batch_eps = 100    # episodes requested per policy/value update
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        policy.update(observes, actions, advantages, disc_freqs)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Release resources even when training fails or the kernel is interrupted.
    logger.close()
    env.close()
    # pybullet.connect returns a negative id when the connection failed.
    if physics_client >= 0:
        pybullet.disconnect(physics_client)

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 100, Mean Return = -772.1, Mean Discounted Return = -41.1 *****
ExplainedVarNew: -3.59e-10
ExplainedVarOld: -5.73e-06
ValFuncLoss: 523


***** Episode 200, Mean Return = -670.1, Mean Discounted Return = -36.8 *****
ExplainedVarNew: -7.82e-11
ExplainedVarOld: -5.44e-10
ValFuncLoss: 120


***** Episode 300, Mean Return = -596.3, Mean Discounted Return = -30.0 *****
ExplainedVarNew: -4.17e-10
ExplainedVarOld: -1.31e-10
ValFuncLoss: 99.6


***** Episode 400, Mean Return = -484.5, Mean Discounted Return = -26.4 *****
ExplainedVarNew: -1.17e-06
ExplainedVarOld: -4.79e-11
ValFuncLoss: 93


***** Episode 500, Mean Return = -381.1, Mean Discounted Return = -22.1 *****
ExplainedVarNew: -0.00173
ExplainedVarOld: -2.88e-06
ValFuncLoss: 83.8


***** Episode 600, Mean Return = -317.2, Mean Discounted Return = -19.1 *****
ExplainedVarNew: -0.000717
ExplainedVarOld: -0.00608
ValFuncLoss: 69


***** Episode 700, Mean Return = -246.5, Mean 

In [4]:
#########################       Discrete State Space - Wasserstein DR Policy     #########################
# e.g. 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'
#
# Trains a DRPolicyWass policy on a discrete-state gym environment. Episodes are
# collected in batches of `batch_eps` until at least `total_eps` episodes have
# been run; after every batch both the policy and the value function are updated.

env_name = 'Taxi-v3'
# Keep the client id so the connection opened here can be released once training
# ends; otherwise every re-run of this cell leaves another connection open.
physics_client = pybullet.connect(pybullet.DIRECT)
env = gym.make(env_name)
sta_num = env.observation_space.n   # number of discrete states
act_num = env.action_space.n        # number of discrete actions
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the meaning of (1, 10) is defined in value.py -- presumably the
# observation dimension and a hidden-layer size multiplier; confirm there.
val_func = NNValueFunction(1, 10)
gamma = 0.8        # discount factor
lam = 1            # passed to add_gae (presumably the GAE lambda -- confirm in train_helper)
total_eps = 5000   # stop once at least this many episodes have been run
batch_eps = 100    # episodes requested per policy/value update
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages, disc_freqs)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Release resources even when training fails or the kernel is interrupted.
    logger.close()
    env.close()
    # pybullet.connect returns a negative id when the connection failed.
    if physics_client >= 0:
        pybullet.disconnect(physics_client)

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 100, Mean Return = -775.6, Mean Discounted Return = -19.5 *****
ExplainedVarNew: -2.01e-09
ExplainedVarOld: -8.33e-07
ValFuncLoss: 69.3


***** Episode 200, Mean Return = -704.6, Mean Discounted Return = -17.7 *****
ExplainedVarNew: -0.0157
ExplainedVarOld: -1.83e-08
ValFuncLoss: 59.2


***** Episode 300, Mean Return = -702.5, Mean Discounted Return = -17.6 *****
ExplainedVarNew: -0.0169
ExplainedVarOld: -0.0153
ValFuncLoss: 61


***** Episode 400, Mean Return = -564.7, Mean Discounted Return = -16.3 *****
ExplainedVarNew: -0.0802
ExplainedVarOld: -0.0744
ValFuncLoss: 66.5


***** Episode 500, Mean Return = -570.7, Mean Discounted Return = -16.0 *****
ExplainedVarNew: -0.0348
ExplainedVarOld: -0.0477
ValFuncLoss: 61.2


***** Episode 600, Mean Return = -506.5, Mean Discounted Return = -14.5 *****
ExplainedVarNew: -0.0678
ExplainedVarOld: -0.0826
ValFuncLoss: 59.9


***** Episode 700, Mean Return = -408.4, Mean Discounted R