In [2]:
import gym
import pybullet
import pybullet_envs
from gym import wrappers
from datetime import datetime

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicyKL, DRPolicyWass

In [3]:
#########################     Discrete State Space - KL DR Policy     #########################
# 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'
#
# Trains a DRPolicyKL policy together with an NNValueFunction on a gym
# environment whose observation and action spaces are both discrete (they must
# expose `.n`). Episodes are collected in batches of `batch_eps` until at least
# `total_eps` episodes have been gathered.

env_name = 'NChain-v0'
pybullet.connect(pybullet.DIRECT)
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete states
act_num = env.action_space.n       # number of discrete actions
policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): arguments are presumably (observation dim, hidden-layer size
# multiplier) -- confirm against value.NNValueFunction.
val_func = NNValueFunction(1, 10)
gamma = 0.9       # discount factor used by add_disc_sum_rew, add_gae and find_disc_freqs
lam = 1           # lambda parameter handed to add_gae
total_eps = 2000  # stop once at least this many episodes have been run
batch_eps = 20    # episodes requested from run_policy per update
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
# The loop is wrapped in try/finally so the logger is closed even when training
# is interrupted (e.g. a kernel interrupt) or one of the helpers raises.
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        policy.update(observes, actions, advantages, disc_freqs)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 20, Mean Return = 1311.1, Mean Discounted Return = 13.3 *****
ExplainedVarNew: -7.37e-07
ExplainedVarOld: -0.000309
ValFuncLoss: 28.8


***** Episode 40, Mean Return = 1619.2, Mean Discounted Return = 14.7 *****
ExplainedVarNew: -0.0605
ExplainedVarOld: -2.53e-07
ValFuncLoss: 93.2


***** Episode 60, Mean Return = 2023.8, Mean Discounted Return = 14.5 *****
ExplainedVarNew: -0.212
ExplainedVarOld: -0.0395
ValFuncLoss: 210


***** Episode 80, Mean Return = 2543.3, Mean Discounted Return = 18.8 *****
ExplainedVarNew: -0.28
ExplainedVarOld: -0.176
ValFuncLoss: 344


***** Episode 100, Mean Return = 2997.3, Mean Discounted Return = 21.3 *****
ExplainedVarNew: -0.293
ExplainedVarOld: -0.265
ValFuncLoss: 419


***** Episode 120, Mean Return = 3205.1, Mean Discounted Return = 20.3 *****
ExplainedVarNew: -0.271
ExplainedVarOld: -0.287
ValFuncLoss: 435


***** Episode 140, Mean Return = 3298.6, Mean Discounted Return = 20.6 *****
E

***** Episode 1160, Mean Return = 3697.3, Mean Discounted Return = 24.4 *****
ExplainedVarNew: -0.207
ExplainedVarOld: -0.205
ValFuncLoss: 440


***** Episode 1180, Mean Return = 3796.0, Mean Discounted Return = 26.0 *****
ExplainedVarNew: -0.197
ExplainedVarOld: -0.185
ValFuncLoss: 490


***** Episode 1200, Mean Return = 3590.7, Mean Discounted Return = 24.1 *****
ExplainedVarNew: -0.225
ExplainedVarOld: -0.231
ValFuncLoss: 426


***** Episode 1220, Mean Return = 3630.9, Mean Discounted Return = 19.5 *****
ExplainedVarNew: -0.191
ExplainedVarOld: -0.21
ValFuncLoss: 443


***** Episode 1240, Mean Return = 3623.6, Mean Discounted Return = 30.2 *****
ExplainedVarNew: -0.215
ExplainedVarOld: -0.201
ValFuncLoss: 427


***** Episode 1260, Mean Return = 3739.9, Mean Discounted Return = 28.6 *****
ExplainedVarNew: -0.197
ExplainedVarOld: -0.208
ValFuncLoss: 440


***** Episode 1280, Mean Return = 3684.7, Mean Discounted Return = 25.6 *****
ExplainedVarNew: -0.217
ExplainedVarOld: -0.195
ValFu

In [None]:
####################   Discrete State Space - Wasserstein DR Policy     #####################
#'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0', 'FrozenLake8x8-v0'
#
# Trains a DRPolicyWass policy together with an NNValueFunction on a gym
# environment whose observation and action spaces are both discrete (they must
# expose `.n`). Episodes are collected in batches of `batch_eps` until at least
# `total_eps` episodes have been gathered.

env_name = 'NChain-v0'
pybullet.connect(pybullet.DIRECT)
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete states
act_num = env.action_space.n       # number of discrete actions
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): arguments are presumably (observation dim, hidden-layer size
# multiplier) -- confirm against value.NNValueFunction.
val_func = NNValueFunction(1, 10)
gamma = 0.8       # discount factor used by add_disc_sum_rew, add_gae and find_disc_freqs
lam = 1           # lambda parameter handed to add_gae
total_eps = 1500  # stop once at least this many episodes have been run
batch_eps = 15    # episodes requested from run_policy per update
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
# The loop is wrapped in try/finally so the logger is closed even when training
# is interrupted (e.g. a kernel interrupt) or one of the helpers raises.
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages, disc_freqs)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 10, Mean Return = 1272.8, Mean Discounted Return = 4.9 *****
ExplainedVarNew: -0.000206
ExplainedVarOld: -0.000441
ValFuncLoss: 12.3


***** Episode 20, Mean Return = 2583.6, Mean Discounted Return = 5.7 *****
ExplainedVarNew: -0.226
ExplainedVarOld: -2.12e-05
ValFuncLoss: 158


***** Episode 30, Mean Return = 2389.4, Mean Discounted Return = 6.0 *****
ExplainedVarNew: -0.482
ExplainedVarOld: -0.235
ValFuncLoss: 161


***** Episode 40, Mean Return = 2528.8, Mean Discounted Return = 8.8 *****
ExplainedVarNew: -0.491
ExplainedVarOld: -0.456
ValFuncLoss: 182


***** Episode 50, Mean Return = 2629.2, Mean Discounted Return = 7.7 *****
ExplainedVarNew: -0.504
ExplainedVarOld: -0.48
ValFuncLoss: 193


***** Episode 60, Mean Return = 2423.6, Mean Discounted Return = 6.4 *****
ExplainedVarNew: -0.518
ExplainedVarOld: -0.546
ValFuncLoss: 167


***** Episode 70, Mean Return = 2473.0, Mean Discounted Return = 6.9 *****
ExplainedVarNe

***** Episode 580, Mean Return = 3415.2, Mean Discounted Return = 10.9 *****
ExplainedVarNew: -0.415
ExplainedVarOld: -0.417
ValFuncLoss: 221


***** Episode 590, Mean Return = 3463.6, Mean Discounted Return = 9.4 *****
ExplainedVarNew: -0.414
ExplainedVarOld: -0.448
ValFuncLoss: 205


***** Episode 600, Mean Return = 3521.0, Mean Discounted Return = 9.0 *****
ExplainedVarNew: -0.388
ExplainedVarOld: -0.376
ValFuncLoss: 221


***** Episode 610, Mean Return = 3420.4, Mean Discounted Return = 9.9 *****
ExplainedVarNew: -0.424
ExplainedVarOld: -0.422
ValFuncLoss: 208


***** Episode 620, Mean Return = 3531.8, Mean Discounted Return = 8.8 *****
ExplainedVarNew: -0.388
ExplainedVarOld: -0.376
ValFuncLoss: 231


***** Episode 630, Mean Return = 1846.2, Mean Discounted Return = 5.5 *****
ExplainedVarNew: -0.564
ExplainedVarOld: -0.472
ValFuncLoss: 117


***** Episode 640, Mean Return = 2107.0, Mean Discounted Return = 4.6 *****
ExplainedVarNew: -0.502
ExplainedVarOld: -0.485
ValFuncLoss: 153
