In [None]:
import gym
import pybullet
import pybullet_envs
from gym import wrappers
from datetime import datetime

# DR TRPO related files
from train_helper import *
from value import NNValueFunction
from utils import Logger
from dr_policy import DRPolicy, DRPolicyWass
from dr_policy_cont import DRPolicyCont, DRPolicyContWass

In [None]:
#########################       Discrete State Space - KL DR Policy     #########################
# Trains a DRPolicy on a gym environment whose observation and action spaces
# are both discrete (both are read through their `.n` attribute below).
# e.g. 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0'

env_name = 'NChain-v0'
# Keep the client id returned by connect() so the connection opened by this
# cell can be released again; otherwise every re-run of the cell leaks one.
# NOTE(review): the environments listed above are plain gym environments, so
# this pybullet connection looks unnecessary for them - confirm before removing.
bullet_client = pybullet.connect(pybullet.DIRECT)
env = None
try:
    env = gym.make(env_name)
    sta_num = env.observation_space.n
    act_num = env.action_space.n
    policy = DRPolicy(sta_num, act_num)
    # presumably (observation dimension, hidden-layer size multiplier) - TODO confirm in value.py
    val_func = NNValueFunction(1, 3)
    # The log name is tagged with the current UTC time.
    logger = Logger(logname=env_name + '_DR-KL_Batch=20', now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))
    gamma = 0.8      # discount factor passed to add_disc_sum_rew / add_gae
    lam = 1          # third argument of add_gae (presumably the GAE lambda - TODO confirm)
    total_eps = 500  # stop once at least this many episodes have been collected
    batch_eps = 20   # batch size handed to run_policy for each policy update

    eps = 0
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Runs on normal completion, on errors and on a notebook interrupt, so the
    # environment and the physics connection are always released.
    if env is not None:
        env.close()
    # connect() returns a negative id when no connection was established.
    if bullet_client >= 0:
        pybullet.disconnect(physicsClientId=bullet_client)

In [None]:
#########################       Discrete State Space - Wasserstein DR Policy     #########################
# Trains a DRPolicyWass on a gym environment whose observation and action
# spaces are both discrete (both are read through their `.n` attribute below).
# e.g. 'Taxi-v3', 'Roulette-v0', 'NChain-v0', 'FrozenLake-v0', 'CliffWalking-v0'

env_name = 'NChain-v0'
# Keep the client id returned by connect() so the connection opened by this
# cell can be released again; otherwise every re-run of the cell leaks one.
# NOTE(review): the environments listed above are plain gym environments, so
# this pybullet connection looks unnecessary for them - confirm before removing.
bullet_client = pybullet.connect(pybullet.DIRECT)
env = None
try:
    env = gym.make(env_name)
    sta_num = env.observation_space.n
    act_num = env.action_space.n
    policy = DRPolicyWass(sta_num, act_num)
    # presumably (observation dimension, hidden-layer size multiplier) - TODO confirm in value.py
    val_func = NNValueFunction(1, 3)
    # The log name is tagged with the current UTC time.
    logger = Logger(logname=env_name + '_DR-Wass_Batch=20', now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))
    gamma = 0.8      # discount factor passed to add_disc_sum_rew / add_gae
    lam = 1          # third argument of add_gae (presumably the GAE lambda - TODO confirm)
    total_eps = 200  # stop once at least this many episodes have been collected
    batch_eps = 20   # batch size handed to run_policy for each policy update

    eps = 0
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Runs on normal completion, on errors and on a notebook interrupt, so the
    # environment and the physics connection are always released.
    if env is not None:
        env.close()
    # connect() returns a negative id when no connection was established.
    if bullet_client >= 0:
        pybullet.disconnect(physicsClientId=bullet_client)

In [None]:
#########################       Continuous State Space      #########################
# Trains a DRPolicyContWass on a gym environment with a box-shaped observation
# space (its `high` / `low` bounds and `shape` are read below) and a discrete
# action space (read through `.n`).
env_name = 'MountainCar-v0'
# Keep the client id returned by connect() so the connection opened by this
# cell can be released again; otherwise every re-run of the cell leaks one.
# NOTE(review): 'MountainCar-v0' is a plain gym environment, so this pybullet
# connection looks unnecessary for it - confirm before removing.
bullet_client = pybullet.connect(pybullet.DIRECT)
env = None
try:
    env = gym.make(env_name)
    act_num = env.action_space.n
    upper_bound = env.observation_space.high
    lower_bound = env.observation_space.low
    # presumably the number of bins per state dimension - TODO confirm in dr_policy_cont.py
    discretize_level = 2
    sta_dim = env.observation_space.shape[0]
    policy = DRPolicyContWass(discretize_level, sta_dim, upper_bound, lower_bound, act_num)
    # second argument is presumably a hidden-layer size multiplier - TODO confirm in value.py
    val_func = NNValueFunction(sta_dim, 10)
    # The log name is tagged with the current UTC time.
    logger = Logger(logname=env_name, now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))
    gamma = 0.8       # discount factor passed to add_disc_sum_rew / add_gae
    lam = 1           # third argument of add_gae (presumably the GAE lambda - TODO confirm)
    total_eps = 5000  # stop once at least this many episodes have been collected
    batch_eps = 50    # batch size handed to run_policy for each policy update

    eps = 0
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        policy.update(observes, actions, advantages)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Runs on normal completion, on errors and on a notebook interrupt, so the
    # environment and the physics connection are always released.
    if env is not None:
        env.close()
    # connect() returns a negative id when no connection was established.
    if bullet_client >= 0:
        pybullet.disconnect(physicsClientId=bullet_client)