In [2]:
import gym
import gym_gridworld
from datetime import datetime

# DR TRPO related files
from grid_train_helper import *
from value import NNValueFunction
from utils import Logger
from grid_dr_policy import DRPolicyKL, DRPolicyWass

# ODRPO - KL

## No human 

In [9]:
# Baseline ODRPO-KL experiment on GridWorld: the advantages returned by
# policy.preprocess_adv are passed to policy.update unchanged.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): meaning of the (1, 5) arguments is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount passed to the return, GAE and state-frequency helpers
lam = 1  # lambda argument of add_gae
total_eps = 100  # training stops once this many episodes have been collected
batch_eps = 1  # batch size handed to run_policy for each update
max_steps = 50  # max steps per episode
logger = Logger(
    logname=f"{env_name}_DR-KL_Batch={batch_eps}",
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

eps = 0  # episodes collected so far
while eps < total_eps:
    # Roll out the current policy and count the episodes it produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # Annotate the trajectories: value estimates, discounted returns, advantages.
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma, logger)
    add_gae(trajectories, gamma, lam)

    # Flatten all episodes of the batch into single training arrays.
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # Policy step first, then refit the value function on the same batch.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # Flush this iteration's log entry to file and stdout.
    logger.write(display=True)
logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -7.24e-05
ExplainedVarOld: -0.000102
ValFuncLoss: 52.2


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.01
ExplainedVarOld: -0.0189
ValFuncLoss: 44.2


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00108
ExplainedVarOld: -0.00451
ValFuncLoss: 35.5


***** Episode 4, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -5.33e-09
ExplainedVarOld: -2.21e-08
ValFuncLoss: 29.1


***** Episode 5, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -3.52e-08
ExplainedVarOld: -7.09e-08
ValFuncLoss: 24.1


***** Episode 6, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -7.38e-10
ExplainedVarOld: -1.14e-09
ValFuncLoss: 20.1


***** Episode 7, Mean Return = -51.0, Mean Discounted Return = -1

***** Episode 66, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.03e-05
ExplainedVarOld: -2.37e-05
ValFuncLoss: 5.3


***** Episode 67, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.35e-09
ExplainedVarOld: -1.94e-10
ValFuncLoss: 5.3


***** Episode 68, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.99e-08
ExplainedVarOld: -5.37e-09
ValFuncLoss: 5.28


***** Episode 69, Mean Return = 77.0, Mean Discounted Return = -0.3 *****
ExplainedVarNew: -1.1e-11
ExplainedVarOld: -2.11e-11
ValFuncLoss: 1.68e+03


***** Episode 70, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.22e-08
ExplainedVarOld: -1.16e-07
ValFuncLoss: 150


***** Episode 71, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.441
ExplainedVarOld: -0.236
ValFuncLoss: 47.1


***** Episode 72, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00526
Explai

## Human in the Loop

In [10]:
# ODRPO-KL experiment with a human in the loop: before every policy update a
# fixed set of advantage entries is raised by 1.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): meaning of the (1, 5) arguments is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount passed to the return, GAE and state-frequency helpers
lam = 1  # lambda argument of add_gae
total_eps = 100  # training stops once this many episodes have been collected
batch_eps = 1  # batch size handed to run_policy for each update
max_steps = 50  # max steps per episode
logger = Logger(
    logname=f"{env_name}_DR-KL-Human_Batch={batch_eps}",
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# Entries of all_advantages that the human bumps by +1 on every iteration,
# listed as (first index, second index) in the order they are applied.
# NOTE(review): presumably (state, action) pairs on the grid -- confirm against
# policy.preprocess_adv and the GridWorld layout.
human_bonus_pairs = (
    # left red room
    (0, 2), (8, 2), (16, 2), (24, 1), (32, 0), (40, 0), (48, 0),
    (1, 2), (9, 2), (17, 2), (25, 1), (33, 0), (41, 0), (49, 0),
    # middle path
    (26, 1),
    # middle blue room
    (3, 2), (11, 2), (19, 2), (27, 1), (35, 0), (43, 0), (51, 0),
    (4, 2), (12, 2), (20, 2), (28, 1), (36, 0), (44, 0), (52, 0),
    # middle path
    (29, 1),
)

eps = 0  # episodes collected so far
while eps < total_eps:
    # Roll out the current policy and count the episodes it produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # Annotate the trajectories: value estimates, discounted returns, advantages.
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma, logger)
    add_gae(trajectories, gamma, lam)

    # Flatten all episodes of the batch into single training arrays.
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # human modifies the advantage
    for _row, _col in human_bonus_pairs:
        all_advantages[_row][_col] += 1

    # Policy step on the adjusted advantages, then refit the value function.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # Flush this iteration's log entry to file and stdout.
    logger.write(display=True)
logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00214
ExplainedVarOld: -0.00424
ValFuncLoss: 49.2


***** Episode 2, Mean Return = 67.0, Mean Discounted Return = -6.6 *****
ExplainedVarNew: -2.38e-08
ExplainedVarOld: -1.38e-09
ValFuncLoss: 1.39e+03


***** Episode 3, Mean Return = 94.0, Mean Discounted Return = 48.5 *****
ExplainedVarNew: -4.58e-13
ExplainedVarOld: -1.96e-08
ValFuncLoss: 4.93e+03


***** Episode 4, Mean Return = 93.0, Mean Discounted Return = 42.6 *****
ExplainedVarNew: -1.61e-11
ExplainedVarOld: -6.01e-11
ValFuncLoss: 4.35e+03


***** Episode 5, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.84e-14
ExplainedVarOld: -1.25e-13
ValFuncLoss: 5.05e+03


***** Episode 6, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.73e-14
ExplainedVarOld: -2.84e-14
ValFuncLoss: 4.93e+03


***** Episode 7, Mean Return = 92.0, Mean Discoun

***** Episode 65, Mean Return = 93.0, Mean Discounted Return = 42.6 *****
ExplainedVarNew: -7.77e-15
ExplainedVarOld: 0
ValFuncLoss: 1.18e+03


***** Episode 66, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.33e-14
ExplainedVarOld: -1.33e-14
ValFuncLoss: 1.51e+03


***** Episode 67, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -1.33e-14
ValFuncLoss: 1.48e+03


***** Episode 68, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.33e-14
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.45e+03


***** Episode 69, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.35e-14
ExplainedVarOld: -1.33e-14
ValFuncLoss: 1.42e+03


***** Episode 70, Mean Return = 90.0, Mean Discounted Return = 28.4 *****
ExplainedVarNew: -7.55e-15
ExplainedVarOld: -1.24e-14
ValFuncLoss: 794


***** Episode 71, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.53e-14
Ex

# ODRPO - Wasserstein

## No human 

In [12]:
# Baseline ODRPO-Wasserstein experiment on GridWorld: the advantages returned
# by policy.preprocess_adv are passed to policy.update unchanged.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): meaning of the (1, 5) arguments is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount passed to the return, GAE and state-frequency helpers
lam = 1  # lambda argument of add_gae
total_eps = 100  # training stops once this many episodes have been collected
batch_eps = 1  # batch size handed to run_policy for each update
max_steps = 50  # max steps per episode
logger = Logger(
    logname=f"{env_name}_DR-Wass_Batch={batch_eps}",
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

eps = 0  # episodes collected so far
while eps < total_eps:
    # Roll out the current policy and count the episodes it produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # Annotate the trajectories: value estimates, discounted returns, advantages.
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma, logger)
    add_gae(trajectories, gamma, lam)

    # Flatten all episodes of the batch into single training arrays.
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # Policy step first, then refit the value function on the same batch.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # Flush this iteration's log entry to file and stdout.
    logger.write(display=True)
logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00558
ExplainedVarOld: -0.00245
ValFuncLoss: 58.7


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -2.4e-13
ExplainedVarOld: -1.22e-14
ValFuncLoss: 46.2


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -5.23e-05
ExplainedVarOld: -0.000493
ValFuncLoss: 37.8


***** Episode 4, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.44e-16
ExplainedVarOld: -4.44e-16
ValFuncLoss: 30.3


***** Episode 5, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.21e-11
ExplainedVarOld: -2.84e-10
ValFuncLoss: 24.8


***** Episode 6, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -9.71e-10
ExplainedVarOld: -6.61e-10
ValFuncLoss: 20.5


***** Episode 7, Mean Return = -51.0, Mean Discounted Return

***** Episode 62, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -3.33e-15
ExplainedVarOld: 2.22e-16
ValFuncLoss: 5.27


***** Episode 63, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.72e-13
ExplainedVarOld: -1.72e-13
ValFuncLoss: 5.27


***** Episode 64, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.31e-14
ExplainedVarOld: -4.31e-14
ValFuncLoss: 5.27


***** Episode 65, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.72e-13
ExplainedVarOld: -1.82e-13
ValFuncLoss: 5.27


***** Episode 66, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 2.22e-16
ValFuncLoss: 5.27


***** Episode 67, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.72e-13
ExplainedVarOld: -4.31e-14
ValFuncLoss: 5.27


***** Episode 68, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: 2.22e-16
E

## Human in the Loop 

In [13]:
# ODRPO-Wasserstein experiment with a human in the loop: before every policy
# update a fixed set of advantage entries is raised by 1.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): meaning of the (1, 5) arguments is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount passed to the return, GAE and state-frequency helpers
lam = 1  # lambda argument of add_gae
total_eps = 100  # training stops once this many episodes have been collected
batch_eps = 1  # batch size handed to run_policy for each update
max_steps = 50  # max steps per episode
# The log name carries a "-Human" tag (mirroring the KL human-in-the-loop run)
# so this run is not filed under the same name as the no-human Wasserstein run.
logger = Logger(
    logname=env_name + '_DR-Wass-Human_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# Entries of all_advantages that the human bumps by +1 on every iteration,
# listed as (first index, second index) in the order they are applied.
# NOTE(review): presumably (state, action) pairs on the grid -- confirm against
# policy.preprocess_adv and the GridWorld layout.
human_bonus_pairs = (
    # left red room
    (0, 2), (8, 2), (16, 2), (24, 1), (32, 0), (40, 0), (48, 0),
    (1, 2), (9, 2), (17, 2), (25, 1), (33, 0), (41, 0), (49, 0),
    # middle path
    (26, 1),
    # middle blue room
    (3, 2), (11, 2), (19, 2), (27, 1), (35, 0), (43, 0), (51, 0),
    (4, 2), (12, 2), (20, 2), (28, 1), (36, 0), (44, 0), (52, 0),
    # middle path
    (29, 1),
)

eps = 0  # episodes collected so far
while eps < total_eps:
    # Roll out the current policy and count the episodes it produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # add estimated values to episodes
    add_value(trajectories, val_func)
    # calculate discounted sum of rewards
    add_disc_sum_rew(trajectories, gamma, logger)
    # calculate advantage
    add_gae(trajectories, gamma, lam)

    # concatenate all episodes into single arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # human modifies the advantage
    for _row, _col in human_bonus_pairs:
        all_advantages[_row][_col] += 1

    # Policy step on the adjusted advantages, then refit the value function.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # write logger results to file and stdout
    logger.write(display=True)
logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -3.64e-09
ExplainedVarOld: -2.84e-13
ValFuncLoss: 59.6


***** Episode 2, Mean Return = 76.0, Mean Discounted Return = -1.2 *****
ExplainedVarNew: -3.21e-12
ExplainedVarOld: -3.84e-10
ValFuncLoss: 1.81e+03


***** Episode 3, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -6.15e-10
ExplainedVarOld: -2e-15
ValFuncLoss: 5.3e+03


***** Episode 4, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.37e-12
ExplainedVarOld: -6.15e-10
ValFuncLoss: 5.15e+03


***** Episode 5, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.19e-12
ExplainedVarOld: -2.37e-12
ValFuncLoss: 5.02e+03


***** Episode 6, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -6.31e-13
ExplainedVarOld: -1.19e-12
ValFuncLoss: 4.9e+03


***** Episode 7, Mean Return = 95.0, Mean Discounted

***** Episode 58, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.22e-15
ExplainedVarOld: -1.35e-14
ValFuncLoss: 1.94e+03


***** Episode 59, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.89e-14
ExplainedVarOld: -8.22e-15
ValFuncLoss: 1.9e+03


***** Episode 60, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.33e-14
ExplainedVarOld: -2.89e-14
ValFuncLoss: 1.87e+03


***** Episode 61, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.33e-14
ExplainedVarOld: -1.33e-14
ValFuncLoss: 1.84e+03


***** Episode 62, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.22e-15
ExplainedVarOld: -1.33e-14
ValFuncLoss: 1.8e+03


***** Episode 63, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.35e-14
ExplainedVarOld: -8.22e-15
ValFuncLoss: 1.77e+03


***** Episode 64, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: