In [2]:
import gym
import gym_gridworld
from datetime import datetime

# DR TRPO related files
from grid_train_helper import *
from value import NNValueFunction
from utils import Logger
from grid_dr_policy import DRPolicyKL, DRPolicyWass

# ODRPO - KL

## No Human in the Loop

In [6]:
# ODRPO-KL baseline on GridWorld: the policy is trained from the sampled
# advantages only, with no human adjustment.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): the printed "Value Params" line suggests the second argument
# is the hidden-layer width; confirm against value.NNValueFunction.
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount factor used for returns, advantages and state frequencies
lam = 1  # lambda argument handed to add_gae
total_eps = 100  # training stops once at least this many episodes were collected
batch_eps = 1  # episodes requested from run_policy per policy update
max_steps = 50  # max steps per episode
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # compute advantages
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)
        policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when the cell is interrupted or an update raises,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.034
ExplainedVarOld: -0.022
ValFuncLoss: 53


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.54e-08
ExplainedVarOld: -8.11e-07
ValFuncLoss: 38.7


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -3.63e-06
ExplainedVarOld: -3.49e-06
ValFuncLoss: 32.3


***** Episode 4, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00537
ExplainedVarOld: -0.0075
ValFuncLoss: 26.9


***** Episode 5, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00138
ExplainedVarOld: -0.00292
ValFuncLoss: 21.4


***** Episode 6, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -8.95e-09
ExplainedVarOld: -2.77e-08
ValFuncLoss: 17.3


***** Episode 7, Mean Return = -51.0, Mean Discounted Return = -10.0 *

***** Episode 60, Mean Return = 94.0, Mean Discounted Return = 48.5 *****
ExplainedVarNew: -7.13e-13
ExplainedVarOld: -8.54e-13
ValFuncLoss: 1.73e+03


***** Episode 61, Mean Return = 89.0, Mean Discounted Return = 24.5 *****
ExplainedVarNew: -2.5e-10
ExplainedVarOld: -2.7e-10
ValFuncLoss: 1e+03


***** Episode 62, Mean Return = 87.0, Mean Discounted Return = 18.0 *****
ExplainedVarNew: -9.1e-11
ExplainedVarOld: -9.5e-11
ValFuncLoss: 879


***** Episode 63, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -4.86e-13
ExplainedVarOld: -5.88e-13
ValFuncLoss: 1.83e+03


***** Episode 64, Mean Return = 77.0, Mean Discounted Return = -0.3 *****
ExplainedVarNew: -1.1e-13
ExplainedVarOld: -9.79e-14
ValFuncLoss: 874


***** Episode 65, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -4.71e-13
ExplainedVarOld: -5.86e-13
ValFuncLoss: 1.8e+03


***** Episode 66, Mean Return = 94.0, Mean Discounted Return = 48.5 *****
ExplainedVarNew: -4.64e-13
Explai

## Human in the Loop

In [7]:
# ODRPO-KL with a human in the loop: identical to the baseline run except that
# a fixed set of (state, action) advantages is raised before every policy update.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): the printed "Value Params" line suggests the second argument
# is the hidden-layer width; confirm against value.NNValueFunction.
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount factor used for returns, advantages and state frequencies
lam = 1  # lambda argument handed to add_gae
total_eps = 100  # training stops once at least this many episodes were collected
batch_eps = 1  # episodes requested from run_policy per policy update
max_steps = 50  # max steps per episode
logger = Logger(logname=env_name + '_DR-KL-Human_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

# Human feedback: (state, action) entries of all_advantages that receive a
# bonus on every iteration. Kept as a table so the guidance is defined once,
# outside the training loop, and is easy to audit.
human_bonus = 1
human_preferred = [
    # left red room
    (0, 2), (8, 2), (16, 2), (24, 1), (32, 0), (40, 0), (48, 0),
    (1, 2), (9, 2), (17, 2), (25, 1), (33, 0), (41, 0), (49, 0),
    # middle path
    (26, 1),
    # middle blue room
    (3, 2), (11, 2), (19, 2), (27, 1), (35, 0), (43, 0), (51, 0),
    (4, 2), (12, 2), (20, 2), (28, 1), (36, 0), (44, 0), (52, 0),
    # middle path
    (29, 1),
]

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # compute advantages
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)

        # human modifies the advantage
        for state, action in human_preferred:
            all_advantages[state][action] += human_bonus

        policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when the cell is interrupted or an update raises,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.57e-07
ExplainedVarOld: -2.78e-06
ValFuncLoss: 61.7


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0255
ExplainedVarOld: -0.00194
ValFuncLoss: 41.4


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0038
ExplainedVarOld: -0.00523
ValFuncLoss: 27.5


***** Episode 4, Mean Return = 63.0, Mean Discounted Return = -7.8 *****
ExplainedVarNew: -2.91e-11
ExplainedVarOld: -1.37e-13
ValFuncLoss: 1.33e+03


***** Episode 5, Mean Return = 93.0, Mean Discounted Return = 42.6 *****
ExplainedVarNew: -6.75e-14
ExplainedVarOld: -2.2e-11
ValFuncLoss: 4.64e+03


***** Episode 6, Mean Return = 91.0, Mean Discounted Return = 32.6 *****
ExplainedVarNew: -2.22e-16
ExplainedVarOld: -1.78e-15
ValFuncLoss: 3.81e+03


***** Episode 7, Mean Return = 91.0, Mean Discounted Retur

***** Episode 56, Mean Return = 93.0, Mean Discounted Return = 42.6 *****
ExplainedVarNew: -2.22e-16
ExplainedVarOld: -9.77e-15
ValFuncLoss: 1.4e+03


***** Episode 57, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.78e+03


***** Episode 58, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.51e-14
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.75e+03


***** Episode 59, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.89e-14
ExplainedVarOld: -1.51e-14
ValFuncLoss: 1.71e+03


***** Episode 60, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -2.89e-14
ValFuncLoss: 1.68e+03


***** Episode 61, Mean Return = 94.0, Mean Discounted Return = 48.5 *****
ExplainedVarNew: 0
ExplainedVarOld: 0
ValFuncLoss: 1.43e+03


***** Episode 62, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.35e-14
Explaine

# ODRPO - Wasserstein

## No Human in the Loop

In [10]:
# ODRPO-Wasserstein baseline on GridWorld: the policy is trained from the
# sampled advantages only, with no human adjustment.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the printed "Value Params" line suggests the second argument
# is the hidden-layer width; confirm against value.NNValueFunction.
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount factor used for returns, advantages and state frequencies
lam = 1  # lambda argument handed to add_gae
total_eps = 100  # training stops once at least this many episodes were collected
batch_eps = 1  # episodes requested from run_policy per policy update
max_steps = 50  # max steps per episode
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))


eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # compute advantages
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)
        policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when the cell is interrupted or an update raises,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.46e-08
ExplainedVarOld: -4.53e-08
ValFuncLoss: 57


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0327
ExplainedVarOld: -0.018
ValFuncLoss: 40.8


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00999
ExplainedVarOld: -0.0373
ValFuncLoss: 30.9


***** Episode 4, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.8e-10
ExplainedVarOld: -6.8e-09
ValFuncLoss: 19.3


***** Episode 5, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000582
ExplainedVarOld: -0.00133
ValFuncLoss: 15.7


***** Episode 6, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000442
ExplainedVarOld: -0.00107
ValFuncLoss: 11.8


***** Episode 7, Mean Return = -51.0, Mean Discounted Return = -10.0 **

***** Episode 63, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -6.13e-12
ExplainedVarOld: -6.13e-12
ValFuncLoss: 5.27


***** Episode 64, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.09e-11
ExplainedVarOld: -1.07e-11
ValFuncLoss: 5.27


***** Episode 65, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.14e-09
ExplainedVarOld: -1.11e-09
ValFuncLoss: 5.27


***** Episode 66, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.22e-14
ExplainedVarOld: -4.22e-14
ValFuncLoss: 5.27


***** Episode 67, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -7.36e-12
ExplainedVarOld: -7.07e-12
ValFuncLoss: 5.27


***** Episode 68, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.29e-11
ExplainedVarOld: -1.29e-11
ValFuncLoss: 5.27


***** Episode 69, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0001

## Human in the Loop 

In [9]:
# ODRPO-Wasserstein with a human in the loop: identical to the baseline run
# except that a fixed set of (state, action) advantages is raised before every
# policy update.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the printed "Value Params" line suggests the second argument
# is the hidden-layer width; confirm against value.NNValueFunction.
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount factor used for returns, advantages and state frequencies
lam = 1  # lambda argument handed to add_gae
total_eps = 100  # training stops once at least this many episodes were collected
batch_eps = 1  # episodes requested from run_policy per policy update
max_steps = 50  # max steps per episode
# The log name carries a "-Human" tag (mirroring the KL human-in-the-loop run)
# so this experiment's log is distinguishable from the no-human Wasserstein run.
logger = Logger(logname=env_name + '_DR-Wass-Human_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

# Human feedback: (state, action) entries of all_advantages that receive a
# bonus on every iteration. Kept as a table so the guidance is defined once,
# outside the training loop, and is easy to audit.
human_bonus = 1
human_preferred = [
    # left red room
    (0, 2), (8, 2), (16, 2), (24, 1), (32, 0), (40, 0), (48, 0),
    (1, 2), (9, 2), (17, 2), (25, 1), (33, 0), (41, 0), (49, 0),
    # middle path
    (26, 1),
    # middle blue room
    (3, 2), (11, 2), (19, 2), (27, 1), (35, 0), (43, 0), (51, 0),
    (4, 2), (12, 2), (20, 2), (28, 1), (36, 0), (44, 0), (52, 0),
    # middle path
    (29, 1),
]

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # compute advantages
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)

        # human modifies the advantage
        for state, action in human_preferred:
            all_advantages[state][action] += human_bonus

        policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when the cell is interrupted or an update raises,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00135
ExplainedVarOld: -0.00119
ValFuncLoss: 62.9


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.15e-14
ExplainedVarOld: -2.66e-12
ValFuncLoss: 44.2


***** Episode 3, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.22e-16
ExplainedVarOld: -2.22e-16
ValFuncLoss: 6.2e+03


***** Episode 4, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -2.22e-16
ValFuncLoss: 5.73e+03


***** Episode 5, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 0
ExplainedVarOld: 2.22e-16
ValFuncLoss: 5.48e+03


***** Episode 6, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 0
ValFuncLoss: 5.31e+03


***** Episode 7, Mean Return = 95.0, Mean Discounted Return = 55.0 ****

***** Episode 59, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.44e-15
ExplainedVarOld: 2.22e-16
ValFuncLoss: 2.01e+03


***** Episode 60, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.44e-15
ExplainedVarOld: -8.44e-15
ValFuncLoss: 1.97e+03


***** Episode 61, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -8.44e-15
ValFuncLoss: 1.93e+03


***** Episode 62, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.22e-15
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.9e+03


***** Episode 63, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -8.22e-15
ValFuncLoss: 1.86e+03


***** Episode 64, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.83e+03


***** Episode 65, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.4