In [2]:
import gym
import gym_gridworld
from datetime import datetime

# DR TRPO related files
from grid_train_helper import *
from value import NNValueFunction
from utils import Logger
from grid_dr_policy import DRPolicyKL, DRPolicyWass

# Move to Yellow Room

## ODRPO - KL No human 

In [6]:
# Experiment: GridWorld with the KL-based DR policy, no human feedback.
# Every name below is (re)bound at notebook scope, so this cell is self-contained.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num, act_num = env.observation_space.n, env.action_space.n

policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): the cell output reports hidden sizes h1=h2=h3=5 for these
# arguments; the exact meaning of (1, 5) is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 5)

gamma, lam = 0.9, 1            # discount factor / GAE lambda handed to the helpers
total_eps, batch_eps = 100, 1  # stop after 100 episodes, collecting 1 per batch
max_steps = 50                 # max steps per episode

run_stamp = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
logger = Logger(logname=env_name + '_DR-KL_Batch=' + str(batch_eps), now=run_stamp)

eps = 0
while eps < total_eps:
    # Roll out one batch and count how many episodes it actually produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # Annotate the trajectories: value estimates, discounted reward sums,
    # and then the advantages.
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma, logger)
    add_gae(trajectories, gamma, lam)

    # Merge all episodes of the batch into single training arrays.
    train_set = build_train_set(trajectories)
    observes, actions, advantages, disc_sum_rew = train_set
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)

    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # Policy step first, then refit the value function on the same batch.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # Flush this iteration's log entries to file and stdout.
    logger.write(display=True)
logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.034
ExplainedVarOld: -0.022
ValFuncLoss: 53


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.54e-08
ExplainedVarOld: -8.11e-07
ValFuncLoss: 38.7


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -3.63e-06
ExplainedVarOld: -3.49e-06
ValFuncLoss: 32.3


***** Episode 4, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00537
ExplainedVarOld: -0.0075
ValFuncLoss: 26.9


***** Episode 5, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00138
ExplainedVarOld: -0.00292
ValFuncLoss: 21.4


***** Episode 6, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -8.95e-09
ExplainedVarOld: -2.77e-08
ValFuncLoss: 17.3


***** Episode 7, Mean Return = -51.0, Mean Discounted Return = -10.0 *

***** Episode 60, Mean Return = 94.0, Mean Discounted Return = 48.5 *****
ExplainedVarNew: -7.13e-13
ExplainedVarOld: -8.54e-13
ValFuncLoss: 1.73e+03


***** Episode 61, Mean Return = 89.0, Mean Discounted Return = 24.5 *****
ExplainedVarNew: -2.5e-10
ExplainedVarOld: -2.7e-10
ValFuncLoss: 1e+03


***** Episode 62, Mean Return = 87.0, Mean Discounted Return = 18.0 *****
ExplainedVarNew: -9.1e-11
ExplainedVarOld: -9.5e-11
ValFuncLoss: 879


***** Episode 63, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -4.86e-13
ExplainedVarOld: -5.88e-13
ValFuncLoss: 1.83e+03


***** Episode 64, Mean Return = 77.0, Mean Discounted Return = -0.3 *****
ExplainedVarNew: -1.1e-13
ExplainedVarOld: -9.79e-14
ValFuncLoss: 874


***** Episode 65, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -4.71e-13
ExplainedVarOld: -5.86e-13
ValFuncLoss: 1.8e+03


***** Episode 66, Mean Return = 94.0, Mean Discounted Return = 48.5 *****
ExplainedVarNew: -4.64e-13
Explai

## ODRPO - KL Human in the Loop

In [7]:
# Experiment: GridWorld with the KL-based DR policy and human-in-the-loop
# advantage shaping. Every name below is (re)bound at notebook scope.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num, act_num = env.observation_space.n, env.action_space.n

policy = DRPolicyKL(sta_num, act_num)
# NOTE(review): the cell output reports hidden sizes h1=h2=h3=5 for these
# arguments; the exact meaning of (1, 5) is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 5)

gamma, lam = 0.9, 1            # discount factor / GAE lambda handed to the helpers
total_eps, batch_eps = 100, 1  # stop after 100 episodes, collecting 1 per batch
max_steps = 50                 # max steps per episode

run_stamp = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
logger = Logger(logname=env_name + '_DR-KL-Human_Batch=' + str(batch_eps), now=run_stamp)

# Human guidance as (state, action) pairs whose advantage gets +1 each iteration.
# NOTE(review): the stride of 8 between states suggests an 8-wide grid, and the
# action ids (0/1/2) are whatever gym_gridworld defines -- confirm both there.
human_bonus_pairs = [
    # left red room
    (0, 2), (8, 2), (16, 2), (24, 1), (32, 0), (40, 0), (48, 0),
    (1, 2), (9, 2), (17, 2), (25, 1), (33, 0), (41, 0), (49, 0),
    # middle path
    (26, 1),
    # middle blue room
    (3, 2), (11, 2), (19, 2), (27, 1), (35, 0), (43, 0), (51, 0),
    (4, 2), (12, 2), (20, 2), (28, 1), (36, 0), (44, 0), (52, 0),
    # middle path
    (29, 1),
]

eps = 0
while eps < total_eps:
    # Roll out one batch and count how many episodes it actually produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # Annotate the trajectories: value estimates, discounted reward sums,
    # and then the advantages.
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma, logger)
    add_gae(trajectories, gamma, lam)

    # Merge all episodes of the batch into single training arrays.
    train_set = build_train_set(trajectories)
    observes, actions, advantages, disc_sum_rew = train_set
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)

    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # The human modifies the advantage table before the policy sees it.
    for state, action in human_bonus_pairs:
        all_advantages[state][action] += 1

    # Policy step first, then refit the value function on the same batch.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # Flush this iteration's log entries to file and stdout.
    logger.write(display=True)
logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.57e-07
ExplainedVarOld: -2.78e-06
ValFuncLoss: 61.7


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0255
ExplainedVarOld: -0.00194
ValFuncLoss: 41.4


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0038
ExplainedVarOld: -0.00523
ValFuncLoss: 27.5


***** Episode 4, Mean Return = 63.0, Mean Discounted Return = -7.8 *****
ExplainedVarNew: -2.91e-11
ExplainedVarOld: -1.37e-13
ValFuncLoss: 1.33e+03


***** Episode 5, Mean Return = 93.0, Mean Discounted Return = 42.6 *****
ExplainedVarNew: -6.75e-14
ExplainedVarOld: -2.2e-11
ValFuncLoss: 4.64e+03


***** Episode 6, Mean Return = 91.0, Mean Discounted Return = 32.6 *****
ExplainedVarNew: -2.22e-16
ExplainedVarOld: -1.78e-15
ValFuncLoss: 3.81e+03


***** Episode 7, Mean Return = 91.0, Mean Discounted Retur

***** Episode 56, Mean Return = 93.0, Mean Discounted Return = 42.6 *****
ExplainedVarNew: -2.22e-16
ExplainedVarOld: -9.77e-15
ValFuncLoss: 1.4e+03


***** Episode 57, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.78e+03


***** Episode 58, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.51e-14
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.75e+03


***** Episode 59, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.89e-14
ExplainedVarOld: -1.51e-14
ValFuncLoss: 1.71e+03


***** Episode 60, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -2.89e-14
ValFuncLoss: 1.68e+03


***** Episode 61, Mean Return = 94.0, Mean Discounted Return = 48.5 *****
ExplainedVarNew: 0
ExplainedVarOld: 0
ValFuncLoss: 1.43e+03


***** Episode 62, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.35e-14
Explaine

## ODRPO - Wasserstein No human 

In [10]:
# Experiment: GridWorld with the Wasserstein-based DR policy, no human feedback.
# Every name below is (re)bound at notebook scope, so this cell is self-contained.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num, act_num = env.observation_space.n, env.action_space.n

policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the cell output reports hidden sizes h1=h2=h3=5 for these
# arguments; the exact meaning of (1, 5) is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 5)

gamma, lam = 0.9, 1            # discount factor / GAE lambda handed to the helpers
total_eps, batch_eps = 100, 1  # stop after 100 episodes, collecting 1 per batch
max_steps = 50                 # max steps per episode

run_stamp = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=run_stamp)

eps = 0
while eps < total_eps:
    # Roll out one batch and count how many episodes it actually produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # Annotate the trajectories: value estimates, discounted reward sums,
    # and then the advantages.
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma, logger)
    add_gae(trajectories, gamma, lam)

    # Merge all episodes of the batch into single training arrays.
    train_set = build_train_set(trajectories)
    observes, actions, advantages, disc_sum_rew = train_set

    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # Policy step first, then refit the value function on the same batch.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # Flush this iteration's log entries to file and stdout.
    logger.write(display=True)
logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.46e-08
ExplainedVarOld: -4.53e-08
ValFuncLoss: 57


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0327
ExplainedVarOld: -0.018
ValFuncLoss: 40.8


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00999
ExplainedVarOld: -0.0373
ValFuncLoss: 30.9


***** Episode 4, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.8e-10
ExplainedVarOld: -6.8e-09
ValFuncLoss: 19.3


***** Episode 5, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000582
ExplainedVarOld: -0.00133
ValFuncLoss: 15.7


***** Episode 6, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000442
ExplainedVarOld: -0.00107
ValFuncLoss: 11.8


***** Episode 7, Mean Return = -51.0, Mean Discounted Return = -10.0 **

***** Episode 63, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -6.13e-12
ExplainedVarOld: -6.13e-12
ValFuncLoss: 5.27


***** Episode 64, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.09e-11
ExplainedVarOld: -1.07e-11
ValFuncLoss: 5.27


***** Episode 65, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.14e-09
ExplainedVarOld: -1.11e-09
ValFuncLoss: 5.27


***** Episode 66, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.22e-14
ExplainedVarOld: -4.22e-14
ValFuncLoss: 5.27


***** Episode 67, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -7.36e-12
ExplainedVarOld: -7.07e-12
ValFuncLoss: 5.27


***** Episode 68, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.29e-11
ExplainedVarOld: -1.29e-11
ValFuncLoss: 5.27


***** Episode 69, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0001

## ODRPO - Wasserstein Human in the Loop

In [9]:
# Experiment: GridWorld with the Wasserstein-based DR policy and human-in-the-loop
# advantage shaping. Every name below is (re)bound at notebook scope.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the cell output reports hidden sizes h1=h2=h3=5 for these
# arguments; the exact meaning of (1, 5) is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 5)
gamma = 0.9       # discount factor handed to the return/advantage helpers
lam = 1           # lambda handed to add_gae
total_eps = 100   # stop once this many episodes have been collected
batch_eps = 1     # episodes requested per policy update
max_steps = 50    # max steps per episode

# The log name carries a "-Human" tag so this run is not filed under the same
# name as the no-human Wasserstein run ('_DR-Wass_Batch='); this matches the
# naming used by the other human-in-the-loop cells in this notebook.
logger = Logger(logname=env_name + '_DR-Wass-Human_Batch=' + str(batch_eps), now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"))

# Human guidance as (state, action) pairs whose advantage gets +1 each iteration.
# NOTE(review): the stride of 8 between states suggests an 8-wide grid, and the
# action ids (0/1/2) are whatever gym_gridworld defines -- confirm both there.
human_bonus_pairs = [
    # left red room
    (0, 2), (8, 2), (16, 2), (24, 1), (32, 0), (40, 0), (48, 0),
    (1, 2), (9, 2), (17, 2), (25, 1), (33, 0), (41, 0), (49, 0),
    # middle path
    (26, 1),
    # middle blue room
    (3, 2), (11, 2), (19, 2), (27, 1), (35, 0), (43, 0), (51, 0),
    (4, 2), (12, 2), (20, 2), (28, 1), (36, 0), (44, 0), (52, 0),
    # middle path
    (29, 1),
]

eps = 0
while eps < total_eps:
    # Roll out one batch and count how many episodes it actually produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)
    # add estimated values to episodes
    add_value(trajectories, val_func)
    # calculate discounted sum of rewards
    add_disc_sum_rew(trajectories, gamma, logger)
    # calculate advantage
    add_gae(trajectories, gamma, lam)
    # concatenate all episodes into single arrays
    observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # The human modifies the advantage table before the policy sees it.
    for state, action in human_bonus_pairs:
        all_advantages[state][action] += 1

    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)
    # write logger results to file and stdout
    logger.write(display=True)
logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00135
ExplainedVarOld: -0.00119
ValFuncLoss: 62.9


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.15e-14
ExplainedVarOld: -2.66e-12
ValFuncLoss: 44.2


***** Episode 3, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.22e-16
ExplainedVarOld: -2.22e-16
ValFuncLoss: 6.2e+03


***** Episode 4, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -2.22e-16
ValFuncLoss: 5.73e+03


***** Episode 5, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 0
ExplainedVarOld: 2.22e-16
ValFuncLoss: 5.48e+03


***** Episode 6, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 0
ValFuncLoss: 5.31e+03


***** Episode 7, Mean Return = 95.0, Mean Discounted Return = 55.0 ****

***** Episode 59, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.44e-15
ExplainedVarOld: 2.22e-16
ValFuncLoss: 2.01e+03


***** Episode 60, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.44e-15
ExplainedVarOld: -8.44e-15
ValFuncLoss: 1.97e+03


***** Episode 61, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -8.44e-15
ValFuncLoss: 1.93e+03


***** Episode 62, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.22e-15
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.9e+03


***** Episode 63, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: -8.22e-15
ValFuncLoss: 1.86e+03


***** Episode 64, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.83e+03


***** Episode 65, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.4

# Chain

## ODRPO - Wasserstein No human

In [28]:
# Experiment: NChain with the Wasserstein-based DR policy, no human feedback.
# Every name below is (re)bound at notebook scope, so this cell is self-contained.
env_name = 'NChain-v0'
env = gym.make(env_name)
sta_num, act_num = env.observation_space.n, env.action_space.n

policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the cell output reports hidden sizes h1=10, h2=7, h3=5 for these
# arguments; the exact meaning of (1, 10) is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 10)

gamma, lam = 0.9, 1            # discount factor / GAE lambda handed to the helpers
total_eps, batch_eps = 200, 1  # stop after 200 episodes, collecting 1 per batch
max_steps = 1000               # step cap passed to run_policy

run_stamp = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
logger = Logger(logname=env_name + '_DR-Wass_Batch=' + str(batch_eps), now=run_stamp)

eps = 0
while eps < total_eps:
    # Roll out one batch and count how many episodes it actually produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # Annotate the trajectories: value estimates, discounted reward sums,
    # and then the advantages.
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma, logger)
    add_gae(trajectories, gamma, lam)

    # Merge all episodes of the batch into single training arrays.
    train_set = build_train_set(trajectories)
    observes, actions, advantages, disc_sum_rew = train_set

    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # Policy step first, then refit the value function on the same batch.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # Flush this iteration's log entries to file and stdout.
    logger.write(display=True)
logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 1, Mean Return = 1454.0, Mean Discounted Return = 14.6 *****
ExplainedVarNew: -0.00191
ExplainedVarOld: -0.00343
ValFuncLoss: 195


***** Episode 2, Mean Return = 2668.0, Mean Discounted Return = 21.3 *****
ExplainedVarNew: -1.59e-05
ExplainedVarOld: -0.000406
ValFuncLoss: 710


***** Episode 3, Mean Return = 2850.0, Mean Discounted Return = 12.4 *****
ExplainedVarNew: -1.57e-06
ExplainedVarOld: -1.52e-05
ValFuncLoss: 698


***** Episode 4, Mean Return = 3110.0, Mean Discounted Return = 27.0 *****
ExplainedVarNew: -2.83e-07
ExplainedVarOld: -9.9e-07
ValFuncLoss: 866


***** Episode 5, Mean Return = 3154.0, Mean Discounted Return = 23.4 *****
ExplainedVarNew: -1.59e-07
ExplainedVarOld: -3.54e-07
ValFuncLoss: 728


***** Episode 6, Mean Return = 3180.0, Mean Discounted Return = 30.0 *****
ExplainedVarNew: -8.25e-08
ExplainedVarOld: -1.51e-07
ValFuncLoss: 679


***** Episode 7, Mean Return = 2890.0, Mean Discounted Return = 7

***** Episode 57, Mean Return = 3132.0, Mean Discounted Return = 16.8 *****
ExplainedVarNew: -0.203
ExplainedVarOld: -0.199
ValFuncLoss: 475


***** Episode 58, Mean Return = 3266.0, Mean Discounted Return = 25.0 *****
ExplainedVarNew: -0.259
ExplainedVarOld: -0.23
ValFuncLoss: 447


***** Episode 59, Mean Return = 2964.0, Mean Discounted Return = 17.4 *****
ExplainedVarNew: -0.259
ExplainedVarOld: -0.246
ValFuncLoss: 447


***** Episode 60, Mean Return = 3424.0, Mean Discounted Return = 37.6 *****
ExplainedVarNew: -0.224
ExplainedVarOld: -0.276
ValFuncLoss: 438


***** Episode 61, Mean Return = 2506.0, Mean Discounted Return = 22.6 *****
ExplainedVarNew: -0.416
ExplainedVarOld: -0.365
ValFuncLoss: 277


***** Episode 62, Mean Return = 3154.0, Mean Discounted Return = 23.6 *****
ExplainedVarNew: -0.255
ExplainedVarOld: -0.236
ValFuncLoss: 472


***** Episode 63, Mean Return = 2584.0, Mean Discounted Return = 24.8 *****
ExplainedVarNew: -0.468
ExplainedVarOld: -0.415
ValFuncLoss: 305




***** Episode 115, Mean Return = 3174.0, Mean Discounted Return = 57.8 *****
ExplainedVarNew: -0.366
ExplainedVarOld: -0.387
ValFuncLoss: 350


***** Episode 116, Mean Return = 3530.0, Mean Discounted Return = 14.1 *****
ExplainedVarNew: -0.211
ExplainedVarOld: -0.265
ValFuncLoss: 442


***** Episode 117, Mean Return = 3070.0, Mean Discounted Return = 27.0 *****
ExplainedVarNew: -0.233
ExplainedVarOld: -0.201
ValFuncLoss: 450


***** Episode 118, Mean Return = 2986.0, Mean Discounted Return = 43.3 *****
ExplainedVarNew: -0.367
ExplainedVarOld: -0.265
ValFuncLoss: 433


***** Episode 119, Mean Return = 3126.0, Mean Discounted Return = 11.7 *****
ExplainedVarNew: -0.424
ExplainedVarOld: -0.465
ValFuncLoss: 362


***** Episode 120, Mean Return = 2800.0, Mean Discounted Return = 20.7 *****
ExplainedVarNew: -0.328
ExplainedVarOld: -0.435
ValFuncLoss: 316


***** Episode 121, Mean Return = 2746.0, Mean Discounted Return = 15.9 *****
ExplainedVarNew: -0.333
ExplainedVarOld: -0.305
ValFuncLoss

***** Episode 173, Mean Return = 2796.0, Mean Discounted Return = 13.3 *****
ExplainedVarNew: -0.415
ExplainedVarOld: -0.352
ValFuncLoss: 329


***** Episode 174, Mean Return = 2754.0, Mean Discounted Return = 13.6 *****
ExplainedVarNew: -0.243
ExplainedVarOld: -0.347
ValFuncLoss: 341


***** Episode 175, Mean Return = 2616.0, Mean Discounted Return = 13.3 *****
ExplainedVarNew: -0.27
ExplainedVarOld: -0.248
ValFuncLoss: 319


***** Episode 176, Mean Return = 2312.0, Mean Discounted Return = 11.9 *****
ExplainedVarNew: -0.306
ExplainedVarOld: -0.351
ValFuncLoss: 252


***** Episode 177, Mean Return = 3144.0, Mean Discounted Return = 22.4 *****
ExplainedVarNew: -0.211
ExplainedVarOld: -0.202
ValFuncLoss: 419


***** Episode 178, Mean Return = 3318.0, Mean Discounted Return = 13.7 *****
ExplainedVarNew: -0.253
ExplainedVarOld: -0.19
ValFuncLoss: 481


***** Episode 179, Mean Return = 2876.0, Mean Discounted Return = 12.5 *****
ExplainedVarNew: -0.333
ExplainedVarOld: -0.301
ValFuncLoss: 

## ODRPO - Wasserstein Human in the Loop

In [29]:
# Experiment: NChain with the Wasserstein-based DR policy and human-in-the-loop
# advantage shaping. Every name below is (re)bound at notebook scope.
env_name = 'NChain-v0'
env = gym.make(env_name)
sta_num, act_num = env.observation_space.n, env.action_space.n

policy = DRPolicyWass(sta_num, act_num)
# NOTE(review): the cell output reports hidden sizes h1=10, h2=7, h3=5 for these
# arguments; the exact meaning of (1, 10) is defined in value.py -- confirm there.
val_func = NNValueFunction(1, 10)

gamma, lam = 0.9, 1            # discount factor / GAE lambda handed to the helpers
total_eps, batch_eps = 200, 1  # stop after 200 episodes, collecting 1 per batch
max_steps = 1000               # step cap passed to run_policy

run_stamp = datetime.utcnow().strftime("%b-%d_%H:%M:%S")
logger = Logger(logname=env_name + '_DR-Wass-Human_Batch=' + str(batch_eps), now=run_stamp)

eps = 0
while eps < total_eps:
    # Roll out one batch and count how many episodes it actually produced.
    trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
    eps += len(trajectories)

    # Annotate the trajectories: value estimates, discounted reward sums,
    # and then the advantages.
    add_value(trajectories, val_func)
    add_disc_sum_rew(trajectories, gamma, logger)
    add_gae(trajectories, gamma, lam)

    # Merge all episodes of the batch into single training arrays.
    train_set = build_train_set(trajectories)
    observes, actions, advantages, disc_sum_rew = train_set

    disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
    log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
    all_advantages = policy.preprocess_adv(observes, actions, advantages)

    # The human modifies the advantage: +0.1 on action 0 in states 0 through 4.
    for state in range(5):
        all_advantages[state][0] += 0.1

    # Policy step first, then refit the value function on the same batch.
    policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
    val_func.fit(observes, disc_sum_rew, logger)

    # Flush this iteration's log entries to file and stdout.
    logger.write(display=True)
logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 1, Mean Return = 1432.0, Mean Discounted Return = 11.5 *****
ExplainedVarNew: -0.00224
ExplainedVarOld: -0.00209
ValFuncLoss: 189


***** Episode 2, Mean Return = 3004.0, Mean Discounted Return = 46.4 *****
ExplainedVarNew: -5.79e-06
ExplainedVarOld: -0.00025
ValFuncLoss: 978


***** Episode 3, Mean Return = 2626.0, Mean Discounted Return = 10.4 *****
ExplainedVarNew: -7.18e-07
ExplainedVarOld: -9.59e-06
ValFuncLoss: 581


***** Episode 4, Mean Return = 3256.0, Mean Discounted Return = 45.8 *****
ExplainedVarNew: -1.32e-07
ExplainedVarOld: -4.52e-07
ValFuncLoss: 869


***** Episode 5, Mean Return = 3484.0, Mean Discounted Return = 16.3 *****
ExplainedVarNew: -4.85e-08
ExplainedVarOld: -1.21e-07
ValFuncLoss: 945


***** Episode 6, Mean Return = 3110.0, Mean Discounted Return = 14.8 *****
ExplainedVarNew: -3.02e-08
ExplainedVarOld: -6.02e-08
ValFuncLoss: 647


***** Episode 7, Mean Return = 2760.0, Mean Discounted Return = 1

***** Episode 57, Mean Return = 3506.0, Mean Discounted Return = 33.0 *****
ExplainedVarNew: -0.204
ExplainedVarOld: -0.204
ValFuncLoss: 348


***** Episode 58, Mean Return = 3464.0, Mean Discounted Return = 15.5 *****
ExplainedVarNew: -0.141
ExplainedVarOld: -0.173
ValFuncLoss: 391


***** Episode 59, Mean Return = 3450.0, Mean Discounted Return = 13.4 *****
ExplainedVarNew: -0.173
ExplainedVarOld: -0.164
ValFuncLoss: 344


***** Episode 60, Mean Return = 3586.0, Mean Discounted Return = 24.6 *****
ExplainedVarNew: -0.162
ExplainedVarOld: -0.112
ValFuncLoss: 539


***** Episode 61, Mean Return = 3398.0, Mean Discounted Return = 13.0 *****
ExplainedVarNew: -0.165
ExplainedVarOld: -0.202
ValFuncLoss: 425


***** Episode 62, Mean Return = 3124.0, Mean Discounted Return = 36.9 *****
ExplainedVarNew: -0.222
ExplainedVarOld: -0.206
ValFuncLoss: 346


***** Episode 63, Mean Return = 3394.0, Mean Discounted Return = 33.8 *****
ExplainedVarNew: -0.251
ExplainedVarOld: -0.208
ValFuncLoss: 400



***** Episode 115, Mean Return = 3122.0, Mean Discounted Return = 14.9 *****
ExplainedVarNew: -0.309
ExplainedVarOld: -0.284
ValFuncLoss: 401


***** Episode 116, Mean Return = 3910.0, Mean Discounted Return = 25.9 *****
ExplainedVarNew: -0.193
ExplainedVarOld: -0.194
ValFuncLoss: 587


***** Episode 117, Mean Return = 3118.0, Mean Discounted Return = 15.7 *****
ExplainedVarNew: -0.441
ExplainedVarOld: -0.368
ValFuncLoss: 357


***** Episode 118, Mean Return = 3386.0, Mean Discounted Return = 27.6 *****
ExplainedVarNew: -0.296
ExplainedVarOld: -0.295
ValFuncLoss: 478


***** Episode 119, Mean Return = 3432.0, Mean Discounted Return = 27.5 *****
ExplainedVarNew: -0.232
ExplainedVarOld: -0.275
ValFuncLoss: 489


***** Episode 120, Mean Return = 3736.0, Mean Discounted Return = 33.4 *****
ExplainedVarNew: -0.227
ExplainedVarOld: -0.235
ValFuncLoss: 500


***** Episode 121, Mean Return = 3458.0, Mean Discounted Return = 14.5 *****
ExplainedVarNew: -0.306
ExplainedVarOld: -0.28
ValFuncLoss:

***** Episode 173, Mean Return = 3408.0, Mean Discounted Return = 13.1 *****
ExplainedVarNew: -0.27
ExplainedVarOld: -0.342
ValFuncLoss: 435


***** Episode 174, Mean Return = 4028.0, Mean Discounted Return = 46.7 *****
ExplainedVarNew: -0.242
ExplainedVarOld: -0.24
ValFuncLoss: 487


***** Episode 175, Mean Return = 3216.0, Mean Discounted Return = 19.5 *****
ExplainedVarNew: -0.267
ExplainedVarOld: -0.301
ValFuncLoss: 392


***** Episode 176, Mean Return = 3368.0, Mean Discounted Return = 31.8 *****
ExplainedVarNew: -0.338
ExplainedVarOld: -0.302
ValFuncLoss: 366


***** Episode 177, Mean Return = 3422.0, Mean Discounted Return = 56.5 *****
ExplainedVarNew: -0.243
ExplainedVarOld: -0.229
ValFuncLoss: 499


***** Episode 178, Mean Return = 3626.0, Mean Discounted Return = 14.4 *****
ExplainedVarNew: -0.228
ExplainedVarOld: -0.215
ValFuncLoss: 559


***** Episode 179, Mean Return = 3574.0, Mean Discounted Return = 49.6 *****
ExplainedVarNew: -0.221
ExplainedVarOld: -0.205
ValFuncLoss: 