In [2]:
import gym
import gym_gridworld
from datetime import datetime

# DR TRPO related files
from grid_train_helper import *
from value import NNValueFunction
from utils import Logger
from grid_dr_policy import DRPolicyKL, DRPolicyWass

# 1. Move to Yellow Room

## ODRPO KL (batch)

In [4]:
# Baseline run: train the KL-based DR policy on GridWorld-v0 with no human
# feedback. Each loop iteration collects a batch of episodes, estimates
# advantages, then updates the policy and the value function.

# --- Experiment configuration ---
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount factor
lam = 1  # lambda handed to add_gae
total_eps = 100  # stop once at least this many episodes were collected
batch_eps = 1  # batch size handed to run_policy
max_steps = 50  # max steps per episode
logger = Logger(
    logname=env_name + '_DR-KL_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# --- Training loop ---
eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)
        policy.update(all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the cell is interrupted,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0016
ExplainedVarOld: -0.00478
ValFuncLoss: 61.7


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00262
ExplainedVarOld: -0.00255
ValFuncLoss: 51.6


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.0048
ExplainedVarOld: -0.00836
ValFuncLoss: 44.3


***** Episode 4, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -2.92e-08
ExplainedVarOld: -4.89e-08
ValFuncLoss: 33.6


***** Episode 5, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -7.24e-08
ExplainedVarOld: -5.8e-08
ValFuncLoss: 27


***** Episode 6, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000331
ExplainedVarOld: -0.000309
ValFuncLoss: 21.9


***** Episode 7, Mean Return = -51.0, Mean Discounted Return = -10.0

***** Episode 58, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000144
ExplainedVarOld: -9.25e-05
ValFuncLoss: 5.27


***** Episode 59, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000252
ExplainedVarOld: -0.000144
ValFuncLoss: 5.27


***** Episode 60, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000805
ExplainedVarOld: -0.000711
ValFuncLoss: 5.28


***** Episode 61, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.000584
ExplainedVarOld: -0.000919
ValFuncLoss: 5.28


***** Episode 62, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -2.5e-07
ExplainedVarOld: -3.13e-07
ValFuncLoss: 5.27


***** Episode 63, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -5.25e-07
ExplainedVarOld: -5.54e-07
ValFuncLoss: 5.27


***** Episode 64, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -2.81e-0

## ODRPO KL (batch) + offline human interaction 

In [5]:
# Train the KL-based DR policy on GridWorld-v0 while a fixed, offline "human"
# bonus is added to the advantages of selected (state, action) pairs before
# every policy update.

# --- Experiment configuration ---
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyKL(sta_num, act_num)
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount factor
lam = 1  # lambda handed to add_gae
total_eps = 100  # stop once at least this many episodes were collected
batch_eps = 1  # batch size handed to run_policy
max_steps = 50  # max steps per episode
logger = Logger(
    logname=env_name + '_DR-KL-Human_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# --- Human feedback ---
# (state, action) pairs whose advantage the human raises on every update.
# NOTE(review): the room labels come from the original notebook; the state
# numbering presumably follows the GridWorld layout -- confirm against the env.
human_preferred_actions = [
    # left red room
    (0, 2), (8, 2), (16, 2), (24, 1), (32, 0), (40, 0), (48, 0),
    (1, 2), (9, 2), (17, 2), (25, 1), (33, 0), (41, 0), (49, 0),
    # middle path
    (26, 1),
    # middle blue room
    (3, 2), (11, 2), (19, 2), (27, 1), (35, 0), (43, 0), (51, 0),
    (4, 2), (12, 2), (20, 2), (28, 1), (36, 0), (44, 0), (52, 0),
    # middle path
    (29, 1),
]
human_bonus = 1  # amount added to each preferred (state, action) advantage

# --- Training loop ---
eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)

        # human modifies the advantage
        for state, action in human_preferred_actions:
            all_advantages[state][action] += human_bonus

        policy.update(all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the cell is interrupted,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.61e-06
ExplainedVarOld: -9.64e-08
ValFuncLoss: 60.9


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00501
ExplainedVarOld: -0.00349
ValFuncLoss: 48.2


***** Episode 3, Mean Return = 78.0, Mean Discounted Return = 0.8 *****
ExplainedVarNew: -4.16e-11
ExplainedVarOld: -9.79e-10
ValFuncLoss: 2.06e+03


***** Episode 4, Mean Return = 81.0, Mean Discounted Return = 4.9 *****
ExplainedVarNew: -2.39e-11
ExplainedVarOld: -3.42e-11
ValFuncLoss: 2.18e+03


***** Episode 5, Mean Return = 84.0, Mean Discounted Return = 10.4 *****
ExplainedVarNew: -8.33e-14
ExplainedVarOld: -4.81e-11
ValFuncLoss: 2.41e+03


***** Episode 6, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.3e-13
ExplainedVarOld: -2.42e-14
ValFuncLoss: 5.27e+03


***** Episode 7, Mean Return = 91.0, Mean Discounted R

***** Episode 58, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.53e-14
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.77e+03


***** Episode 59, Mean Return = 93.0, Mean Discounted Return = 42.6 *****
ExplainedVarNew: -4.66e-15
ExplainedVarOld: -2e-14
ValFuncLoss: 1.32e+03


***** Episode 60, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.51e-14
ExplainedVarOld: 2.22e-16
ValFuncLoss: 1.69e+03


***** Episode 61, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.33e-14
ExplainedVarOld: -1.51e-14
ValFuncLoss: 1.66e+03


***** Episode 62, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.44e-15
ExplainedVarOld: -1.33e-14
ValFuncLoss: 1.63e+03


***** Episode 63, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.33e-14
ExplainedVarOld: -8.44e-15
ValFuncLoss: 1.59e+03


***** Episode 64, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: 2.

## ODRPO Wasserstein (batch)

In [6]:
# Baseline run: train the Wasserstein-based DR policy on GridWorld-v0 with no
# human feedback. Each loop iteration collects a batch of episodes, estimates
# advantages, then updates the policy and the value function.

# --- Experiment configuration ---
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount factor
lam = 1  # lambda handed to add_gae
total_eps = 100  # stop once at least this many episodes were collected
batch_eps = 1  # batch size handed to run_policy
max_steps = 50  # max steps per episode
logger = Logger(
    logname=env_name + '_DR-Wass_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# --- Training loop ---
eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)
        policy.update(all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the cell is interrupted,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.5e-06
ExplainedVarOld: -2.98e-05
ValFuncLoss: 63.5


***** Episode 2, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -7.92e-08
ExplainedVarOld: -4.38e-06
ValFuncLoss: 55.4


***** Episode 3, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.36e-08
ExplainedVarOld: -2.43e-08
ValFuncLoss: 49.1


***** Episode 4, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.09e-08
ExplainedVarOld: -3.89e-08
ValFuncLoss: 42.5


***** Episode 5, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -1.47e-09
ExplainedVarOld: -1.1e-08
ValFuncLoss: 36.2


***** Episode 6, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -9.59e-08
ExplainedVarOld: -3.08e-07
ValFuncLoss: 30.6


***** Episode 7, Mean Return = -51.0, Mean Discounted Retur

***** Episode 66, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -4.31e-14
ExplainedVarOld: 2.22e-16
ValFuncLoss: 5.27


***** Episode 67, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -2.19e-11
ExplainedVarOld: -2.11e-11
ValFuncLoss: 5.27


***** Episode 68, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 2.22e-16
ValFuncLoss: 5.27


***** Episode 69, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -5.27e-11
ExplainedVarOld: -5.27e-11
ValFuncLoss: 5.27


***** Episode 70, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: 2.22e-16
ExplainedVarOld: 2.22e-16
ValFuncLoss: 5.27


***** Episode 71, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -2.15e-11
ExplainedVarOld: -2.11e-11
ValFuncLoss: 5.27


***** Episode 72, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: 2.22e-16
Exp

## ODRPO Wasserstein (batch) + offline human interaction

In [7]:
# Train the Wasserstein-based DR policy on GridWorld-v0 while a fixed, offline
# "human" bonus is added to the advantages of selected (state, action) pairs
# before every policy update.

# --- Experiment configuration ---
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
val_func = NNValueFunction(1, 5)
gamma = 0.9  # discount factor
lam = 1  # lambda handed to add_gae
total_eps = 100  # stop once at least this many episodes were collected
batch_eps = 1  # batch size handed to run_policy
max_steps = 50  # max steps per episode
logger = Logger(
    logname=env_name + '_DR-Wass-Human_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# --- Human feedback ---
# (state, action) pairs whose advantage the human raises on every update.
# NOTE(review): the room labels come from the original notebook; the state
# numbering presumably follows the GridWorld layout -- confirm against the env.
human_preferred_actions = [
    # left red room
    (0, 2), (8, 2), (16, 2), (24, 1), (32, 0), (40, 0), (48, 0),
    (1, 2), (9, 2), (17, 2), (25, 1), (33, 0), (41, 0), (49, 0),
    # middle path
    (26, 1),
    # middle blue room
    (3, 2), (11, 2), (19, 2), (27, 1), (35, 0), (43, 0), (51, 0),
    (4, 2), (12, 2), (20, 2), (28, 1), (36, 0), (44, 0), (52, 0),
    # middle path
    (29, 1),
]
human_bonus = 1  # amount added to each preferred (state, action) advantage

# --- Training loop ---
eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)

        # human modifies the advantage
        for state, action in human_preferred_actions:
            all_advantages[state][action] += human_bonus

        policy.update(all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the cell is interrupted,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 5, h2: 5, h3: 5, lr: 0.00447
***** Episode 1, Mean Return = -51.0, Mean Discounted Return = -10.0 *****
ExplainedVarNew: -0.00657
ExplainedVarOld: -0.00837
ValFuncLoss: 58.6


***** Episode 2, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -7.77e-06
ExplainedVarOld: -2.13e-06
ValFuncLoss: 5.89e+03


***** Episode 3, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -3.23e-10
ExplainedVarOld: -7.77e-06
ValFuncLoss: 5.47e+03


***** Episode 4, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -9.66e-12
ExplainedVarOld: -3.23e-10
ValFuncLoss: 5.26e+03


***** Episode 5, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.87e-12
ExplainedVarOld: -9.66e-12
ValFuncLoss: 5.11e+03


***** Episode 6, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -1.57e-12
ExplainedVarOld: -2.87e-12
ValFuncLoss: 4.98e+03


***** Episode 7, Mean Return = 95.0, Mean Discoun

***** Episode 61, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.44e-15
ExplainedVarOld: -8.44e-15
ValFuncLoss: 1.85e+03


***** Episode 62, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -4.06e-14
ExplainedVarOld: -8.44e-15
ValFuncLoss: 1.81e+03


***** Episode 63, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.89e-14
ExplainedVarOld: -4.06e-14
ValFuncLoss: 1.78e+03


***** Episode 64, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -2.89e-14
ExplainedVarOld: -2.89e-14
ValFuncLoss: 1.75e+03


***** Episode 65, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -8.44e-15
ExplainedVarOld: -2.89e-14
ValFuncLoss: 1.72e+03


***** Episode 66, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNew: -3.38e-14
ExplainedVarOld: -8.44e-15
ValFuncLoss: 1.68e+03


***** Episode 67, Mean Return = 95.0, Mean Discounted Return = 55.0 *****
ExplainedVarNe

# 2. Chain

## ODRPO Wasserstein (batch)

In [8]:
# Baseline run: train the Wasserstein-based DR policy on NChain-v0 with no
# human feedback. Each loop iteration collects a batch of episodes, estimates
# advantages, then updates the policy and the value function.

# --- Experiment configuration ---
env_name = 'NChain-v0'
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
val_func = NNValueFunction(1, 10)
gamma = 0.9  # discount factor
lam = 1  # lambda handed to add_gae
total_eps = 200  # stop once at least this many episodes were collected
batch_eps = 1  # batch size handed to run_policy
max_steps = 1000  # step limit handed to run_policy
logger = Logger(
    logname=env_name + '_DR-Wass_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# --- Training loop ---
eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)
        policy.update(all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the cell is interrupted,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 1, Mean Return = 1262.0, Mean Discounted Return = 12.5 *****
ExplainedVarNew: -0.00313
ExplainedVarOld: -0.000494
ValFuncLoss: 136


***** Episode 2, Mean Return = 2522.0, Mean Discounted Return = 8.0 *****
ExplainedVarNew: -3.03e-06
ExplainedVarOld: -0.000257
ValFuncLoss: 705


***** Episode 3, Mean Return = 2856.0, Mean Discounted Return = 15.5 *****
ExplainedVarNew: -3.8e-07
ExplainedVarOld: -2.94e-06
ValFuncLoss: 761


***** Episode 4, Mean Return = 2850.0, Mean Discounted Return = 9.4 *****
ExplainedVarNew: -1.11e-07
ExplainedVarOld: -3.84e-07
ValFuncLoss: 679


***** Episode 5, Mean Return = 3192.0, Mean Discounted Return = 14.7 *****
ExplainedVarNew: -3.96e-08
ExplainedVarOld: -9.2e-08
ValFuncLoss: 786


***** Episode 6, Mean Return = 3056.0, Mean Discounted Return = 48.5 *****
ExplainedVarNew: -2.32e-08
ExplainedVarOld: -4.23e-08
ValFuncLoss: 644


***** Episode 7, Mean Return = 3086.0, Mean Discounted Return = 23.

***** Episode 57, Mean Return = 2908.0, Mean Discounted Return = 19.4 *****
ExplainedVarNew: -0.236
ExplainedVarOld: -0.228
ValFuncLoss: 331


***** Episode 58, Mean Return = 3036.0, Mean Discounted Return = 28.9 *****
ExplainedVarNew: -0.298
ExplainedVarOld: -0.236
ValFuncLoss: 344


***** Episode 59, Mean Return = 3088.0, Mean Discounted Return = 39.7 *****
ExplainedVarNew: -0.284
ExplainedVarOld: -0.328
ValFuncLoss: 312


***** Episode 60, Mean Return = 3058.0, Mean Discounted Return = 14.6 *****
ExplainedVarNew: -0.205
ExplainedVarOld: -0.2
ValFuncLoss: 407


***** Episode 61, Mean Return = 2964.0, Mean Discounted Return = 27.4 *****
ExplainedVarNew: -0.228
ExplainedVarOld: -0.179
ValFuncLoss: 464


***** Episode 62, Mean Return = 2768.0, Mean Discounted Return = 16.2 *****
ExplainedVarNew: -0.385
ExplainedVarOld: -0.387
ValFuncLoss: 307


***** Episode 63, Mean Return = 3020.0, Mean Discounted Return = 17.2 *****
ExplainedVarNew: -0.215
ExplainedVarOld: -0.274
ValFuncLoss: 391


*

***** Episode 115, Mean Return = 2868.0, Mean Discounted Return = 11.6 *****
ExplainedVarNew: -0.256
ExplainedVarOld: -0.291
ValFuncLoss: 389


***** Episode 116, Mean Return = 2902.0, Mean Discounted Return = 16.7 *****
ExplainedVarNew: -0.366
ExplainedVarOld: -0.272
ValFuncLoss: 399


***** Episode 117, Mean Return = 3686.0, Mean Discounted Return = 29.6 *****
ExplainedVarNew: -0.223
ExplainedVarOld: -0.293
ValFuncLoss: 512


***** Episode 118, Mean Return = 3108.0, Mean Discounted Return = 13.0 *****
ExplainedVarNew: -0.335
ExplainedVarOld: -0.317
ValFuncLoss: 360


***** Episode 119, Mean Return = 3010.0, Mean Discounted Return = 12.3 *****
ExplainedVarNew: -0.322
ExplainedVarOld: -0.283
ValFuncLoss: 416


***** Episode 120, Mean Return = 2854.0, Mean Discounted Return = 16.4 *****
ExplainedVarNew: -0.331
ExplainedVarOld: -0.368
ValFuncLoss: 358


***** Episode 121, Mean Return = 2770.0, Mean Discounted Return = 14.1 *****
ExplainedVarNew: -0.319
ExplainedVarOld: -0.374
ValFuncLoss

***** Episode 173, Mean Return = 3052.0, Mean Discounted Return = 44.2 *****
ExplainedVarNew: -0.211
ExplainedVarOld: -0.247
ValFuncLoss: 371


***** Episode 174, Mean Return = 2942.0, Mean Discounted Return = 11.4 *****
ExplainedVarNew: -0.336
ExplainedVarOld: -0.235
ValFuncLoss: 358


***** Episode 175, Mean Return = 2802.0, Mean Discounted Return = 28.1 *****
ExplainedVarNew: -0.328
ExplainedVarOld: -0.339
ValFuncLoss: 348


***** Episode 176, Mean Return = 2768.0, Mean Discounted Return = 28.9 *****
ExplainedVarNew: -0.337
ExplainedVarOld: -0.286
ValFuncLoss: 396


***** Episode 177, Mean Return = 3226.0, Mean Discounted Return = 36.2 *****
ExplainedVarNew: -0.356
ExplainedVarOld: -0.337
ValFuncLoss: 442


***** Episode 178, Mean Return = 2558.0, Mean Discounted Return = 17.8 *****
ExplainedVarNew: -0.438
ExplainedVarOld: -0.418
ValFuncLoss: 356


***** Episode 179, Mean Return = 3630.0, Mean Discounted Return = 28.3 *****
ExplainedVarNew: -0.275
ExplainedVarOld: -0.313
ValFuncLoss

## ODRPO Wasserstein (batch) + offline human interaction

In [9]:
# Train the Wasserstein-based DR policy on NChain-v0 while a fixed, offline
# "human" bonus is added to the advantage of action 0 in states 0-4 before
# every policy update.

# --- Experiment configuration ---
env_name = 'NChain-v0'
env = gym.make(env_name)
sta_num = env.observation_space.n  # size of the discrete observation space
act_num = env.action_space.n  # size of the discrete action space
policy = DRPolicyWass(sta_num, act_num)
val_func = NNValueFunction(1, 10)
gamma = 0.9  # discount factor
lam = 1  # lambda handed to add_gae
total_eps = 200  # stop once at least this many episodes were collected
batch_eps = 1  # batch size handed to run_policy
max_steps = 1000  # step limit handed to run_policy
logger = Logger(
    logname=env_name + '_DR-Wass-Human_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# --- Human feedback ---
human_bonus_states = range(5)  # states 0..4 receive the bonus
human_bonus_action = 0  # action whose advantage the human raises
human_bonus = 0.1  # amount added on every update

# --- Training loop ---
eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # compute the discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)

        # human modifies the advantage
        for state in human_bonus_states:
            all_advantages[state][human_bonus_action] += human_bonus

        policy.update(all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the cell is interrupted,
    # so the log is not left open in the long-lived notebook kernel.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 1, Mean Return = 1284.0, Mean Discounted Return = 9.6 *****
ExplainedVarNew: -0.000972
ExplainedVarOld: -0.00168
ValFuncLoss: 146


***** Episode 2, Mean Return = 2428.0, Mean Discounted Return = 17.8 *****
ExplainedVarNew: -2.89e-06
ExplainedVarOld: -0.00012
ValFuncLoss: 659


***** Episode 3, Mean Return = 2738.0, Mean Discounted Return = 18.5 *****
ExplainedVarNew: -2.26e-07
ExplainedVarOld: -2.92e-06
ValFuncLoss: 691


***** Episode 4, Mean Return = 3202.0, Mean Discounted Return = 13.1 *****
ExplainedVarNew: -3.46e-08
ExplainedVarOld: -1.56e-07
ValFuncLoss: 925


***** Episode 5, Mean Return = 2852.0, Mean Discounted Return = 16.5 *****
ExplainedVarNew: -1.78e-08
ExplainedVarOld: -4.26e-08
ValFuncLoss: 628


***** Episode 6, Mean Return = 2946.0, Mean Discounted Return = 13.9 *****
ExplainedVarNew: -8.59e-09
ExplainedVarOld: -1.59e-08
ValFuncLoss: 654


***** Episode 7, Mean Return = 2986.0, Mean Discounted Return = 1

***** Episode 57, Mean Return = 3716.0, Mean Discounted Return = 16.8 *****
ExplainedVarNew: -0.148
ExplainedVarOld: -0.138
ValFuncLoss: 557


***** Episode 58, Mean Return = 3362.0, Mean Discounted Return = 17.9 *****
ExplainedVarNew: -0.238
ExplainedVarOld: -0.201
ValFuncLoss: 423


***** Episode 59, Mean Return = 2542.0, Mean Discounted Return = 15.3 *****
ExplainedVarNew: -0.291
ExplainedVarOld: -0.317
ValFuncLoss: 320


***** Episode 60, Mean Return = 1450.0, Mean Discounted Return = 11.9 *****
ExplainedVarNew: -0.57
ExplainedVarOld: -0.441
ValFuncLoss: 114


***** Episode 61, Mean Return = 1504.0, Mean Discounted Return = 13.5 *****
ExplainedVarNew: -0.188
ExplainedVarOld: -0.519
ValFuncLoss: 87.6


***** Episode 62, Mean Return = 3180.0, Mean Discounted Return = 31.4 *****
ExplainedVarNew: -0.311
ExplainedVarOld: -0.113
ValFuncLoss: 511


***** Episode 63, Mean Return = 3322.0, Mean Discounted Return = 10.6 *****
ExplainedVarNew: -0.234
ExplainedVarOld: -0.351
ValFuncLoss: 412



***** Episode 115, Mean Return = 2996.0, Mean Discounted Return = 26.7 *****
ExplainedVarNew: -0.235
ExplainedVarOld: -0.233
ValFuncLoss: 381


***** Episode 116, Mean Return = 3088.0, Mean Discounted Return = 33.7 *****
ExplainedVarNew: -0.328
ExplainedVarOld: -0.249
ValFuncLoss: 385


***** Episode 117, Mean Return = 3236.0, Mean Discounted Return = 31.5 *****
ExplainedVarNew: -0.216
ExplainedVarOld: -0.299
ValFuncLoss: 391


***** Episode 118, Mean Return = 3534.0, Mean Discounted Return = 9.3 *****
ExplainedVarNew: -0.241
ExplainedVarOld: -0.221
ValFuncLoss: 403


***** Episode 119, Mean Return = 3242.0, Mean Discounted Return = 18.4 *****
ExplainedVarNew: -0.22
ExplainedVarOld: -0.227
ValFuncLoss: 405


***** Episode 120, Mean Return = 3436.0, Mean Discounted Return = 17.9 *****
ExplainedVarNew: -0.245
ExplainedVarOld: -0.19
ValFuncLoss: 490


***** Episode 121, Mean Return = 3948.0, Mean Discounted Return = 22.2 *****
ExplainedVarNew: -0.17
ExplainedVarOld: -0.263
ValFuncLoss: 44

***** Episode 173, Mean Return = 3422.0, Mean Discounted Return = 30.0 *****
ExplainedVarNew: -0.251
ExplainedVarOld: -0.194
ValFuncLoss: 476


***** Episode 174, Mean Return = 3648.0, Mean Discounted Return = 17.3 *****
ExplainedVarNew: -0.223
ExplainedVarOld: -0.243
ValFuncLoss: 476


***** Episode 175, Mean Return = 3422.0, Mean Discounted Return = 17.6 *****
ExplainedVarNew: -0.249
ExplainedVarOld: -0.256
ValFuncLoss: 415


***** Episode 176, Mean Return = 3594.0, Mean Discounted Return = 19.8 *****
ExplainedVarNew: -0.238
ExplainedVarOld: -0.258
ValFuncLoss: 403


***** Episode 177, Mean Return = 3944.0, Mean Discounted Return = 21.2 *****
ExplainedVarNew: -0.195
ExplainedVarOld: -0.223
ValFuncLoss: 425


***** Episode 178, Mean Return = 4000.0, Mean Discounted Return = 15.5 *****
ExplainedVarNew: -0.175
ExplainedVarOld: -0.156
ValFuncLoss: 511


***** Episode 179, Mean Return = 3586.0, Mean Discounted Return = 19.5 *****
ExplainedVarNew: -0.236
ExplainedVarOld: -0.229
ValFuncLoss