In [2]:
import gym
import gym_gridworld
from datetime import datetime

# DR TRPO related files
from grid_train_helper import *
from value import NNValueFunction
from utils import Logger
from grid_dr_policy import DRPolicyKL, DRPolicyWass

# ODRPO - KL

## No Human in the Loop

In [6]:
# Train a DRPolicyKL agent on GridWorld-v0 without human feedback.
# Each pass through the loop collects a batch of episodes, builds a training
# set from them, then updates the policy and refits the value function.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete observations
act_num = env.action_space.n  # number of discrete actions
policy = DRPolicyKL(sta_num, act_num)
val_func = NNValueFunction(1, 10)
gamma = 0.9  # passed to the discounted-return, GAE and disc-freq helpers
lam = 1  # passed to add_gae together with gamma
total_eps = 50  # training stops once this many episodes were collected
batch_eps = 1  # batch size argument handed to run_policy
max_steps = 10 # max steps per episode
logger = Logger(
    logname=env_name + '_DR-KL_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)
        policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the kernel is interrupted,
    # so a failed run does not leave the logger open.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 1, Mean Return = -47.0, Mean Discounted Return = -27.5 *****
ExplainedVarNew: -0.00483
ExplainedVarOld: -0.00206
ValFuncLoss: 437


***** Episode 2, Mean Return = -27.0, Mean Discounted Return = -16.0 *****
ExplainedVarNew: -1.63e-08
ExplainedVarOld: -4.93e-06
ValFuncLoss: 107


***** Episode 3, Mean Return = -43.0, Mean Discounted Return = -23.5 *****
ExplainedVarNew: -5.27e-10
ExplainedVarOld: -2.91e-09
ValFuncLoss: 318


***** Episode 4, Mean Return = -23.0, Mean Discounted Return = -15.0 *****
ExplainedVarNew: -5.33e-08
ExplainedVarOld: -2.51e-07
ValFuncLoss: 43.6


***** Episode 5, Mean Return = -15.0, Mean Discounted Return = -9.2 *****
ExplainedVarNew: -1.05e-09
ExplainedVarOld: -1.78e-09
ValFuncLoss: 13


***** Episode 6, Mean Return = -11.0, Mean Discounted Return = -6.9 *****
ExplainedVarNew: -6.64e-08
ExplainedVarOld: -7.77e-08
ValFuncLoss: 3.47


***** Episode 7, Mean Return = -27.0, Mean Discounted Return = -1

## Human in the Loop

In [4]:
# Train a DRPolicyKL agent on GridWorld-v0 with human feedback: after the
# advantages are preprocessed, a fixed set of entries is bumped by +1 before
# the policy update.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete observations
act_num = env.action_space.n  # number of discrete actions
policy = DRPolicyKL(sta_num, act_num)
val_func = NNValueFunction(1, 10)
gamma = 0.9  # passed to the discounted-return, GAE and disc-freq helpers
lam = 1  # passed to add_gae together with gamma
total_eps = 50  # training stops once this many episodes were collected
batch_eps = 1  # batch size argument handed to run_policy
max_steps = 10 # max steps per episode
logger = Logger(
    logname=env_name + '_DR-KL_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# Entries of all_advantages that the human boosts by +1 on every update.
# Presumably each pair is (state index, action index) -- TODO confirm against
# DRPolicyKL.preprocess_adv.
human_advantage_bonus = [
    # left red room
    (0, 2), (5, 2), (10, 1), (15, 0), (20, 0),
    (11, 1),
    # middle blue room
    # NOTE(review): the first indices above step by 5 (2, 7, 12, 17), which
    # would make the next one 22; 23 is kept as in the original -- confirm it
    # is intended.
    (2, 2), (7, 2), (12, 1), (17, 0), (23, 0),
    (13, 1),
]

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)

        # human modifies the advantage
        for first_idx, second_idx in human_advantage_bonus:
            all_advantages[first_idx][second_idx] += 1

        policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the kernel is interrupted,
    # so a failed run does not leave the logger open.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 1, Mean Return = -35.0, Mean Discounted Return = -19.8 *****
ExplainedVarNew: -0.000117
ExplainedVarOld: -3.69e-05
ValFuncLoss: 262


***** Episode 2, Mean Return = -19.0, Mean Discounted Return = -12.0 *****
ExplainedVarNew: -9.8e-07
ExplainedVarOld: -0.00023
ValFuncLoss: 41.2


***** Episode 3, Mean Return = 85.0, Mean Discounted Return = 36.5 *****
ExplainedVarNew: -3.95e-07
ExplainedVarOld: -3.22e-08
ValFuncLoss: 4.61e+03


***** Episode 4, Mean Return = 97.0, Mean Discounted Return = 70.2 *****
ExplainedVarNew: -2.05e-08
ExplainedVarOld: -3.18e-08
ValFuncLoss: 6.84e+03


***** Episode 5, Mean Return = 97.0, Mean Discounted Return = 70.2 *****
ExplainedVarNew: -3.06e-09
ExplainedVarOld: -2.05e-08
ValFuncLoss: 6.67e+03


***** Episode 6, Mean Return = 97.0, Mean Discounted Return = 70.2 *****
ExplainedVarNew: -7.83e-11
ExplainedVarOld: -3.06e-09
ValFuncLoss: 6.55e+03


***** Episode 7, Mean Return = 97.0, Mean Discounte

# ODRPO - Wasserstein

## No Human in the Loop

In [5]:
# Train a DRPolicyWass agent on GridWorld-v0 without human feedback.
# Each pass through the loop collects a batch of episodes, builds a training
# set from them, then updates the policy and refits the value function.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete observations
act_num = env.action_space.n  # number of discrete actions
policy = DRPolicyWass(sta_num, act_num)
val_func = NNValueFunction(1, 10)
gamma = 0.9  # passed to the discounted-return, GAE and disc-freq helpers
lam = 1  # passed to add_gae together with gamma
total_eps = 50  # training stops once this many episodes were collected
batch_eps = 1  # batch size argument handed to run_policy
max_steps = 10 # max steps per episode
logger = Logger(
    logname=env_name + '_DR-Wass_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)
        policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the kernel is interrupted,
    # so a failed run does not leave the logger open.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 1, Mean Return = -31.0, Mean Discounted Return = -18.2 *****
ExplainedVarNew: -0.000947
ExplainedVarOld: -8.09e-05
ValFuncLoss: 210


***** Episode 2, Mean Return = -27.0, Mean Discounted Return = -15.5 *****
ExplainedVarNew: -3.25e-07
ExplainedVarOld: -2.03e-07
ValFuncLoss: 127


***** Episode 3, Mean Return = -31.0, Mean Discounted Return = -17.3 *****
ExplainedVarNew: -1.04e-08
ExplainedVarOld: -3.96e-08
ValFuncLoss: 153


***** Episode 4, Mean Return = -31.0, Mean Discounted Return = -18.8 *****
ExplainedVarNew: -1.55e-07
ExplainedVarOld: -2.07e-06
ValFuncLoss: 144


***** Episode 5, Mean Return = -11.0, Mean Discounted Return = -6.9 *****
ExplainedVarNew: -2.24e-09
ExplainedVarOld: -3.15e-09
ValFuncLoss: 3.49


***** Episode 6, Mean Return = -11.0, Mean Discounted Return = -6.9 *****
ExplainedVarNew: -1.89e-09
ExplainedVarOld: -2.24e-09
ValFuncLoss: 3.43


***** Episode 7, Mean Return = -11.0, Mean Discounted Return =

## Human in the Loop 

In [7]:
# Train a DRPolicyWass agent on GridWorld-v0 with human feedback: after the
# advantages are preprocessed, a fixed set of entries is bumped by +1 before
# the policy update.
env_name = "GridWorld-v0"
env = gym.make(env_name)
sta_num = env.observation_space.n  # number of discrete observations
act_num = env.action_space.n  # number of discrete actions
policy = DRPolicyWass(sta_num, act_num)
val_func = NNValueFunction(1, 10)
gamma = 0.9  # passed to the discounted-return, GAE and disc-freq helpers
lam = 1  # passed to add_gae together with gamma
total_eps = 50  # training stops once this many episodes were collected
batch_eps = 1  # batch size argument handed to run_policy
max_steps = 10 # max steps per episode
logger = Logger(
    logname=env_name + '_DR-Wass_Batch=' + str(batch_eps),
    now=datetime.utcnow().strftime("%b-%d_%H:%M:%S"),
)

# Entries of all_advantages that the human boosts by +1 on every update.
# Presumably each pair is (state index, action index) -- TODO confirm against
# DRPolicyWass.preprocess_adv.
human_advantage_bonus = [
    # left red room
    (0, 2), (5, 2), (10, 1), (15, 0), (20, 0),
    (11, 1),
    # middle blue room
    # NOTE(review): the first indices above step by 5 (2, 7, 12, 17), which
    # would make the next one 22; 23 is kept as in the original -- confirm it
    # is intended.
    (2, 2), (7, 2), (12, 1), (17, 0), (23, 0),
    (13, 1),
]

eps = 0
try:
    while eps < total_eps:
        trajectories = run_policy(env, policy, batch_eps, max_steps, logger)
        eps += len(trajectories)
        # add estimated values to episodes
        add_value(trajectories, val_func)
        # calculate discounted sum of rewards
        add_disc_sum_rew(trajectories, gamma, logger)
        # calculate advantage
        add_gae(trajectories, gamma, lam)
        # concatenate all episodes into single NumPy arrays
        observes, actions, advantages, disc_sum_rew = build_train_set(trajectories)
        disc_freqs = find_disc_freqs(trajectories, sta_num, gamma)
        log_batch_stats(observes, actions, advantages, disc_sum_rew, eps, logger)
        all_advantages = policy.preprocess_adv(observes, actions, advantages)

        # human modifies the advantage
        for first_idx, second_idx in human_advantage_bonus:
            all_advantages[first_idx][second_idx] += 1

        policy.update(observes, actions, all_advantages, disc_freqs, env_name, eps)
        val_func.fit(observes, disc_sum_rew, logger)
        # write logger results to file and stdout
        logger.write(display=True)
finally:
    # Close the logger even when training raises or the kernel is interrupted,
    # so a failed run does not leave the logger open.
    logger.close()

Value Params -- h1: 10, h2: 7, h3: 5, lr: 0.00378
***** Episode 1, Mean Return = -19.0, Mean Discounted Return = -12.6 *****
ExplainedVarNew: -8.03e-07
ExplainedVarOld: -1.09e-05
ValFuncLoss: 44.2


***** Episode 2, Mean Return = -11.0, Mean Discounted Return = -6.9 *****
ExplainedVarNew: -2.78e-07
ExplainedVarOld: -3.01e-05
ValFuncLoss: 6.92


***** Episode 3, Mean Return = 97.0, Mean Discounted Return = 70.2 *****
ExplainedVarNew: -2.17e-08
ExplainedVarOld: -1.63e-10
ValFuncLoss: 7.11e+03


***** Episode 4, Mean Return = 97.0, Mean Discounted Return = 70.2 *****
ExplainedVarNew: -2.32e-09
ExplainedVarOld: -2.17e-08
ValFuncLoss: 6.79e+03


***** Episode 5, Mean Return = 97.0, Mean Discounted Return = 70.2 *****
ExplainedVarNew: -6.81e-11
ExplainedVarOld: -2.32e-09
ValFuncLoss: 6.63e+03


***** Episode 6, Mean Return = 97.0, Mean Discounted Return = 70.2 *****
ExplainedVarNew: -4.72e-12
ExplainedVarOld: -6.81e-11
ValFuncLoss: 6.51e+03


***** Episode 7, Mean Return = 97.0, Mean Discoun