In [3]:
import gym
import gym_gridworld
import time
import csv

# DR TRPO related files
from grid_train_helper import *
from value import NNValueFunction
from utils import Logger
from grid_dr_policy import DRPolicyKL, DRPolicyWass

# 1. Move to Yellow Room

## ODRPO (online)

In [18]:
env_name = "GridWorld-v0"
env = gym.make(env_name)
file_name = "log_files/GridWorld-odrpo/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()
    
sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyKL(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 100
batch_eps = 1
max_steps = 50 # max steps per episode
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.2


eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return -51.0000003 discounted reward -9.9536163
Episode 2 return -51.0000003 discounted reward -9.9536163
Episode 3 return -51.0000003 discounted reward -9.9536163
Episode 4 return -51.0000003 discounted reward -9.9536163
Episode 5 return -51.0000003 discounted reward -9.9536163
Episode 6 return -51.0000003 discounted reward -9.9536163
Episode 7 return -51.0000003 discounted reward -9.9536163
Episode 8 return -51.0000003 discounted reward -9.9536163
Episode 9 return -51.0000003 discounted reward -9.9536163
Episode 10 return -51.0000003 discounted reward -9.9536163
Episode 11 return -51.0000003 discounted reward -9.9536163
Episode 12 return -51.0000003 discounted reward -9.9536163
Episode 13 return -51.0000003 discounted reward -9.9536163
Episode 14 return -51.0000003 discounted reward -9.9536163
Episode 15 return -51.0000003 discounted reward -9.9536163
Episode 16 return 93.0000003 discounted reward 42.6126593
Episode 17 return 95.0000003 discounted reward 54.9539003
Episode 

## ODRPO + online human interaction 

In [36]:
env_name = "GridWorld-v0"
env = gym.make(env_name)
file_name = "log_files/GridWorld-odrpo-online-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyKL(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 100
batch_eps = 1
max_steps = 50 # max steps per episode
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.2

human_recommendation = dict([(0,2),(8,2),(16,2),(24,1),(32,0),(40,0),(48,0),(1,2),(9,2),(17,2),(25,1),(33,0),(41,0),(49,0),\
                            (26,1),(3,2),(11,2),(19,2),(27,1),(35,0),(43,0),(51,0),(4,2),(12,2),(20,2),(28,1),(36,0),(44,0),(52,0),(29,1)])
        
eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        #--------------- Human in the loop --------------- #
        if human_recommendation[observe] != action: 
            all_advantages[observe][action] -= 1
        else:
            all_advantages[observe][action] += 1
        #------------------------------------------------- #
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return -51.0000003 discounted reward -9.9536163
Episode 2 return -51.0000003 discounted reward -9.9536163
Episode 3 return -51.0000003 discounted reward -9.9536163
Episode 4 return -51.0000003 discounted reward -9.9536163
Episode 5 return -51.0000003 discounted reward -9.9536163
Episode 6 return 84.0000003 discounted reward 10.3832223
Episode 7 return 95.0000003 discounted reward 54.9539003
Episode 8 return 95.0000003 discounted reward 54.9539003
Episode 9 return 95.0000003 discounted reward 54.9539003
Episode 10 return 95.0000003 discounted reward 54.9539003
Episode 11 return 95.0000003 discounted reward 54.9539003
Episode 12 return 95.0000003 discounted reward 54.9539003
Episode 13 return 95.0000003 discounted reward 54.9539003
Episode 14 return 95.0000003 discounted reward 54.9539003
Episode 15 return 95.0000003 discounted reward 54.9539003
Episode 16 return 95.0000003 discounted reward 54.9539003
Episode 17 return 95.0000003 discounted reward 54.9539003
Episode 18 return 

## ODRPO + offline human interaction 

In [40]:
env_name = "GridWorld-v0"
env = gym.make(env_name)
file_name = "log_files/GridWorld-odrpo-offline-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyKL(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 100
batch_eps = 1
max_steps = 50 # max steps per episode
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.2


eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate 
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate 
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
        
    # human modifies the advantage
    # left red room
    all_advantages[0][2] += 1
    all_advantages[8][2] += 1
    all_advantages[16][2] += 1
    all_advantages[24][1] += 1
    all_advantages[32][0] += 1
    all_advantages[40][0] += 1
    all_advantages[48][0] += 1
        
    all_advantages[1][2] += 1
    all_advantages[9][2] += 1
    all_advantages[17][2] += 1
    all_advantages[25][1] += 1
    all_advantages[33][0] += 1
    all_advantages[41][0] += 1
    all_advantages[49][0] += 1
        
        
    # middle path 
    all_advantages[26][1] += 1
        
    # middle blue room
    all_advantages[3][2] += 1
    all_advantages[11][2] += 1
    all_advantages[19][2] += 1
    all_advantages[27][1] += 1
    all_advantages[35][0] += 1
    all_advantages[43][0] += 1
    all_advantages[51][0] += 1
        
    all_advantages[4][2] += 1
    all_advantages[12][2] += 1
    all_advantages[20][2] += 1
    all_advantages[28][1] += 1
    all_advantages[36][0] += 1
    all_advantages[44][0] += 1
    all_advantages[52][0] += 1
        
    # middle path 
    all_advantages[29][1] += 1
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return -51.0000003 discounted reward -9.9536163
Episode 2 return -51.0000003 discounted reward -9.9536163
Episode 3 return -51.0000003 discounted reward -9.9536163
Episode 4 return -51.0000003 discounted reward -9.9536163
Episode 5 return -51.0000003 discounted reward -9.9536163
Episode 6 return 93.0000003 discounted reward 42.6126593
Episode 7 return 95.0000003 discounted reward 54.9539003
Episode 8 return 95.0000003 discounted reward 54.9539003
Episode 9 return 95.0000003 discounted reward 54.9539003
Episode 10 return 95.0000003 discounted reward 54.9539003
Episode 11 return 95.0000003 discounted reward 54.9539003
Episode 12 return 95.0000003 discounted reward 54.9539003
Episode 13 return 95.0000003 discounted reward 54.9539003
Episode 14 return 95.0000003 discounted reward 54.9539003
Episode 15 return 95.0000003 discounted reward 54.9539003
Episode 16 return 95.0000003 discounted reward 54.9539003
Episode 17 return 95.0000003 discounted reward 54.9539003
Episode 18 return 

## ODRPO completely by humans

In [12]:
env_name = "GridWorld-v0"
env = gym.make(env_name)
file_name = "log_files/GridWorld-odrpo-complete-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyKL(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 100
batch_eps = 1
max_steps = 50 # max steps per episode
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))

human_recommendation = dict([(0,2),(8,2),(16,2),(24,1),(32,0),(40,0),(48,0),(1,2),(9,2),(17,2),(25,1),(33,0),(41,0),(49,0),\
                            (26,1),(3,2),(11,2),(19,2),(27,1),(35,0),(43,0),(51,0),(4,2),(12,2),(20,2),(28,1),(36,0),(44,0),(52,0),(29,1)])
        
eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        #--------------- Human writes the advantage table --------------- #
        if human_recommendation[observe] != action: 
            all_advantages[observe][action] = -1
        else:
            all_advantages[observe][action] = 1
        #---------------------------------------------------------------- #
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return -51.0000003 discounted reward -9.9536163
Episode 2 return -51.0000003 discounted reward -9.9536163
Episode 3 return -51.0000003 discounted reward -9.9536163
Episode 4 return -51.0000003 discounted reward -9.9536163
Episode 5 return -51.0000003 discounted reward -9.9536163
Episode 6 return 90.0000003 discounted reward 28.3546283
Episode 7 return 95.0000003 discounted reward 54.9539003
Episode 8 return 95.0000003 discounted reward 54.9539003
Episode 9 return 95.0000003 discounted reward 54.9539003
Episode 10 return 95.0000003 discounted reward 54.9539003
Episode 11 return 95.0000003 discounted reward 54.9539003
Episode 12 return 95.0000003 discounted reward 54.9539003
Episode 13 return 95.0000003 discounted reward 54.9539003
Episode 14 return 95.0000003 discounted reward 54.9539003
Episode 15 return 95.0000003 discounted reward 54.9539003
Episode 16 return 95.0000003 discounted reward 54.9539003
Episode 17 return 95.0000003 discounted reward 54.9539003
Episode 18 return 

# Chain

## ODRPO (online)

In [47]:
env_name = 'NChain-v0'
env = gym.make(env_name)
file_name = "log_files/NChain-odrpo/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 200
batch_eps = 1
max_steps = 1000
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.05


eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate 
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate 
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))
print(all_advantages)
print(policy.get_policy())

Episode 1 return 1174.0000003 discounted reward 14.0651403
Episode 2 return 1390.0000003 discounted reward 14.2740993
Episode 3 return 1276.0000003 discounted reward 11.6945903
Episode 4 return 1346.0000003 discounted reward 13.7001683
Episode 5 return 1282.0000003 discounted reward 9.5815123
Episode 6 return 1852.0000003 discounted reward 15.9635843
Episode 7 return 1788.0000003 discounted reward 16.7413803
Episode 8 return 1848.0000003 discounted reward 15.1362373
Episode 9 return 1666.0000003 discounted reward 23.4010663
Episode 10 return 1832.0000003 discounted reward 14.1694293
Episode 11 return 2026.0000003 discounted reward 18.0635833
Episode 12 return 1936.0000003 discounted reward 15.1619543
Episode 13 return 2012.0000003 discounted reward 18.9144883
Episode 14 return 2176.0000003 discounted reward 18.5112483
Episode 15 return 1772.0000003 discounted reward 17.4293403
Episode 16 return 2108.0000003 discounted reward 17.4750583
Episode 17 return 1926.0000003 discounted reward 2

Episode 138 return 3078.0000003 discounted reward 30.6741623
Episode 139 return 3348.0000003 discounted reward 17.4507633
Episode 140 return 2898.0000003 discounted reward 47.3585933
Episode 141 return 3200.0000003 discounted reward 20.1274293
Episode 142 return 3200.0000003 discounted reward 18.8372133
Episode 143 return 3104.0000003 discounted reward 19.4762333
Episode 144 return 2918.0000003 discounted reward 32.5015163
Episode 145 return 2652.0000003 discounted reward 28.1397443
Episode 146 return 3106.0000003 discounted reward 15.8942373
Episode 147 return 2904.0000003 discounted reward 21.1143033
Episode 148 return 3150.0000003 discounted reward 17.3923473
Episode 149 return 3194.0000003 discounted reward 27.0967593
Episode 150 return 3218.0000003 discounted reward 17.7627973
Episode 151 return 2894.0000003 discounted reward 19.0300373
Episode 152 return 2856.0000003 discounted reward 15.3206193
Episode 153 return 3244.0000003 discounted reward 38.3822963
Episode 154 return 2942.

## ODRPO + online human interaction

In [34]:
env_name = 'NChain-v0'
env = gym.make(env_name)
file_name = "log_files/NChain-odrpo-online-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 200
batch_eps = 1
max_steps = 1000
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.05

human_recommendation = dict([(0,0),(1,0),(2,0),(3,0),(4,0)])

eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate 
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate 
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        #--------------- Human in the loop --------------- #
        if human_recommendation[observe] != action: 
            all_advantages[observe][action] -= 0.1
        else:
            all_advantages[observe][action] += 0.1
        #------------------------------------------------- #
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return 1212.0000003 discounted reward 12.8603903
Episode 2 return 1318.0000003 discounted reward 11.9965603
Episode 3 return 1362.0000003 discounted reward 14.3577553
Episode 4 return 1252.0000003 discounted reward 10.5205883
Episode 5 return 1234.0000003 discounted reward 10.7337473
Episode 6 return 3826.0000003 discounted reward 23.3161283
Episode 7 return 3876.0000003 discounted reward 21.7598833
Episode 8 return 3372.0000003 discounted reward 25.1596503
Episode 9 return 3376.0000003 discounted reward 54.1895283
Episode 10 return 3992.0000003 discounted reward 23.2416023
Episode 11 return 3792.0000003 discounted reward 18.1045753
Episode 12 return 4268.0000003 discounted reward 13.1236903
Episode 13 return 3400.0000003 discounted reward 31.7882443
Episode 14 return 3308.0000003 discounted reward 31.0397953
Episode 15 return 4102.0000003 discounted reward 37.8649703
Episode 16 return 3492.0000003 discounted reward 39.2364363
Episode 17 return 3540.0000003 discounted reward 

Episode 139 return 3824.0000003 discounted reward 17.7559953
Episode 140 return 3276.0000003 discounted reward 16.9213863
Episode 141 return 3760.0000003 discounted reward 34.7076023
Episode 142 return 3290.0000003 discounted reward 36.9568823
Episode 143 return 3668.0000003 discounted reward 27.5550393
Episode 144 return 3322.0000003 discounted reward 30.0297283
Episode 145 return 3706.0000003 discounted reward 36.9556713
Episode 146 return 3520.0000003 discounted reward 15.6516063
Episode 147 return 3530.0000003 discounted reward 40.8245253
Episode 148 return 3248.0000003 discounted reward 30.2846963
Episode 149 return 4174.0000003 discounted reward 47.1490103
Episode 150 return 3934.0000003 discounted reward 41.5141533
Episode 151 return 3658.0000003 discounted reward 36.2740463
Episode 152 return 3962.0000003 discounted reward 25.8577293
Episode 153 return 4348.0000003 discounted reward 49.1279943
Episode 154 return 3690.0000003 discounted reward 10.6732593
Episode 155 return 3112.

## ODRPO + offline human interaction

In [50]:
env_name = 'NChain-v0'
env = gym.make(env_name)
file_name = "log_files/NChain-odrpo-offline-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 200
batch_eps = 1
max_steps = 1000
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.05


eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate 
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate 
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
        
    # human modifies the advantage
    all_advantages[0][0] += 0.1
    all_advantages[1][0] += 0.1
    all_advantages[2][0] += 0.1
    all_advantages[3][0] += 0.1
    all_advantages[4][0] += 0.1
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return 1190.0000003 discounted reward 11.4357993
Episode 2 return 1466.0000003 discounted reward 13.0208193
Episode 3 return 1322.0000003 discounted reward 8.5814073
Episode 4 return 1260.0000003 discounted reward 9.3045713
Episode 5 return 1394.0000003 discounted reward 11.8486923
Episode 6 return 2194.0000003 discounted reward 16.5420083
Episode 7 return 2138.0000003 discounted reward 16.0481603
Episode 8 return 2032.0000003 discounted reward 13.4210633
Episode 9 return 3170.0000003 discounted reward 21.3629753
Episode 10 return 3208.0000003 discounted reward 37.8912963
Episode 11 return 3214.0000003 discounted reward 21.7396103
Episode 12 return 3010.0000003 discounted reward 22.0163173
Episode 13 return 3126.0000003 discounted reward 21.6623123
Episode 14 return 3412.0000003 discounted reward 16.8538013
Episode 15 return 3846.0000003 discounted reward 19.0900683
Episode 16 return 3646.0000003 discounted reward 23.5003203
Episode 17 return 3582.0000003 discounted reward 42

Episode 138 return 3374.0000003 discounted reward 18.1566443
Episode 139 return 3562.0000003 discounted reward 62.8372213
Episode 140 return 3384.0000003 discounted reward 29.4159933
Episode 141 return 3634.0000003 discounted reward 24.2437293
Episode 142 return 3884.0000003 discounted reward 21.0375873
Episode 143 return 3668.0000003 discounted reward 14.6371293
Episode 144 return 3598.0000003 discounted reward 28.8574253
Episode 145 return 3246.0000003 discounted reward 36.7610593
Episode 146 return 3884.0000003 discounted reward 41.7773043
Episode 147 return 3460.0000003 discounted reward 20.3706863
Episode 148 return 3460.0000003 discounted reward 22.0905703
Episode 149 return 3108.0000003 discounted reward 41.3440603
Episode 150 return 3532.0000003 discounted reward 28.9042753
Episode 151 return 3224.0000003 discounted reward 26.5687573
Episode 152 return 3578.0000003 discounted reward 12.9340383
Episode 153 return 3780.0000003 discounted reward 10.0720233
Episode 154 return 3522.

## ODRPO completely by humans

In [11]:
env_name = 'NChain-v0'
env = gym.make(env_name)
file_name = "log_files/NChain-odrpo-complete-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 200
batch_eps = 1
max_steps = 1000
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))

human_recommendation = dict([(0,0),(1,0),(2,0),(3,0),(4,0)])

eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        #--------------- Human writes the advantage table --------------- #
        if human_recommendation[observe] != action: 
            all_advantages[observe][action] = -1
        else:
            all_advantages[observe][action] = 1
        #---------------------------------------------------------------- #
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return 1174.0000003 discounted reward 12.3564113
Episode 2 return 1318.0000003 discounted reward 9.3407113
Episode 3 return 1302.0000003 discounted reward 9.6858933
Episode 4 return 1374.0000003 discounted reward 10.7497673
Episode 5 return 1318.0000003 discounted reward 12.4484383
Episode 6 return 4000.0000003 discounted reward 15.8747443
Episode 7 return 3512.0000003 discounted reward 35.2320963
Episode 8 return 3488.0000003 discounted reward 7.1453843
Episode 9 return 3594.0000003 discounted reward 34.0022393
Episode 10 return 3544.0000003 discounted reward 25.5213083
Episode 11 return 3832.0000003 discounted reward 44.2063843
Episode 12 return 3734.0000003 discounted reward 10.3228143
Episode 13 return 3452.0000003 discounted reward 15.8702733
Episode 14 return 3658.0000003 discounted reward 20.4800013
Episode 15 return 3774.0000003 discounted reward 26.5611483
Episode 16 return 3622.0000003 discounted reward 20.5375033
Episode 17 return 3590.0000003 discounted reward 11.

Episode 138 return 3800.0000003 discounted reward 37.0756603
Episode 139 return 3850.0000003 discounted reward 16.5871643
Episode 140 return 3862.0000003 discounted reward 42.4268363
Episode 141 return 3658.0000003 discounted reward 28.1892513
Episode 142 return 3432.0000003 discounted reward 28.2277193
Episode 143 return 3574.0000003 discounted reward 30.4255743
Episode 144 return 4192.0000003 discounted reward 28.2385793
Episode 145 return 4102.0000003 discounted reward 9.1894113
Episode 146 return 3976.0000003 discounted reward 17.6148213
Episode 147 return 4130.0000003 discounted reward 38.9676013
Episode 148 return 4358.0000003 discounted reward 48.1688823
Episode 149 return 3102.0000003 discounted reward 21.8094103
Episode 150 return 4380.0000003 discounted reward 19.2220083
Episode 151 return 4020.0000003 discounted reward 27.1384263
Episode 152 return 3814.0000003 discounted reward 32.7201143
Episode 153 return 3820.0000003 discounted reward 30.2226743
Episode 154 return 4208.0