In [2]:
import gym
import gym_gridworld
import time
import csv

# DR TRPO related files
from grid_train_helper import *
from value import NNValueFunction
from utils import Logger
from grid_dr_policy import DRPolicyKL, DRPolicyWass

# 1. Move to Yellow Room

## ODRPO KL (online)

In [73]:
env_name = "GridWorld-v0"
env = gym.make(env_name)
file_name = "log_files/GridWorld-odrpo/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()
    
sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyKL(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 100
batch_eps = 1
max_steps = 50 # max steps per episode
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.2


eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return -51.0000003 discounted reward -9.9536163
Episode 2 return -51.0000003 discounted reward -9.9536163
Episode 3 return -51.0000003 discounted reward -9.9536163
Episode 4 return -51.0000003 discounted reward -9.9536163
Episode 5 return -51.0000003 discounted reward -9.9536163
Episode 6 return -51.0000003 discounted reward -9.9536163
Episode 7 return -51.0000003 discounted reward -9.9536163
Episode 8 return -51.0000003 discounted reward -9.9536163
Episode 9 return -51.0000003 discounted reward -9.9536163
Episode 10 return 73.0000003 discounted reward -3.6035293
Episode 11 return -51.0000003 discounted reward -9.9536163
Episode 12 return 91.0000003 discounted reward 32.6162543
Episode 13 return 95.0000003 discounted reward 54.9539003
Episode 14 return 93.0000003 discounted reward 42.6126593
Episode 15 return 95.0000003 discounted reward 54.9539003
Episode 16 return 95.0000003 discounted reward 54.9539003
Episode 17 return 95.0000003 discounted reward 54.9539003
Episode 18 re

## ODRPO KL (online) + online human interaction 

In [55]:
env_name = "GridWorld-v0"
env = gym.make(env_name)
file_name = "log_files/GridWorld-odrpo-online-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyKL(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 100
batch_eps = 1
max_steps = 50 # max steps per episode
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.2

human_recommendation = dict([(0,2),(8,2),(16,2),(24,1),(32,0),(40,0),(48,0),(1,2),(9,2),(17,2),(25,1),(33,0),(41,0),(49,0),\
                            (26,1),(3,2),(11,2),(19,2),(27,1),(35,0),(43,0),(51,0),(4,2),(12,2),(20,2),(28,1),(36,0),(44,0),(52,0),(29,1)])
        
eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            #--------------- Human in the loop --------------- #
            if human_recommendation[observe] != action: 
                all_advantages[observe][action] -= 1
            else:
                all_advantages[observe][action] += 1
            #------------------------------------------------- #
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return -51.0000003 discounted reward -9.9536163
Episode 2 return 61.0000003 discounted reward -8.1934483
Episode 3 return -51.0000003 discounted reward -9.9536163
Episode 4 return -51.0000003 discounted reward -9.9536163
Episode 5 return -51.0000003 discounted reward -9.9536163
Episode 6 return -51.0000003 discounted reward -9.9536163
Episode 7 return 77.0000003 discounted reward -0.2507683
Episode 8 return 95.0000003 discounted reward 54.9539003
Episode 9 return 95.0000003 discounted reward 54.9539003
Episode 10 return 95.0000003 discounted reward 54.9539003
Episode 11 return 95.0000003 discounted reward 54.9539003
Episode 12 return 95.0000003 discounted reward 54.9539003
Episode 13 return 95.0000003 discounted reward 54.9539003
Episode 14 return 95.0000003 discounted reward 54.9539003
Episode 15 return 95.0000003 discounted reward 54.9539003
Episode 16 return 95.0000003 discounted reward 54.9539003
Episode 17 return 95.0000003 discounted reward 54.9539003
Episode 18 return 

## ODRPO KL (online) + offline human interaction 

In [57]:
env_name = "GridWorld-v0"
env = gym.make(env_name)
file_name = "log_files/GridWorld-odrpo-offline-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyKL(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 100
batch_eps = 1
max_steps = 50 # max steps per episode
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.2


eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate 
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate 
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
        
    # human modifies the advantage
    # left red room
    all_advantages[0][2] += 1
    all_advantages[8][2] += 1
    all_advantages[16][2] += 1
    all_advantages[24][1] += 1
    all_advantages[32][0] += 1
    all_advantages[40][0] += 1
    all_advantages[48][0] += 1
        
    all_advantages[1][2] += 1
    all_advantages[9][2] += 1
    all_advantages[17][2] += 1
    all_advantages[25][1] += 1
    all_advantages[33][0] += 1
    all_advantages[41][0] += 1
    all_advantages[49][0] += 1
        
        
    # middle path 
    all_advantages[26][1] += 1
        
    # middle blue room
    all_advantages[3][2] += 1
    all_advantages[11][2] += 1
    all_advantages[19][2] += 1
    all_advantages[27][1] += 1
    all_advantages[35][0] += 1
    all_advantages[43][0] += 1
    all_advantages[51][0] += 1
        
    all_advantages[4][2] += 1
    all_advantages[12][2] += 1
    all_advantages[20][2] += 1
    all_advantages[28][1] += 1
    all_advantages[36][0] += 1
    all_advantages[44][0] += 1
    all_advantages[52][0] += 1
        
    # middle path 
    all_advantages[29][1] += 1
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return -51.0000003 discounted reward -9.9536163
Episode 2 return -51.0000003 discounted reward -9.9536163
Episode 3 return -51.0000003 discounted reward -9.9536163
Episode 4 return -51.0000003 discounted reward -9.9536163
Episode 5 return -51.0000003 discounted reward -9.9536163
Episode 6 return 92.0000003 discounted reward 37.3513933
Episode 7 return 95.0000003 discounted reward 54.9539003
Episode 8 return 95.0000003 discounted reward 54.9539003
Episode 9 return 95.0000003 discounted reward 54.9539003
Episode 10 return 95.0000003 discounted reward 54.9539003
Episode 11 return 95.0000003 discounted reward 54.9539003
Episode 12 return 95.0000003 discounted reward 54.9539003
Episode 13 return 95.0000003 discounted reward 54.9539003
Episode 14 return 95.0000003 discounted reward 54.9539003
Episode 15 return 95.0000003 discounted reward 54.9539003
Episode 16 return 95.0000003 discounted reward 54.9539003
Episode 17 return 95.0000003 discounted reward 54.9539003
Episode 18 return 

# Chain

## ODRPO (online)

In [47]:
env_name = 'NChain-v0'
env = gym.make(env_name)
file_name = "log_files/NChain-odrpo/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 200
batch_eps = 1
max_steps = 1000
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.05


eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate 
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate 
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))
print(all_advantages)
print(policy.get_policy())

Episode 1 return 1174.0000003 discounted reward 14.0651403
Episode 2 return 1390.0000003 discounted reward 14.2740993
Episode 3 return 1276.0000003 discounted reward 11.6945903
Episode 4 return 1346.0000003 discounted reward 13.7001683
Episode 5 return 1282.0000003 discounted reward 9.5815123
Episode 6 return 1852.0000003 discounted reward 15.9635843
Episode 7 return 1788.0000003 discounted reward 16.7413803
Episode 8 return 1848.0000003 discounted reward 15.1362373
Episode 9 return 1666.0000003 discounted reward 23.4010663
Episode 10 return 1832.0000003 discounted reward 14.1694293
Episode 11 return 2026.0000003 discounted reward 18.0635833
Episode 12 return 1936.0000003 discounted reward 15.1619543
Episode 13 return 2012.0000003 discounted reward 18.9144883
Episode 14 return 2176.0000003 discounted reward 18.5112483
Episode 15 return 1772.0000003 discounted reward 17.4293403
Episode 16 return 2108.0000003 discounted reward 17.4750583
Episode 17 return 1926.0000003 discounted reward 2

Episode 138 return 3078.0000003 discounted reward 30.6741623
Episode 139 return 3348.0000003 discounted reward 17.4507633
Episode 140 return 2898.0000003 discounted reward 47.3585933
Episode 141 return 3200.0000003 discounted reward 20.1274293
Episode 142 return 3200.0000003 discounted reward 18.8372133
Episode 143 return 3104.0000003 discounted reward 19.4762333
Episode 144 return 2918.0000003 discounted reward 32.5015163
Episode 145 return 2652.0000003 discounted reward 28.1397443
Episode 146 return 3106.0000003 discounted reward 15.8942373
Episode 147 return 2904.0000003 discounted reward 21.1143033
Episode 148 return 3150.0000003 discounted reward 17.3923473
Episode 149 return 3194.0000003 discounted reward 27.0967593
Episode 150 return 3218.0000003 discounted reward 17.7627973
Episode 151 return 2894.0000003 discounted reward 19.0300373
Episode 152 return 2856.0000003 discounted reward 15.3206193
Episode 153 return 3244.0000003 discounted reward 38.3822963
Episode 154 return 2942.

## ODRPO (online) + online human interaction

In [51]:
env_name = 'NChain-v0'
env = gym.make(env_name)
file_name = "log_files/NChain-odrpo-online-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 200
batch_eps = 1
max_steps = 1000
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.05

human_recommendation = dict([(0,0),(1,0),(2,0),(3,0),(4,0)])

eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate 
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate 
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            #--------------- Human in the loop --------------- #
            if human_recommendation[observe] != action: 
                all_advantages[observe][action] -= 0.1
            else:
                all_advantages[observe][action] += 0.1
            #------------------------------------------------- #
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return 1482.0000003 discounted reward 13.2704933
Episode 2 return 1176.0000003 discounted reward 9.6053123
Episode 3 return 1390.0000003 discounted reward 15.0094063
Episode 4 return 1268.0000003 discounted reward 12.1881123
Episode 5 return 1436.0000003 discounted reward 13.7401513
Episode 6 return 3460.0000003 discounted reward 42.4595313
Episode 7 return 4004.0000003 discounted reward 14.2123713
Episode 8 return 3416.0000003 discounted reward 33.5322783
Episode 9 return 3618.0000003 discounted reward 29.3253823
Episode 10 return 3290.0000003 discounted reward 23.5436953
Episode 11 return 3904.0000003 discounted reward 11.3509533
Episode 12 return 3586.0000003 discounted reward 8.0207883
Episode 13 return 3222.0000003 discounted reward 15.7648853
Episode 14 return 3662.0000003 discounted reward 24.0952193
Episode 15 return 3386.0000003 discounted reward 11.6965453
Episode 16 return 3556.0000003 discounted reward 10.0271093
Episode 17 return 3414.0000003 discounted reward 32

Episode 138 return 4274.0000003 discounted reward 56.8401143
Episode 139 return 3894.0000003 discounted reward 63.9801873
Episode 140 return 3376.0000003 discounted reward 13.9344653
Episode 141 return 3100.0000003 discounted reward 11.8269763
Episode 142 return 4016.0000003 discounted reward 17.6920093
Episode 143 return 3126.0000003 discounted reward 11.2599633
Episode 144 return 4050.0000003 discounted reward 8.7559493
Episode 145 return 3732.0000003 discounted reward 27.1779753
Episode 146 return 3636.0000003 discounted reward 54.3151963
Episode 147 return 3472.0000003 discounted reward 40.2058363
Episode 148 return 3438.0000003 discounted reward 24.4493723
Episode 149 return 3212.0000003 discounted reward 22.9611953
Episode 150 return 3458.0000003 discounted reward 13.6181903
Episode 151 return 3716.0000003 discounted reward 19.5413353
Episode 152 return 3798.0000003 discounted reward 34.3956563
Episode 153 return 3456.0000003 discounted reward 21.4407373
Episode 154 return 3252.0

## ODRPO (online) + offline human interaction

In [50]:
env_name = 'NChain-v0'
env = gym.make(env_name)
file_name = "log_files/NChain-odrpo-offline-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
start = time.time()

sta_num = env.observation_space.n
act_num = env.action_space.n
policy = DRPolicyWass(sta_num, act_num)
gamma = 0.9
lam = 1
total_eps = 200
batch_eps = 1
max_steps = 1000
all_advantages = []
for i in range(sta_num):
    all_advantages.append(np.zeros(act_num))
all_values = np.zeros(sta_num)
learning_rate = 0.05


eps = 0
while eps < total_eps:
    steps = 0
    obs = env.reset()
    total_reward = 0
    discounted_reward = 0
    done = False
    while steps <= max_steps and not done: 
        observe, action, reward, next_observe, done = run_step(env, obs, policy)
        value_observe = all_values[observe]
        value_next_observe = all_values[next_observe]
        # update advantage
        all_advantages[observe][action] = all_advantages[observe][action]*(1-learning_rate) + (reward + gamma*value_next_observe - value_observe)*learning_rate 
        # update value
        all_values[observe] = all_values[observe]*(1-learning_rate) + (reward + gamma*value_next_observe)*learning_rate 
        # calculate total and discounted rewards
        total_reward += reward
        discounted_reward += (gamma**steps)*reward
        # update policy
        if eps >= 5:
            policy.update(all_advantages, env_name)
        steps += 1
        obs = next_observe
    eps += 1
    runtime = time.time() - start
    with open(file_name, 'a') as outfile:
        writer = csv.writer(outfile, delimiter=",")
        writer.writerow((str(total_reward), str(eps), str(runtime)))
        
    # human modifies the advantage
    all_advantages[0][0] += 0.1
    all_advantages[1][0] += 0.1
    all_advantages[2][0] += 0.1
    all_advantages[3][0] += 0.1
    all_advantages[4][0] += 0.1
    print('Episode %d return %f3 discounted reward %f3' %(eps, total_reward, discounted_reward))

Episode 1 return 1190.0000003 discounted reward 11.4357993
Episode 2 return 1466.0000003 discounted reward 13.0208193
Episode 3 return 1322.0000003 discounted reward 8.5814073
Episode 4 return 1260.0000003 discounted reward 9.3045713
Episode 5 return 1394.0000003 discounted reward 11.8486923
Episode 6 return 2194.0000003 discounted reward 16.5420083
Episode 7 return 2138.0000003 discounted reward 16.0481603
Episode 8 return 2032.0000003 discounted reward 13.4210633
Episode 9 return 3170.0000003 discounted reward 21.3629753
Episode 10 return 3208.0000003 discounted reward 37.8912963
Episode 11 return 3214.0000003 discounted reward 21.7396103
Episode 12 return 3010.0000003 discounted reward 22.0163173
Episode 13 return 3126.0000003 discounted reward 21.6623123
Episode 14 return 3412.0000003 discounted reward 16.8538013
Episode 15 return 3846.0000003 discounted reward 19.0900683
Episode 16 return 3646.0000003 discounted reward 23.5003203
Episode 17 return 3582.0000003 discounted reward 42

Episode 138 return 3374.0000003 discounted reward 18.1566443
Episode 139 return 3562.0000003 discounted reward 62.8372213
Episode 140 return 3384.0000003 discounted reward 29.4159933
Episode 141 return 3634.0000003 discounted reward 24.2437293
Episode 142 return 3884.0000003 discounted reward 21.0375873
Episode 143 return 3668.0000003 discounted reward 14.6371293
Episode 144 return 3598.0000003 discounted reward 28.8574253
Episode 145 return 3246.0000003 discounted reward 36.7610593
Episode 146 return 3884.0000003 discounted reward 41.7773043
Episode 147 return 3460.0000003 discounted reward 20.3706863
Episode 148 return 3460.0000003 discounted reward 22.0905703
Episode 149 return 3108.0000003 discounted reward 41.3440603
Episode 150 return 3532.0000003 discounted reward 28.9042753
Episode 151 return 3224.0000003 discounted reward 26.5687573
Episode 152 return 3578.0000003 discounted reward 12.9340383
Episode 153 return 3780.0000003 discounted reward 10.0720233
Episode 154 return 3522.