In [2]:
import numpy as np
import gym
import time
import csv
import gym_gridworld
import itertools
from collections import defaultdict

# 1. Move to Yellow Room

## Q-learning (online) 

In [10]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return -52.0000003 discounted reward -9.9582543
Episode 2 return -52.0000003 discounted reward -9.9582543
Episode 3 return -52.0000003 discounted reward -9.9582543
Episode 4 return -52.0000003 discounted reward -9.9582543
Episode 5 return 73.0000003 discounted reward -3.6035293
Episode 6 return -52.0000003 discounted reward -9.9582543
Episode 7 return -52.0000003 discounted reward -9.9582543
Episode 8 return -52.0000003 discounted reward -9.9582543
Episode 9 return -52.0000003 discounted reward -9.9582543
Episode 10 return -52.0000003 discounted reward -9.9582543
Episode 11 return -52.0000003 discounted reward -9.9582543
Episode 12 return 50.0000003 discounted reward -9.4330853
Episode 13 return -52.0000003 discounted reward -9.9582543
Episode 14 return -52.0000003 discounted reward -9.9582543
Episode 15 return -52.0000003 discounted reward -9.9582543
Episode 16 return 88.0000003 discounted reward 21.0672493
Episode 17

## Q-learning (online) + online human interaction

In [25]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning-online-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

human_recommendation = dict([(0,2),(8,2),(16,2),(24,1),(32,0),(40,0),(48,0),(1,2),(9,2),(17,2),(25,1),(33,0),(41,0),(49,0),\
                            (26,1),(3,2),(11,2),(19,2),(27,1),(35,0),(43,0),(51,0),(4,2),(12,2),(20,2),(28,1),(36,0),(44,0),(52,0),(29,1)])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            
            #--------------- Human in the loop --------------- #
            if human_recommendation[observation] != a: 
                Q[observation][a] -= 1
            else:
                Q[observation][a] += 1
            #------------------------------------------------- #
            
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return 85.0000003 discounted reward 12.6480253
Episode 2 return 73.0000003 discounted reward -3.6035293
Episode 3 return 85.0000003 discounted reward 12.6480253
Episode 4 return 82.0000003 discounted reward 6.5104103
Episode 5 return 90.0000003 discounted reward 28.3546283
Episode 6 return 70.0000003 discounted reward -5.3369733
Episode 7 return 95.0000003 discounted reward 54.9539003
Episode 8 return 88.0000003 discounted reward 21.0672493
Episode 9 return 92.0000003 discounted reward 37.3513933
Episode 10 return 93.0000003 discounted reward 42.6126593
Episode 11 return 92.0000003 discounted reward 37.3513933
Episode 12 return 92.0000003 discounted reward 37.3513933
Episode 13 return 85.0000003 discounted reward 12.6480253
Episode 14 return 79.0000003 discounted reward 2.0360893
Episode 15 return 90.0000003 discounted reward 28.3546283
Episode 16 return 95.0000003 discounted reward 54.9539003
Episode 17 return 92.0000

## Q-learning (online) + offline human interaction

In [14]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning-offline-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
    
def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
            
        # human modifies the Q
        # left red room
        Q[0][2] += 1
        Q[8][2] += 1
        Q[16][2] += 1
        Q[24][1] += 1
        Q[32][0] += 1
        Q[40][0] += 1
        Q[48][0] += 1
        
        Q[1][2] += 1
        Q[9][2] += 1
        Q[17][2] += 1
        Q[25][1] += 1
        Q[33][0] += 1
        Q[41][0] += 1
        Q[49][0] += 1
        
        
        # middle path 
        Q[26][1] += 1
        
        # middle blue room
        Q[3][2] += 1
        Q[11][2] += 1
        Q[19][2] += 1
        Q[27][1] += 1
        Q[35][0] += 1
        Q[43][0] += 1
        Q[51][0] += 1
        
        Q[4][2] += 1
        Q[12][2] += 1
        Q[20][2] += 1
        Q[28][1] += 1
        Q[36][0] += 1
        Q[44][0] += 1
        Q[52][0] += 1
        
        # middle path 
        Q[29][1] += 1
        
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return 85.0000003 discounted reward 12.6480253
Episode 2 return 76.0000003 discounted reward -1.2256913
Episode 3 return 92.0000003 discounted reward 37.3513933
Episode 4 return 88.0000003 discounted reward 21.0672493
Episode 5 return 95.0000003 discounted reward 54.9539003
Episode 6 return 86.0000003 discounted reward 15.1644723
Episode 7 return 95.0000003 discounted reward 54.9539003
Episode 8 return 87.0000003 discounted reward 17.9605243
Episode 9 return 90.0000003 discounted reward 28.3546283
Episode 10 return 92.0000003 discounted reward 37.3513933
Episode 11 return 82.0000003 discounted reward 6.5104103
Episode 12 return 93.0000003 discounted reward 42.6126593
Episode 13 return 91.0000003 discounted reward 32.6162543
Episode 14 return 93.0000003 discounted reward 42.6126593
Episode 15 return 92.0000003 discounted reward 37.3513933
Episode 16 return 92.0000003 discounted reward 37.3513933
Episode 17 return 93.000

# 2. Chain

## Q-learning (online) 

In [12]:
env = gym.make("NChain-v0")
num_actions = 2

file_name = "log_files/NChain-q-learning/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 1000):
    Q = defaultdict(lambda: np.zeros(num_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, num_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 200)

Episode 0 return 1450.0000003 discounted reward 17.0373573
Episode 1 return 1704.0000003 discounted reward 11.6211963
Episode 2 return 1860.0000003 discounted reward 14.5074633
Episode 3 return 2046.0000003 discounted reward 22.2174263
Episode 4 return 2054.0000003 discounted reward 13.5237953
Episode 5 return 1906.0000003 discounted reward 7.9870883
Episode 6 return 1738.0000003 discounted reward 27.2632553
Episode 7 return 1802.0000003 discounted reward 12.9249403
Episode 8 return 1816.0000003 discounted reward 13.2728243
Episode 9 return 1776.0000003 discounted reward 8.7690083
Episode 10 return 1668.0000003 discounted reward 12.9908573
Episode 11 return 1838.0000003 discounted reward 13.5494533
Episode 12 return 2064.0000003 discounted reward 13.5653653
Episode 13 return 1780.0000003 discounted reward 12.8955713
Episode 14 return 1792.0000003 discounted reward 15.7660853
Episode 15 return 1958.0000003 discounted reward 22.8282343
Episode 16 return 1884.0000003 discounted reward 12.

Episode 138 return 1812.0000003 discounted reward 11.3481933
Episode 139 return 1794.0000003 discounted reward 8.1444703
Episode 140 return 2220.0000003 discounted reward 20.7823313
Episode 141 return 1548.0000003 discounted reward 22.4362303
Episode 142 return 1916.0000003 discounted reward 13.3935003
Episode 143 return 1652.0000003 discounted reward 8.6088413
Episode 144 return 1956.0000003 discounted reward 9.5957353
Episode 145 return 1914.0000003 discounted reward 9.7891223
Episode 146 return 1810.0000003 discounted reward 14.2324513
Episode 147 return 1954.0000003 discounted reward 11.6108353
Episode 148 return 1800.0000003 discounted reward 15.5099233
Episode 149 return 1998.0000003 discounted reward 9.4545103
Episode 150 return 1720.0000003 discounted reward 14.7760373
Episode 151 return 1718.0000003 discounted reward 10.0530383
Episode 152 return 1664.0000003 discounted reward 19.7354063
Episode 153 return 1750.0000003 discounted reward 11.7021343
Episode 154 return 1548.00000

## Q-learning (online) + online human interaction

In [22]:
env = gym.make("NChain-v0")
num_actions = 2

file_name = "log_files/NChain-q-learning-online-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

human_recommendation = dict([(0,0),(1,0),(2,0),(3,0),(4,0)])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 1000):
    Q = defaultdict(lambda: np.zeros(num_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, num_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            
            #--------------- Human in the loop --------------- #
            if human_recommendation[observation] != a: 
                Q[observation][a] -= 1
            else:
                Q[observation][a] += 1
            #------------------------------------------------- #
            
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 200)

Episode 0 return 1906.0000003 discounted reward 12.2633733
Episode 1 return 2192.0000003 discounted reward 12.9228853
Episode 2 return 1750.0000003 discounted reward 24.7576773
Episode 3 return 1658.0000003 discounted reward 7.4807083
Episode 4 return 1864.0000003 discounted reward 7.5849263
Episode 5 return 1874.0000003 discounted reward 19.5005083
Episode 6 return 2058.0000003 discounted reward 12.7689953
Episode 7 return 1810.0000003 discounted reward 13.4749363
Episode 8 return 1836.0000003 discounted reward 25.0055803
Episode 9 return 1838.0000003 discounted reward 14.5670813
Episode 10 return 1966.0000003 discounted reward 11.5246503
Episode 11 return 1742.0000003 discounted reward 10.3355293
Episode 12 return 1672.0000003 discounted reward 13.7137913
Episode 13 return 1854.0000003 discounted reward 11.4427023
Episode 14 return 1798.0000003 discounted reward 12.3605373
Episode 15 return 1758.0000003 discounted reward 8.5150763
Episode 16 return 1902.0000003 discounted reward 28.2

Episode 141 return 1764.0000003 discounted reward 15.1960123
Episode 142 return 2210.0000003 discounted reward 19.1975293
Episode 143 return 1932.0000003 discounted reward 7.2800243
Episode 144 return 1736.0000003 discounted reward 13.9374083
Episode 145 return 2040.0000003 discounted reward 10.1708773
Episode 146 return 1754.0000003 discounted reward 17.2994073
Episode 147 return 1718.0000003 discounted reward 26.1705923
Episode 148 return 1962.0000003 discounted reward 14.4382453
Episode 149 return 1630.0000003 discounted reward 10.9506553
Episode 150 return 1848.0000003 discounted reward 9.4881003
Episode 151 return 2036.0000003 discounted reward 12.2491993
Episode 152 return 1748.0000003 discounted reward 15.8822463
Episode 153 return 2022.0000003 discounted reward 26.4821923
Episode 154 return 1698.0000003 discounted reward 12.1306833
Episode 155 return 1966.0000003 discounted reward 36.1853913
Episode 156 return 1912.0000003 discounted reward 9.4485943
Episode 157 return 1536.000

## Q-learning (online) + offline human interaction

In [9]:
env = gym.make("NChain-v0")
num_actions = 2

file_name = "log_files/NChain-q-learning-offline-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 1000):
    Q = defaultdict(lambda: np.zeros(num_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, num_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
            
        # human modifies Q
        Q[0][0] += 0.1
        Q[1][0] += 0.1
        Q[2][0] += 0.1
        Q[3][0] += 0.1
        Q[4][0] += 0.1
        
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 200)

Episode 0 return 1454.0000003 discounted reward 11.1680643
Episode 1 return 1686.0000003 discounted reward 22.0647653
Episode 2 return 1998.0000003 discounted reward 16.1563243
Episode 3 return 1684.0000003 discounted reward 10.0673613
Episode 4 return 1716.0000003 discounted reward 7.4343733
Episode 5 return 1782.0000003 discounted reward 22.2644223
Episode 6 return 1920.0000003 discounted reward 21.1456623
Episode 7 return 2094.0000003 discounted reward 13.8939823
Episode 8 return 2122.0000003 discounted reward 22.9519753
Episode 9 return 1550.0000003 discounted reward 7.0853743
Episode 10 return 1848.0000003 discounted reward 24.8896833
Episode 11 return 1862.0000003 discounted reward 15.5312153
Episode 12 return 1486.0000003 discounted reward 13.9285403
Episode 13 return 1756.0000003 discounted reward 29.1171583
Episode 14 return 2044.0000003 discounted reward 13.3825243
Episode 15 return 1994.0000003 discounted reward 12.3265723
Episode 16 return 1772.0000003 discounted reward 10.

Episode 138 return 1936.0000003 discounted reward 14.7488063
Episode 139 return 1882.0000003 discounted reward 7.4128723
Episode 140 return 1886.0000003 discounted reward 11.4244173
Episode 141 return 1854.0000003 discounted reward 12.4375043
Episode 142 return 1906.0000003 discounted reward 17.1917513
Episode 143 return 1816.0000003 discounted reward 8.0876653
Episode 144 return 1734.0000003 discounted reward 10.0271463
Episode 145 return 1940.0000003 discounted reward 19.6891583
Episode 146 return 2000.0000003 discounted reward 31.1867233
Episode 147 return 1588.0000003 discounted reward 11.5051633
Episode 148 return 1748.0000003 discounted reward 17.8314133
Episode 149 return 1918.0000003 discounted reward 6.2882423
Episode 150 return 1794.0000003 discounted reward 14.6163673
Episode 151 return 1976.0000003 discounted reward 27.6193553
Episode 152 return 2054.0000003 discounted reward 9.2033843
Episode 153 return 1548.0000003 discounted reward 14.8948303
Episode 154 return 1870.0000