In [1]:
import numpy as np
import gym
import time
import csv
import gym_gridworld
import itertools
from collections import defaultdict

# Move to Yellow Room

In [2]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return -52.0000003 discounted reward -9.9582543
Episode 2 return -52.0000003 discounted reward -9.9582543
Episode 3 return -52.0000003 discounted reward -9.9582543
Episode 4 return -52.0000003 discounted reward -9.9582543
Episode 5 return -52.0000003 discounted reward -9.9582543
Episode 6 return -52.0000003 discounted reward -9.9582543
Episode 7 return -52.0000003 discounted reward -9.9582543
Episode 8 return -52.0000003 discounted reward -9.9582543
Episode 9 return -52.0000003 discounted reward -9.9582543
Episode 10 return -52.0000003 discounted reward -9.9582543
Episode 11 return 70.0000003 discounted reward -5.3369733
Episode 12 return -52.0000003 discounted reward -9.9582543
Episode 13 return 78.0000003 discounted reward 0.8324803
Episode 14 return -52.0000003 discounted reward -9.9582543
Episode 15 return 84.0000003 discounted reward 10.3832223
Episode 16 return -52.0000003 discounted reward -9.9582543
Episode 17 

## Human in the Loop 

In [3]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
    
def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        # human modifies the Q
        # left red room
        Q[0][2] += 1
        Q[8][2] += 1
        Q[16][2] += 1
        Q[24][1] += 1
        Q[32][0] += 1
        Q[40][0] += 1
        Q[48][0] += 1
        
        Q[1][2] += 1
        Q[9][2] += 1
        Q[17][2] += 1
        Q[25][1] += 1
        Q[33][0] += 1
        Q[41][0] += 1
        Q[49][0] += 1
        
        
        # middle path 
        Q[26][1] += 1
        
        # middle blue room
        Q[3][2] += 1
        Q[11][2] += 1
        Q[19][2] += 1
        Q[27][1] += 1
        Q[35][0] += 1
        Q[43][0] += 1
        Q[51][0] += 1
        
        Q[4][2] += 1
        Q[12][2] += 1
        Q[20][2] += 1
        Q[28][1] += 1
        Q[36][0] += 1
        Q[44][0] += 1
        Q[52][0] += 1
        
        # middle path 
        Q[29][1] += 1
        
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return 83.0000003 discounted reward 8.3449003
Episode 2 return 88.0000003 discounted reward 21.0672493
Episode 3 return 79.0000003 discounted reward 2.0360893
Episode 4 return 74.0000003 discounted reward -2.8928103
Episode 5 return 94.0000003 discounted reward 48.4585103
Episode 6 return 78.0000003 discounted reward 0.8324803
Episode 7 return 77.0000003 discounted reward -0.2507683
Episode 8 return 87.0000003 discounted reward 17.9605243
Episode 9 return 65.0000003 discounted reward -7.2465293
Episode 10 return 93.0000003 discounted reward 42.6126593
Episode 11 return 91.0000003 discounted reward 32.6162543
Episode 12 return 64.0000003 discounted reward -7.5218763
Episode 13 return 88.0000003 discounted reward 21.0672493
Episode 14 return 84.0000003 discounted reward 10.3832223
Episode 15 return 74.0000003 discounted reward -2.8928103
Episode 16 return 85.0000003 discounted reward 12.6480253
Episode 17 return 77.00000

# Chain

In [7]:
env = gym.make("NChain-v0")
num_actions = 2

file_name = "log_files/NChain-q-learning/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 1000):
    Q = defaultdict(lambda: np.zeros(num_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, num_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 200)

Episode 0 return 1618.0000003 discounted reward 20.6467793
Episode 1 return 1664.0000003 discounted reward 13.4598533
Episode 2 return 1822.0000003 discounted reward 18.7914663
Episode 3 return 1760.0000003 discounted reward 15.8270593
Episode 4 return 1444.0000003 discounted reward 9.6674923
Episode 5 return 1866.0000003 discounted reward 15.5843763
Episode 6 return 1874.0000003 discounted reward 16.3389123
Episode 7 return 1994.0000003 discounted reward 9.5106893
Episode 8 return 1784.0000003 discounted reward 8.1861563
Episode 9 return 1986.0000003 discounted reward 11.4517933
Episode 10 return 1736.0000003 discounted reward 7.0121343
Episode 11 return 1786.0000003 discounted reward 10.8719253
Episode 12 return 1712.0000003 discounted reward 20.0499353
Episode 13 return 1700.0000003 discounted reward 21.8366203
Episode 14 return 2078.0000003 discounted reward 14.0458063
Episode 15 return 1734.0000003 discounted reward 17.3067913
Episode 16 return 1882.0000003 discounted reward 9.678

Episode 137 return 1890.0000003 discounted reward 9.7885633
Episode 138 return 1512.0000003 discounted reward 10.0272743
Episode 139 return 1706.0000003 discounted reward 15.5729323
Episode 140 return 1650.0000003 discounted reward 15.6557613
Episode 141 return 1776.0000003 discounted reward 12.2141543
Episode 142 return 2040.0000003 discounted reward 10.6549063
Episode 143 return 1810.0000003 discounted reward 30.6608883
Episode 144 return 1760.0000003 discounted reward 17.8287453
Episode 145 return 2080.0000003 discounted reward 26.1084833
Episode 146 return 2246.0000003 discounted reward 25.3702673
Episode 147 return 1936.0000003 discounted reward 12.5211453
Episode 148 return 2070.0000003 discounted reward 9.3269203
Episode 149 return 1670.0000003 discounted reward 13.6379373
Episode 150 return 1678.0000003 discounted reward 12.0659153
Episode 151 return 1796.0000003 discounted reward 22.2539763
Episode 152 return 2014.0000003 discounted reward 16.2122733
Episode 153 return 1978.00

## Human in the Loop 

In [6]:
env = gym.make("NChain-v0")
num_actions = 2

file_name = "log_files/NChain-q-learning-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 1000):
    Q = defaultdict(lambda: np.zeros(num_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, num_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
            
        # human modifies Q
        Q[0][0] += 0.1
        Q[1][0] += 0.1
        Q[2][0] += 0.1
        Q[3][0] += 0.1
        Q[4][0] += 0.1
        
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 200)

Episode 0 return 1440.0000003 discounted reward 11.4550733
Episode 1 return 1622.0000003 discounted reward 14.2389633
Episode 2 return 1802.0000003 discounted reward 35.0993303
Episode 3 return 2168.0000003 discounted reward 18.7066983
Episode 4 return 1666.0000003 discounted reward 11.8945273
Episode 5 return 1870.0000003 discounted reward 23.1421353
Episode 6 return 1780.0000003 discounted reward 22.3740233
Episode 7 return 1868.0000003 discounted reward 22.6883463
Episode 8 return 1870.0000003 discounted reward 15.8247103
Episode 9 return 1974.0000003 discounted reward 11.4946593
Episode 10 return 1760.0000003 discounted reward 15.2294253
Episode 11 return 1812.0000003 discounted reward 10.8617993
Episode 12 return 1758.0000003 discounted reward 11.0056623
Episode 13 return 1796.0000003 discounted reward 13.5323953
Episode 14 return 1756.0000003 discounted reward 10.1697363
Episode 15 return 1810.0000003 discounted reward 8.9477813
Episode 16 return 1794.0000003 discounted reward 20

Episode 137 return 2026.0000003 discounted reward 10.7710573
Episode 138 return 1712.0000003 discounted reward 15.5175963
Episode 139 return 1820.0000003 discounted reward 11.4959383
Episode 140 return 1666.0000003 discounted reward 17.6749923
Episode 141 return 2002.0000003 discounted reward 16.4980063
Episode 142 return 1768.0000003 discounted reward 11.2939313
Episode 143 return 1928.0000003 discounted reward 29.2109773
Episode 144 return 1834.0000003 discounted reward 18.0576773
Episode 145 return 1886.0000003 discounted reward 17.5897083
Episode 146 return 2026.0000003 discounted reward 33.6529203
Episode 147 return 1664.0000003 discounted reward 27.5632123
Episode 148 return 1736.0000003 discounted reward 7.7182843
Episode 149 return 2108.0000003 discounted reward 10.7945253
Episode 150 return 1992.0000003 discounted reward 7.8252203
Episode 151 return 1722.0000003 discounted reward 18.3134163
Episode 152 return 1604.0000003 discounted reward 16.8708093
Episode 153 return 1586.00