In [1]:
import numpy as np
import gym
import time
import csv
import gym_gridworld
import itertools
from collections import defaultdict

# Move to Yellow Room

In [2]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return -52.0000003 discounted reward -9.9582543
Episode 2 return -52.0000003 discounted reward -9.9582543
Episode 3 return -52.0000003 discounted reward -9.9582543
Episode 4 return -52.0000003 discounted reward -9.9582543
Episode 5 return -52.0000003 discounted reward -9.9582543
Episode 6 return -52.0000003 discounted reward -9.9582543
Episode 7 return -52.0000003 discounted reward -9.9582543
Episode 8 return -52.0000003 discounted reward -9.9582543
Episode 9 return -52.0000003 discounted reward -9.9582543
Episode 10 return -52.0000003 discounted reward -9.9582543
Episode 11 return 70.0000003 discounted reward -5.3369733
Episode 12 return -52.0000003 discounted reward -9.9582543
Episode 13 return 78.0000003 discounted reward 0.8324803
Episode 14 return -52.0000003 discounted reward -9.9582543
Episode 15 return 84.0000003 discounted reward 10.3832223
Episode 16 return -52.0000003 discounted reward -9.9582543
Episode 17 

## Human in the Loop 

In [3]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
    
def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        # human modifies the Q
        # left red room
        Q[0][2] += 1
        Q[8][2] += 1
        Q[16][2] += 1
        Q[24][1] += 1
        Q[32][0] += 1
        Q[40][0] += 1
        Q[48][0] += 1
        
        Q[1][2] += 1
        Q[9][2] += 1
        Q[17][2] += 1
        Q[25][1] += 1
        Q[33][0] += 1
        Q[41][0] += 1
        Q[49][0] += 1
        
        
        # middle path 
        Q[26][1] += 1
        
        # middle blue room
        Q[3][2] += 1
        Q[11][2] += 1
        Q[19][2] += 1
        Q[27][1] += 1
        Q[35][0] += 1
        Q[43][0] += 1
        Q[51][0] += 1
        
        Q[4][2] += 1
        Q[12][2] += 1
        Q[20][2] += 1
        Q[28][1] += 1
        Q[36][0] += 1
        Q[44][0] += 1
        Q[52][0] += 1
        
        # middle path 
        Q[29][1] += 1
        
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return 83.0000003 discounted reward 8.3449003
Episode 2 return 88.0000003 discounted reward 21.0672493
Episode 3 return 79.0000003 discounted reward 2.0360893
Episode 4 return 74.0000003 discounted reward -2.8928103
Episode 5 return 94.0000003 discounted reward 48.4585103
Episode 6 return 78.0000003 discounted reward 0.8324803
Episode 7 return 77.0000003 discounted reward -0.2507683
Episode 8 return 87.0000003 discounted reward 17.9605243
Episode 9 return 65.0000003 discounted reward -7.2465293
Episode 10 return 93.0000003 discounted reward 42.6126593
Episode 11 return 91.0000003 discounted reward 32.6162543
Episode 12 return 64.0000003 discounted reward -7.5218763
Episode 13 return 88.0000003 discounted reward 21.0672493
Episode 14 return 84.0000003 discounted reward 10.3832223
Episode 15 return 74.0000003 discounted reward -2.8928103
Episode 16 return 85.0000003 discounted reward 12.6480253
Episode 17 return 77.00000

# Chain

In [11]:
env = gym.make("NChain-v0")
num_actions = 2

file_name = "log_files/NChain-q-learning/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 1000):
    Q = defaultdict(lambda: np.zeros(num_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, num_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 200)

Episode 0 return 1434.0000003 discounted reward 17.4799993
Episode 1 return 1572.0000003 discounted reward 10.3289523
Episode 2 return 1896.0000003 discounted reward 8.6802773
Episode 3 return 1932.0000003 discounted reward 8.1771843
Episode 4 return 1932.0000003 discounted reward 11.3786773
Episode 5 return 1970.0000003 discounted reward 15.3682433
Episode 6 return 2014.0000003 discounted reward 15.1859703
Episode 7 return 2054.0000003 discounted reward 22.0485943
Episode 8 return 1816.0000003 discounted reward 8.9081273
Episode 9 return 1760.0000003 discounted reward 15.3225653
Episode 10 return 2264.0000003 discounted reward 19.4932343
Episode 11 return 1950.0000003 discounted reward 11.5080763
Episode 12 return 1870.0000003 discounted reward 9.0328393
Episode 13 return 2046.0000003 discounted reward 9.1277913
Episode 14 return 2212.0000003 discounted reward 15.8519003
Episode 15 return 1834.0000003 discounted reward 7.8988933
Episode 16 return 2044.0000003 discounted reward 16.4027

Episode 139 return 1754.0000003 discounted reward 14.0185253
Episode 140 return 1830.0000003 discounted reward 7.4079073
Episode 141 return 1654.0000003 discounted reward 7.7297133
Episode 142 return 1840.0000003 discounted reward 12.9167123
Episode 143 return 1924.0000003 discounted reward 10.7482413
Episode 144 return 2042.0000003 discounted reward 10.8162133
Episode 145 return 1868.0000003 discounted reward 8.6349663
Episode 146 return 1680.0000003 discounted reward 10.7374413
Episode 147 return 1874.0000003 discounted reward 10.3041593
Episode 148 return 1962.0000003 discounted reward 11.6478913
Episode 149 return 1618.0000003 discounted reward 7.2275953
Episode 150 return 2024.0000003 discounted reward 13.2498683
Episode 151 return 1792.0000003 discounted reward 19.1674513
Episode 152 return 1988.0000003 discounted reward 7.0768333
Episode 153 return 1890.0000003 discounted reward 25.3025883
Episode 154 return 1790.0000003 discounted reward 12.9170653
Episode 155 return 1980.00000

## Human in the Loop 

In [9]:
env = gym.make("NChain-v0")
num_actions = 2

file_name = "log_files/NChain-q-learning-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 1000):
    Q = defaultdict(lambda: np.zeros(num_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, num_actions)
    env.reset()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
            
        # human modifies Q
        Q[0][0] += 0.1
        Q[1][0] += 0.1
        Q[2][0] += 0.1
        Q[3][0] += 0.1
        Q[4][0] += 0.1
        
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 200)

Episode 0 return 1516.0000003 discounted reward 15.2382153
Episode 1 return 1870.0000003 discounted reward 9.8216293
Episode 2 return 1994.0000003 discounted reward 15.3234733
Episode 3 return 2156.0000003 discounted reward 20.3731763
Episode 4 return 1810.0000003 discounted reward 10.7708423
Episode 5 return 1880.0000003 discounted reward 12.3377823
Episode 6 return 1960.0000003 discounted reward 11.6536653
Episode 7 return 1736.0000003 discounted reward 27.9075693
Episode 8 return 2106.0000003 discounted reward 41.1794553
Episode 9 return 1796.0000003 discounted reward 7.4491933
Episode 10 return 1638.0000003 discounted reward 8.7556063
Episode 11 return 1878.0000003 discounted reward 9.1340083
Episode 12 return 1938.0000003 discounted reward 10.6561583
Episode 13 return 1778.0000003 discounted reward 12.1791103
Episode 14 return 1800.0000003 discounted reward 22.4383613
Episode 15 return 1966.0000003 discounted reward 12.2937123
Episode 16 return 1566.0000003 discounted reward 8.338

Episode 140 return 1560.0000003 discounted reward 10.5432873
Episode 141 return 1766.0000003 discounted reward 13.4748173
Episode 142 return 1696.0000003 discounted reward 13.2116083
Episode 143 return 1900.0000003 discounted reward 11.1766743
Episode 144 return 1924.0000003 discounted reward 18.3996743
Episode 145 return 2428.0000003 discounted reward 12.1405463
Episode 146 return 1770.0000003 discounted reward 14.7411293
Episode 147 return 1736.0000003 discounted reward 26.0159713
Episode 148 return 1874.0000003 discounted reward 23.1774323
Episode 149 return 1934.0000003 discounted reward 7.5619203
Episode 150 return 1702.0000003 discounted reward 18.6313813
Episode 151 return 1964.0000003 discounted reward 13.6064183
Episode 152 return 2140.0000003 discounted reward 8.3448013
Episode 153 return 1658.0000003 discounted reward 8.3508973
Episode 154 return 2024.0000003 discounted reward 25.6271733
Episode 155 return 2060.0000003 discounted reward 9.7501463
Episode 156 return 2152.0000