In [1]:
import numpy as np
import gym
import time
import csv
import gym_gridworld
import itertools
from collections import defaultdict

In [4]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    env.render()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return -52.0000003 discounted reward -9.9582543
Episode 2 return -52.0000003 discounted reward -9.9582543
Episode 3 return -52.0000003 discounted reward -9.9582543
Episode 4 return -52.0000003 discounted reward -9.9582543
Episode 5 return 51.0000003 discounted reward -9.3700943
Episode 6 return -52.0000003 discounted reward -9.9582543
Episode 7 return -52.0000003 discounted reward -9.9582543
Episode 8 return -52.0000003 discounted reward -9.9582543
Episode 9 return -52.0000003 discounted reward -9.9582543
Episode 10 return -52.0000003 discounted reward -9.9582543
Episode 11 return 86.0000003 discounted reward 15.1644723
Episode 12 return 50.0000003 discounted reward -9.4330853
Episode 13 return 75.0000003 discounted reward -2.1031223
Episode 14 return 69.0000003 discounted reward -5.8032753
Episode 15 return 64.0000003 discounted reward -7.5218763
Episode 16 return 95.0000003 discounted reward 54.9539003
Episode 17 ret

## Human in the Loop 

In [5]:
env = gym.make("GridWorld-v0")
file_name = "log_files/GridWorld-q-learning-human/" + str(time.time()) + ".csv"
with open(file_name, 'w+') as outfile:
    writer = csv.writer(outfile, delimiter=",")
    writer.writerow(["r", "l", "t"])
    
def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.2, epsilon=0.5, max_steps = 50):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    env.render()
    start = time.time()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        # human modifies the Q
        # left red room
        Q[0][2] += 1
        Q[8][2] += 1
        Q[16][2] += 1
        Q[24][1] += 1
        Q[32][0] += 1
        Q[40][0] += 1
        Q[48][0] += 1
        
        Q[1][2] += 1
        Q[9][2] += 1
        Q[17][2] += 1
        Q[25][1] += 1
        Q[33][0] += 1
        Q[41][0] += 1
        Q[49][0] += 1
        
        
        # middle path 
        Q[26][1] += 1
        
        # middle blue room
        Q[3][2] += 1
        Q[11][2] += 1
        Q[19][2] += 1
        Q[27][1] += 1
        Q[35][0] += 1
        Q[43][0] += 1
        Q[51][0] += 1
        
        Q[4][2] += 1
        Q[12][2] += 1
        Q[20][2] += 1
        Q[28][1] += 1
        Q[36][0] += 1
        Q[44][0] += 1
        Q[52][0] += 1
        
        # middle path 
        Q[29][1] += 1
        
        runtime = time.time() - start
        with open(file_name, 'a') as outfile:
            writer = csv.writer(outfile, delimiter=",")
            writer.writerow((str(total_reward), str(i), str(runtime)))
    return Q

Q = q_learning(env, 100)

Episode 0 return -52.0000003 discounted reward -9.9582543
Episode 1 return 88.0000003 discounted reward 21.0672493
Episode 2 return 85.0000003 discounted reward 12.6480253
Episode 3 return 91.0000003 discounted reward 32.6162543
Episode 4 return 87.0000003 discounted reward 17.9605243
Episode 5 return 90.0000003 discounted reward 28.3546283
Episode 6 return 91.0000003 discounted reward 32.6162543
Episode 7 return 86.0000003 discounted reward 15.1644723
Episode 8 return 91.0000003 discounted reward 32.6162543
Episode 9 return 89.0000003 discounted reward 24.5191663
Episode 10 return 94.0000003 discounted reward 48.4585103
Episode 11 return 92.0000003 discounted reward 37.3513933
Episode 12 return 83.0000003 discounted reward 8.3449003
Episode 13 return 92.0000003 discounted reward 37.3513933
Episode 14 return 86.0000003 discounted reward 15.1644723
Episode 15 return 93.0000003 discounted reward 42.6126593
Episode 16 return 91.0000003 discounted reward 32.6162543
Episode 17 return 93.000