In [1]:
import numpy as np
import gym
import gym_gridworld
import itertools
from collections import defaultdict

In [2]:
env = gym.make("GridWorld-v0")

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.1, epsilon=0, max_steps = 10):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    env.render()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
    return Q

Q = q_learning(env, 50)

Episode 0 return -12.0000003 discounted reward -7.1757053
Episode 1 return -12.0000003 discounted reward -7.1757053
Episode 2 return -12.0000003 discounted reward -7.1757053
Episode 3 return 95.0000003 discounted reward 54.9539003
Episode 4 return -12.0000003 discounted reward -7.1757053
Episode 5 return -12.0000003 discounted reward -7.1757053
Episode 6 return -12.0000003 discounted reward -7.1757053
Episode 7 return -12.0000003 discounted reward -7.1757053
Episode 8 return -12.0000003 discounted reward -7.1757053
Episode 9 return -12.0000003 discounted reward -7.1757053
Episode 10 return 92.0000003 discounted reward 37.3513933
Episode 11 return -12.0000003 discounted reward -7.1757053
Episode 12 return 95.0000003 discounted reward 54.9539003
Episode 13 return -12.0000003 discounted reward -7.1757053
Episode 14 return 97.0000003 discounted reward 70.1900003
Episode 15 return -12.0000003 discounted reward -7.1757053
Episode 16 return 97.0000003 discounted reward 70.1900003
Episode 17 r

## Human in the Loop 

In [3]:
env = gym.make("GridWorld-v0")

def make_epsilon_greedy_policy(Q, epsilon, nA):
    def policy_fn(observation):
        A = np.ones(nA, dtype=float) * epsilon / nA
        best_action = np.argmax(Q[observation])
        A[best_action] += (1.0 - epsilon)
        return A
    return policy_fn

def q_learning(env, num_episodes, discount_factor=0.9, alpha=0.5, epsilon=0.1, max_steps = 10):
    Q = defaultdict(lambda: np.zeros(env.n_actions))
    policy = make_epsilon_greedy_policy(Q, epsilon, env.n_actions)
    env.reset()
    env.render()
    for i_episode in range(num_episodes):
        observation = env.reset()
        discounted_reward = 0
        total_reward = 0
        for i in itertools.count():
            a_prob = policy(observation)
            a = np.random.choice([i for i in range(len(a_prob))], p = a_prob)
            next_observation, reward, done, _ = env.step(a)
            best_next_a = np.argmax(Q[next_observation])
            Q[observation][a] += alpha * (reward + discount_factor * Q[next_observation][best_next_a] - Q[observation][a])
            discounted_reward += (discount_factor**i)*reward
            total_reward += reward
            if done or i > max_steps:
                print('Episode %d return %f3 discounted reward %f3' %(i_episode, total_reward, discounted_reward))
                break
            observation = next_observation
        # human modifies the Q
        # left red room
        Q[0][2] += 1
        Q[5][2] += 1
        Q[10][1] += 1
        Q[15][0] += 1
        Q[20][0] += 1
        
        Q[11][1] += 1
        
        # middle blue room
        Q[2][2] += 1
        Q[7][2] += 1
        Q[12][1] += 1
        Q[17][0] += 1
        Q[23][0] += 1
        
        Q[13][1] += 1
    return Q

Q = q_learning(env, 50)

Episode 0 return -12.0000003 discounted reward -7.1757053
Episode 1 return 97.0000003 discounted reward 70.1900003
Episode 2 return 93.0000003 discounted reward 42.6126593
Episode 3 return 97.0000003 discounted reward 70.1900003
Episode 4 return 97.0000003 discounted reward 70.1900003
Episode 5 return 96.0000003 discounted reward 62.1710003
Episode 6 return 97.0000003 discounted reward 70.1900003
Episode 7 return 95.0000003 discounted reward 54.9539003
Episode 8 return 97.0000003 discounted reward 70.1900003
Episode 9 return 97.0000003 discounted reward 70.1900003
Episode 10 return 97.0000003 discounted reward 70.1900003
Episode 11 return 97.0000003 discounted reward 70.1900003
Episode 12 return 96.0000003 discounted reward 62.1710003
Episode 13 return 95.0000003 discounted reward 54.9539003
Episode 14 return 97.0000003 discounted reward 70.1900003
Episode 15 return 97.0000003 discounted reward 70.1900003
Episode 16 return 97.0000003 discounted reward 70.1900003
Episode 17 return 97.00