In [1]:
import gym
import numpy as np
import sys
from collections import defaultdict
from IPython.display import clear_output
import time
import math

# Utils

In [2]:
def serialize_Q(Q, name):
    """Write an action-value table to a text file, one entry per line.

    Every line has the form ``key;v0;v1;...``: the dictionary key followed by
    each element of ``Q[key]``, all rendered with default f-string formatting
    and joined with ``;``. An existing file at ``name`` is overwritten.

    Args:
        Q: Mapping from a key to an iterable of values.
        name: Path of the file to write.
    """
    # The context manager closes the handle even if formatting or writing
    # raises; plain 'w' is enough because nothing is read back.
    with open(name, 'w') as file:
        for key, values in Q.items():
            fields = [f"{key}"]
            fields.extend(f"{value}" for value in values)
            file.write(';'.join(fields) + '\n')
    
def deserialize_Q(name):
    """Load a table stored in the ``key;v0;v1;...`` line format.

    Args:
        name: Path of the file to read.

    Returns:
        dict mapping each integer key to a float ``np.ndarray`` holding the
        values found on that key's line.

    Raises:
        ValueError: If a key cannot be parsed as ``int`` or a value as ``float``.
    """
    Q = {}
    # The context manager closes the handle even when parsing fails midway.
    with open(name, 'r') as file:
        for line in file:
            # Strip only the line terminator. Blindly slicing off the last
            # character would eat a digit whenever the final line is not
            # newline-terminated.
            line = line.rstrip('\r\n')
            if not line.strip():
                # Tolerate blank lines instead of crashing on int('').
                continue
            key, *values = line.split(';')
            Q[int(key)] = np.array([float(value) for value in values])
    return Q

In [3]:
def generate_empty_dictionary(env, default_value=0.0):
    """Build a table with one constant-filled array per state index.

    Args:
        env: Object exposing ``observation_space.n`` and ``action_space.n``.
        default_value: Value every array entry is initialised to.

    Returns:
        dict whose keys are ``0 .. env.observation_space.n - 1`` and whose
        values are independent arrays of length ``env.action_space.n``.
    """
    n_actions = env.action_space.n
    table = {}
    for state in range(env.observation_space.n):
        # A fresh array per state so rows can be updated independently
        table[state] = np.ones(n_actions) * default_value
    return table

In [4]:
def greedy_policy_from_Q(Q):
    """Map every key of ``Q`` to the index of its largest value.

    Args:
        Q: Mapping from a key to an array-like of values.

    Returns:
        dict with the same keys as ``Q``; each value is ``np.argmax`` of the
        corresponding entry (the first index wins on ties).
    """
    policy = {}
    for state, action_values in Q.items():
        policy[state] = np.argmax(action_values)
    return policy

def epsilon_greedy_for_state_action_values(Q_s, epsilon, env):
    """Return epsilon-greedy action probabilities for one row of values.

    Every action receives ``epsilon / n``; the arg-max action of ``Q_s``
    receives ``1 - epsilon + epsilon / n`` instead, where ``n`` is
    ``env.action_space.n``.

    Args:
        Q_s: Array-like of values, one per action.
        epsilon: Exploration weight spread over all actions.
        env: Object exposing ``action_space.n``.

    Returns:
        ``np.ndarray`` of length ``n`` with the selection probabilities.
    """
    n_actions = env.action_space.n
    probabilities = np.ones(n_actions) * epsilon / n_actions
    # The greedy action keeps its exploration share plus the remaining mass
    probabilities[np.argmax(Q_s)] = 1 - epsilon + (epsilon / n_actions)
    return probabilities

def generate_episode(env, Q=None, epsilon=None):
    """Play one episode and record its trajectory.

    Action selection per step:
      * ``Q`` is None: ``env.action_space.sample()``.
      * ``Q`` and ``epsilon`` given: epsilon-greedy with respect to ``Q[state]``.
      * ``Q`` given, ``epsilon`` None: greedy (``np.argmax(Q[state])``).
    In both Q-driven modes a state that is missing from ``Q`` (including the
    case of an empty ``Q``) falls back to a sampled action, so an incomplete
    table never raises.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        Q: Optional mapping from state to an array of action values.
        epsilon: Optional exploration rate.

    Returns:
        list of ``(state, action, reward)`` tuples, one per step, in order.
        The loop only ends when ``env.step`` reports ``done``.
    """
    episode = []
    state = env.reset()
    while True:
        if Q is None or state not in Q:
            action = env.action_space.sample()
        elif epsilon is not None:
            action = np.random.choice(np.arange(env.action_space.n), p=epsilon_greedy_for_state_action_values(Q[state], epsilon, env))
        else:
            # Look up the greedy action on demand rather than building the
            # whole greedy policy up front on every call.
            action = np.argmax(Q[state])
        next_state, reward, done, info = env.step(action)
        episode.append((state, action, reward))
        state = next_state
        if done:
            break
    return episode

In [5]:
def render_playthrough(env, Q=None, epsilon=None, delay=.3):
    """Play one episode while redrawing the environment after every step.

    Action selection per step:
      * ``Q`` is None or empty: ``env.action_space.sample()``.
      * ``Q`` and ``epsilon`` given: epsilon-greedy with respect to ``Q[state]``.
      * ``Q`` given, ``epsilon`` None: greedy (``np.argmax(Q[state])``).
    A state missing from ``Q`` falls back to a sampled action in both
    Q-driven modes instead of raising ``KeyError``.

    Args:
        env: Environment whose ``reset()`` returns a state, whose
            ``step(action)`` returns ``(next_state, reward, done, info)`` and
            which supports ``render()``.
        Q: Optional mapping from state to an array of action values.
        epsilon: Optional exploration rate.
        delay: Seconds to pause after each frame (default 0.3).
    """
    def show_frame(total_reward):
        # Replace the previous frame, then print the running reward under it
        clear_output(True)
        env.render()
        print(total_reward)
        time.sleep(delay)

    state = env.reset()
    rewards = 0
    show_frame(rewards)
    while True:
        if not Q or state not in Q:
            action = env.action_space.sample()
        elif epsilon is not None:
            action = np.random.choice(np.arange(env.action_space.n), p=epsilon_greedy_for_state_action_values(Q[state], epsilon, env))
        else:
            # On-demand lookup; no need to build the full greedy policy
            action = np.argmax(Q[state])
        state, reward, done, info = env.step(action)
        rewards += reward
        show_frame(rewards)
        if done:
            break

In [6]:
def benchmark(env, Q=None, epsilon=None, num_episodes=100):
    """Average the total (undiscounted) reward over several episodes.

    Action selection per step:
      * ``Q`` is None: ``env.action_space.sample()``.
      * ``Q`` and ``epsilon`` given: epsilon-greedy with respect to ``Q[state]``.
      * ``Q`` given, ``epsilon`` None: greedy (``np.argmax(Q[state])``).
    A state missing from ``Q`` (including an empty ``Q``) falls back to a
    sampled action in both Q-driven modes.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        Q: Optional mapping from state to an array of action values.
        epsilon: Optional exploration rate.
        num_episodes: Number of evaluation episodes (default 100).

    Returns:
        Mean of the per-episode reward sums.

    Raises:
        ValueError: If ``num_episodes`` is smaller than 1.
    """
    if num_episodes < 1:
        raise ValueError("num_episodes must be at least 1")

    all_rewards = []
    for _ in range(num_episodes):
        state = env.reset()
        rewards = 0
        done = False
        while not done:
            if Q is None or state not in Q:
                action = env.action_space.sample()
            elif epsilon is not None:
                action = np.random.choice(np.arange(env.action_space.n), p=epsilon_greedy_for_state_action_values(Q[state], epsilon, env))
            else:
                # On-demand lookup; no need to build the full greedy policy
                action = np.argmax(Q[state])
            state, reward, done, info = env.step(action)
            rewards += reward
        all_rewards.append(rewards)
    return np.mean(all_rewards)

# MC Prediction

In [7]:
def mc_prediction_q(env, num_episodes, gamma=1.0):
    """First-visit Monte Carlo estimate of action values.

    Episodes are produced by ``generate_episode(env)`` (no table passed, so
    actions are sampled). For the first occurrence of each (state, action)
    pair in an episode, the discounted return that follows it is accumulated;
    the estimate is the average of those returns.

    Args:
        env: Environment passed to ``generate_empty_dictionary`` and
            ``generate_episode``.
        num_episodes: Number of episodes to sample.
        gamma: Discount factor applied per step.

    Returns:
        dict mapping each state to an array of estimated action values;
        pairs that were never visited keep the initial 0.0.
    """
    # initialize empty dictionaries of arrays
    returns_sum = generate_empty_dictionary(env)
    N = generate_empty_dictionary(env)
    Q = generate_empty_dictionary(env)
    # loop over episodes
    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 100 == 0:
            clear_output(True)
            print("\rEpisode {}/{}.\n".format(i_episode, num_episodes), end="")

        episode = generate_episode(env)

        # Returns are built backwards, G_t = r_t + gamma * G_{t+1}, so the
        # discount is relative to step t (not the absolute index within the
        # episode) and the whole pass is linear in the episode length.
        returns = [0.0] * len(episode)
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            G = episode[t][2] + gamma * G
            returns[t] = G

        # First-visit is tracked per (state, action) pair, because the values
        # being estimated are per pair.
        seen = set()
        for (state, action, _), G_t in zip(episode, returns):
            if (state, action) in seen:
                continue
            seen.add((state, action))
            N[state][action] += 1
            returns_sum[state][action] += G_t

    # Average only where at least one return was recorded
    for state, counts in N.items():
        visited = counts != 0.0
        Q[state][visited] = returns_sum[state][visited] / counts[visited]

    return Q

In [8]:
# Create the Taxi environment that the cells below train and evaluate on.
env = gym.make('Taxi-v2')
# Uncomment to watch one playthrough with sampled actions (no Q passed):
# render_playthrough(env)

In [9]:
# Estimate action values from 1000 sampled episodes, then show the rows of
# the first ten states. Rows that stay all-zero were never updated.
Q = mc_prediction_q(env, 1000)
for i in range(10):
    print(f"Action values for {i} state: {Q[i]}")

Episode 1000/1000.
Action values for 0 state: [0. 0. 0. 0. 0. 0.]
Action values for 1 state: [-379.36363636 -452.41666667 -431.4        -471.64705882 -429.5
 -453.22222222]
Action values for 2 state: [-391.92857143 -419.2        -512.42857143 -393.125      -540.2
 -411.33333333]
Action values for 3 state: [-460.875      -395.73333333 -418.58333333 -510.14285714 -439.71428571
 -383.        ]
Action values for 4 state: [-471.5        -371.53333333 -663.66666667 -576.84615385 -465.28571429
 -537.42857143]
Action values for 5 state: [0. 0. 0. 0. 0. 0.]
Action values for 6 state: [-610.66666667 -410.57142857 -588.75       -557.22222222 -413.42857143
 -493.6       ]
Action values for 7 state: [-423.9        -662.375      -334.5        -634.5        -521.
 -592.83333333]
Action values for 8 state: [-586.88888889 -281.18181818 -489.77777778 -650.         -377.875
 -572.55555556]
Action values for 9 state: [-556.14285714 -482.75       -479.625      -337.2        -488.
 -535.70588235]


In [10]:
# Repeat the estimation with 1000, 2000 and 3000 episodes ...
Q_1 = mc_prediction_q(env, 1000)
Q_2 = mc_prediction_q(env, 2000)
Q_3 = mc_prediction_q(env, 3000)

# ... and compare the average reward obtained when acting greedily on each table.
print(benchmark(env, Q_1))
print(benchmark(env, Q_2))
print(benchmark(env, Q_3))

Episode 3000/3000.
-935.75
-1077.23
-990.02


## MC Control

In [11]:
def mc_control(env, num_episodes, alpha, eps=0.1, gamma=1.0, eps_start=1.0, eps_decay=0.9999):
    """Constant-alpha first-visit Monte Carlo control.

    Each episode is generated epsilon-greedily from the current table via
    ``generate_episode(env, Q, eps_a)``. The exploration rate starts at
    ``eps_start``, is multiplied by ``eps_decay`` before every episode and is
    never allowed below ``eps``. For the first occurrence of each
    (state, action) pair in an episode, ``Q`` moves a fraction ``alpha``
    towards the discounted return that followed it.

    Args:
        env: Environment passed to ``generate_empty_dictionary`` and
            ``generate_episode``.
        num_episodes: Number of episodes to run.
        alpha: Step size of the incremental update.
        eps: Lower bound of the exploration rate.
        gamma: Discount factor applied per step.
        eps_start: Initial exploration rate (default 1.0).
        eps_decay: Per-episode multiplicative decay (default 0.9999).

    Returns:
        dict mapping each state to an array of action values.
    """
    # initialize empty dictionary of arrays
    Q = generate_empty_dictionary(env)
    eps_a = eps_start

    # loop over episodes
    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 100 == 0:
            clear_output(True)
            print("\rEpisode {}/{}.\n".format(i_episode, num_episodes), end="")

        eps_a = max(eps_a * eps_decay, eps)
        episode = generate_episode(env, Q, eps_a)

        # Build returns backwards, G_t = r_t + gamma * G_{t+1}: linear in the
        # episode length instead of re-summing a slice for every step.
        returns = [0.0] * len(episode)
        G = 0.0
        for t in range(len(episode) - 1, -1, -1):
            G = episode[t][2] + gamma * G
            returns[t] = G

        # First-visit is tracked per (state, action) pair, so a different
        # action taken on a revisit of the same state still gets its update.
        seen = set()
        for (state, action, _), G_t in zip(episode, returns):
            if (state, action) in seen:
                continue
            seen.add((state, action))
            Q[state][action] += alpha * (G_t - Q[state][action])

    return Q

In [15]:
# 10000 episodes with step size alpha=0.01; eps and gamma keep their defaults.
Q = mc_control(env, 10000, 0.01)

Episode 10000/10000.


In [16]:
# Average episode reward when acting greedily on the MC-control table.
print(benchmark(env, Q))

-446.73


In [13]:
# NOTE(review): this cell's execution count (13) is lower than the training
# cell's (15), so the frame shown may come from an earlier Q. Re-run in order.
render_playthrough(env, Q)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | :[43m [0m| : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (East)
-15


KeyboardInterrupt: 

## TD Learning (Sarsa)

In [19]:
def sarsa(env, num_episodes, alpha, gamma=1.0):
    """Tabular Sarsa with a ``1 / episode`` exploration schedule.

    Actions are drawn epsilon-greedily from the current table. After each
    step the value of the (state, action) just taken moves a fraction
    ``alpha`` towards ``reward + gamma * Q[next_state][next_action]``, or
    towards the reward alone (``gamma * 0``) when the step ended the episode.
    Each episode is cut off after 300 steps at most.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        num_episodes: Number of episodes to run.
        alpha: Step size of the update.
        gamma: Discount factor.

    Returns:
        dict mapping each state to an array of action values.
    """
    Q = generate_empty_dictionary(env)

    def choose_action(action_values, eps):
        # Draw an action index from the epsilon-greedy distribution
        return np.random.choice(np.arange(env.action_space.n), p=epsilon_greedy_for_state_action_values(action_values, eps, env))

    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 100 == 0:
            clear_output(True)
            print("\rEpisode {}/{}.\n".format(i_episode, num_episodes), end="")

        state = env.reset()
        epsilon = 1.0 / i_episode
        action = choose_action(Q[state], epsilon)

        for _ in range(300):
            next_state, reward, done, info = env.step(action)
            if done:
                # Terminal transition: nothing to bootstrap from
                Q[state][action] += alpha * (reward + (gamma * 0) - Q[state][action])
                break
            # On-policy target: bootstrap from the action actually taken next
            next_action = choose_action(Q[next_state], epsilon)
            td_target = reward + (gamma * Q[next_state][next_action])
            Q[state][action] += alpha * (td_target - Q[state][action])
            state, action = next_state, next_action

    return Q

In [23]:
# 50000 episodes, step size alpha=0.05, discount gamma=0.9.
Q = sarsa(env, 50000, 0.05, 0.9)

Episode 50000/50000.


In [24]:
# Average episode reward when acting greedily on the Sarsa table.
print(benchmark(env, Q))

8.74


In [26]:
# Watch one greedy playthrough driven by the Sarsa table.
render_playthrough(env, Q)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (Dropoff)
11


## TD Learning (Q-Learning)

In [27]:
def q_learning(env, num_episodes, alpha, gamma=1.0):
    """Tabular Q-learning with a ``1 / episode`` exploration schedule.

    Behaviour actions are drawn epsilon-greedily from the current table. After
    each step the value of the (state, action) just taken moves a fraction
    ``alpha`` towards ``reward + gamma * max(Q[next_state])``, or towards the
    reward alone (``gamma * 0``) when the step ended the episode. Each
    episode is cut off after 300 steps at most.

    Args:
        env: Environment whose ``reset()`` returns a state and whose
            ``step(action)`` returns ``(next_state, reward, done, info)``.
        num_episodes: Number of episodes to run.
        alpha: Step size of the update.
        gamma: Discount factor.

    Returns:
        dict mapping each state to an array of action values.
    """
    Q = generate_empty_dictionary(env)

    def behaviour_action(action_values, eps):
        # Draw an action index from the epsilon-greedy distribution
        return np.random.choice(np.arange(env.action_space.n), p=epsilon_greedy_for_state_action_values(action_values, eps, env))

    for i_episode in range(1, num_episodes+1):
        # monitor progress
        if i_episode % 100 == 0:
            clear_output(True)
            print("\rEpisode {}/{}.\n".format(i_episode, num_episodes), end="")

        state = env.reset()
        epsilon = 1.0 / i_episode
        action = behaviour_action(Q[state], epsilon)

        for _ in range(300):
            next_state, reward, done, info = env.step(action)
            if done:
                # Terminal transition: nothing to bootstrap from
                Q[state][action] += alpha * (reward + (gamma * 0) - Q[state][action])
                break
            # The next behaviour action is drawn before the table is updated,
            # exactly as in the step-by-step formulation above.
            next_action = behaviour_action(Q[next_state], epsilon)
            # Off-policy target: bootstrap from the best next value
            td_target = reward + (gamma * np.max(Q[next_state]))
            Q[state][action] += alpha * (td_target - Q[state][action])
            state, action = next_state, next_action

    return Q

In [28]:
# 50000 episodes, step size alpha=0.05, discount gamma=0.9.
Q = q_learning(env, 50000, 0.05, 0.9)

Episode 50000/50000.


In [29]:
# Average episode reward when acting greedily on the Q-learning table.
print(benchmark(env, Q))

8.56


In [30]:
# Watch one greedy playthrough driven by the Q-learning table.
render_playthrough(env, Q)

+---------+
|R: | : :[35m[42mG[0m[0m|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)
7
