# Reinforcement Learning 
With thanks to Dr Massimiliano Patacchiola for the gridworld implementation, because I'm too lazy to implement it myself

## Temporal Difference Learning



In [None]:
import random
import numpy as np
import time
import matplotlib.pyplot as plt
from gridworld import GridWorld

Gridworld is provided alongside this notebook -- we're just re-using a simple grid environment from elsewhere. 


TD(0) update function is pretty simple, we just need to take our existing estimate of V and update it according to:

$$
  V(s_t) \leftarrow V(s_t) + \alpha \left[ r_{t+1} + \gamma V(s_{t+1}) - V(s_t) \right]
$$

In the code below, each state is represented as a simple (row, column) pair that indexes directly into the value matrix.

In [None]:
def TD(value_matrix, state, new_state, reward, alpha, gamma):
    """Apply a single TD(0) backup to ``value_matrix`` in place and return it.

    ``state`` and ``new_state`` are indexable pairs whose first two entries
    index ``value_matrix``. The entry for ``state`` is moved a fraction
    ``alpha`` of the way towards the bootstrapped target
    ``reward + gamma * value_matrix[new_state]``.
    """
    row, col = state[0], state[1]
    next_row, next_col = new_state[0], new_state[1]

    current = value_matrix[row, col]
    successor = value_matrix[next_row, next_col]

    # TD error: bootstrapped target minus the current estimate.
    td_error = reward + gamma * successor - current
    value_matrix[row, col] = current + alpha * td_error

    return value_matrix

Now we have our TD(0) update function defined, we can pull together the environment etc.

In [None]:
# Build the environment with arguments (3, 4), matching the 3x4 matrices below.
env = GridWorld(3, 4)

# All zeros except the top-right cell (row 0, column 3).
# NOTE(review): presumably a 1 marks a terminal cell for GridWorld -- confirm.
state_matrix = np.zeros((3, 4))
state_matrix[0, 3] = 1

print("State Matrix:", state_matrix, sep="\n")


In [None]:
# A reward of 1 in the top-right cell (row 0, column 3) and 0 everywhere else.
reward_matrix = np.zeros((3, 4))
reward_matrix[0, 3] = 1

print("Reward Matrix:", reward_matrix, sep="\n")


In [None]:
# Alternative, for probabilistic transitions:
# transition_matrix = np.array([[0.8, 0.1, 0.0, 0.1],
#                               [0.1, 0.8, 0.1, 0.0],
#                               [0.0, 0.1, 0.8, 0.1],
#                               [0.1, 0.0, 0.1, 0.8]])

# The 4x4 identity matrix is used here instead of the probabilistic one.
# NOTE(review): presumably this makes every action deterministic -- confirm
# against GridWorld.
transition_matrix = np.identity(4)
print(transition_matrix)


For this particular environment class, we need to provide the state / reward / transition matrices:

In [None]:
# Hand the state, reward and transition matrices to the environment through
# its setter methods.
env.setStateMatrix(state_matrix)
env.setRewardMatrix(reward_matrix)
env.setTransitionMatrix(transition_matrix)

Now we can start learning the value function with TD(0) -- we are going to pull the loop out into its own function to keep the code (relatively) clean...

In [None]:
def TD_learning_loop(tot_epoch,
                     diff_epoch,
                     print_epoch,
                     env,
                     value_matrix,
                     alpha,
                     gamma,
                     max_steps=1000,
                     n_actions=4):
    """Run TD(0) learning under a uniformly random policy.

    Args:
        tot_epoch: number of episodes to run.
        diff_epoch: record the change in ``value_matrix`` every this many
            episodes.
        print_epoch: print ``value_matrix`` every this many episodes.
        env: environment exposing ``reset(exploring_starts=...)`` and
            ``step(action)``; ``step`` must return
            ``(new_state, reward, done)``.
        value_matrix: NumPy array of value estimates, updated in place.
        alpha: learning rate passed to ``TD``.
        gamma: discount factor passed to ``TD``.
        max_steps: cap on the number of steps taken in one episode.
        n_actions: number of actions; actions are drawn uniformly from
            ``0 .. n_actions - 1``.

    Returns:
        ``(value_matrix, diffs)``. ``diffs[..., k]`` is the change in
        ``value_matrix`` between the k-th recorded snapshot and the one
        before it.
    """
    # Sized from the value matrix itself so any grid shape works. The "+ 1"
    # slot is kept from the original so the returned shape is unchanged.
    n_snapshots = int(tot_epoch / diff_epoch) + 1
    diffs = np.zeros(value_matrix.shape + (n_snapshots,))
    snapshot_i = 0
    prev_v = value_matrix.copy()

    for epoch in range(tot_epoch):
        state = env.reset(exploring_starts=False)

        for _ in range(max_steps):
            # Uniformly random behaviour policy.
            action = random.randint(0, n_actions - 1)
            new_state, reward, done = env.step(action)
            value_matrix = TD(value_matrix, state, new_state, reward, alpha, gamma)

            state = new_state
            if done:
                break

        if epoch % diff_epoch == 0:
            # Track how far the estimates moved since the last snapshot.
            diffs[..., snapshot_i] = value_matrix - prev_v
            prev_v = value_matrix.copy()
            snapshot_i += 1

        if epoch % print_epoch == 0:
            print("\r\nValue matrix after " + str(epoch+1) + " iterations:")
            print(value_matrix)

    return (value_matrix, diffs)

In [None]:
# Learning hyper-parameters.
gamma = 0.9
alpha = 0.1

# Run length, and how often to print / record a diff (all in episodes).
tot_epoch = 100_000
print_epoch = 10_000
diff_epoch = 1_000

# Start from an all-zero value estimate.
value_matrix = np.zeros((3, 4))
print("\r\nValue matrix after 0 iterations:")
print(value_matrix)

tic = time.time()
value_matrix, diffs = TD_learning_loop(
    tot_epoch, diff_epoch, print_epoch, env, value_matrix, alpha, gamma
)
toc = time.time()

print("%d iterations completed in approx. %d seconds." % (tot_epoch, toc - tic))

In [None]:
# Show the final value estimates, rounded to two decimals.
print(f"Value matrix after {tot_epoch} iterations:")
print(np.round(value_matrix, 2))

In [None]:
# One curve per grid cell: the recorded change in that cell's value over the
# snapshots stored along the last axis of `diffs`.
plt.figure()
for row, col in np.ndindex(3, 4):
    plt.plot(diffs[row, col, :])

The final step is to determine the optimal policy $\pi^*$ from the computed value matrix $V$, which is straightforward -- act greedily with respect to $V$:


$$
\pi^*(s) = \arg\max_a\left[ r(s,a) + \gamma V^*\left( \delta(s,a) \right) \right]
$$

# Q Learning

Recall the Q-Learning update rule:

$$
    Q'(s, a) = (1 - \alpha)Q'(s, a) + \alpha\mathopen{}\left(r + \gamma \max_{a'}Q'(s', a')\right)\mathclose{}
$$


To learn (state, action) pairs, we need a data structure which holds every combination of (s, a). This is the Q table, which has one row per state and one column per action. We can then learn the Q value for every (s, a) pair and store it in the Q table.

In [None]:
# 12 rows by 4 columns, all zeros; indexed as q_table[state, action].
q_table = np.zeros((3 * 4, 4))
print(q_table)

In order to use the same environment as with TD Learning, we need to 'unroll' the state representation into a single integer (so we can index the Q table). This function doesn't need to be complicated:

In [None]:
def unroll(state, rows = 3, cols = 4):
    """Flatten a (row, column) grid position into a single row-major index.

    ``state[0]`` is the row and ``state[1]`` the column; the result is
    ``cols * row + column``. ``rows`` is accepted but not used in the
    computation. The defaults describe a 3-row, 4-column grid.
    """
    row, col = state[0], state[1]
    return cols * row + col

Now we can start the learning process. The code below is the same as with TD Learning, except we have replaced the TD function with the Q learning update function from above. 

In [None]:
def q_learning_loop(tot_epoch,
                    diff_epoch,
                    print_epoch,
                    env,
                    q_table,
                    alpha,
                    alpha_dt,
                    gamma,
                    max_steps=1000):
    """Run tabular Q-learning under a uniformly random behaviour policy.

    Args:
        tot_epoch: number of episodes to run.
        diff_epoch: record the change in ``q_table`` every this many episodes.
        print_epoch: print ``q_table`` every this many episodes.
        env: environment exposing ``reset(exploring_starts=...)`` and
            ``step(action)``; ``step`` must return
            ``(new_state, reward, done)``.
        q_table: 2-D NumPy array indexed ``[state, action]``, updated in
            place. Its column count is used as the number of actions.
        alpha: initial learning rate.
        alpha_dt: amount subtracted from ``alpha`` after every episode.
        gamma: discount factor.
        max_steps: cap on the number of steps taken in one episode.

    Returns:
        ``(q_table, diffs)``. ``diffs[..., k]`` is the change in ``q_table``
        between the k-th recorded snapshot and the one before it.
    """
    n_actions = q_table.shape[1]

    # Sized from the Q table itself rather than a hard-coded (12, 4). The
    # "+ 1" slot is kept from the original so the returned shape is unchanged.
    n_snapshots = int(tot_epoch / diff_epoch) + 1
    diffs = np.zeros(q_table.shape + (n_snapshots,))
    epoch_i = 0
    prev_q = q_table.copy()

    for epoch in range(tot_epoch):
        # reset() hands back a grid position, like step() does. It must be
        # flattened before it can select a single row of the Q table; using
        # the raw position as an index would touch the wrong entries.
        state = unroll(env.reset(exploring_starts=False))

        for _ in range(max_steps):
            # Take a uniformly random action.
            action = random.randint(0, n_actions - 1)
            new_state, reward, done = env.step(action)
            new_state_r = unroll(new_state)

            # Q-learning update: blend the old estimate with the greedy
            # one-step target.
            target = reward + gamma * np.max(q_table[new_state_r])
            q_table[state, action] = ((1 - alpha) * q_table[state, action]
                                      + alpha * target)
            state = new_state_r

            if done:
                break

        if epoch % diff_epoch == 0:
            diffs[..., epoch_i] = q_table - prev_q
            prev_q = q_table.copy()
            epoch_i += 1

        if epoch % print_epoch == 0:
            print("\r\nQ table after " + str(epoch+1) + " iterations:")
            print(q_table.round(2))

        # Linear decay of the learning rate, never letting it go negative.
        alpha = max(alpha - alpha_dt, 0.0)

    return (q_table, diffs)

Now all we have to do is initialise our parameters / variables and throw them at the `q_learning_loop()` function

In [None]:
tic = time.time()

# Initial learning rate and the amount subtracted from it after each episode
# (0 keeps the learning rate constant).
alpha_0 = 0.001
alpha_dt = 0

# Discount factor.
gamma = 0.1

# Run length, and how often to print / record a diff (all in episodes).
tot_epoch = 100_000
print_epoch = 10_000
diff_epoch = 1_000

# Start from an all-zero Q table.
q_table = np.zeros((12, 4))
print("\r\nQ table after 0 iterations:")
print(q_table.round(2))

q_table, diffs = q_learning_loop(
    tot_epoch, diff_epoch, print_epoch, env, q_table, alpha_0, alpha_dt, gamma
)

toc = time.time()
print("%d iterations completed in %d seconds." % (tot_epoch, toc - tic))

Let's take a look at the output:

In [None]:
# Display the learned Q table rounded to two decimals.
np.round(q_table, 2)

As before, we can plot the changes in our Q table to show the convergence over time:

In [None]:
# One curve per (state, action) entry: the recorded change in that Q value
# over the snapshots stored along the last axis of `diffs`.
plt.figure()
for state_i, action_i in np.ndindex(12, 4):
    plt.plot(diffs[state_i, action_i, :])
