# Q-Learning

30-03-18

The Q-Learning approach is based on the following update rule:

$$
    Q(s_t,a_t)\leftarrow \underbrace{(1-\alpha)Q(s_t,a_t)}_{\textit{past}}+\alpha\left[R_t+\gamma\cdot\underset{a}{\max}\underbrace{Q(s_{t+1},a)}_{\textit{future}}\right]
$$

where:
    
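* $s_t$ is the state at time $t$.
* $a_t$ is the action taken at time $t$; $a$ ranges over the actions available in $s_{t+1}$.
* $\alpha\in[0,1]$ is the learning rate. The higher it is, the less the past estimate is taken into account.
* $R_t$ is the reward at time $t$.
* $\gamma\in[0,1]$ is the discount factor: it weights the best estimated future value against the immediate reward.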


In [None]:
from notebook_logging import log_progress

In [None]:
import gym
import time
import os
import numpy as np
import matplotlib.pyplot as plt
import numpy.random as rn
%matplotlib inline


"""
SFFF
FHFH
FFFH
HFFG
"""

# Weight applied by updateQ to the best Q value of the next state (the
# "future" term of the update rule).
gamma = 0.5
# Weight applied by updateQ to the new estimate; the previous Q value is
# kept with weight (1 - alpha).
alpha = 0.01


def updateQ(Q, s_t, s_tn, a, R):
    """
    Apply one Q-Learning update to the entry Q[s_t, a].

    The module-level `alpha` and `gamma` are used as the weight of the new
    estimate and the weight of the best next-state value, respectively.

    Parameters:
    Q -> Q matrix, indexed as Q[state, action]; it is modified in place
    s_t -> state in which the action was taken
    s_tn -> new state reached after the action
    a -> action taken in s_t
    R -> reward received for the transition

    Returns the same Q object that was passed in.
    """
    # Highest value currently stored for the state we landed in.
    best_next = np.max(Q[s_tn])
    past = (1 - alpha) * Q[s_t, a]
    target = R + gamma * best_next
    Q[s_t, a] = past + alpha * target
    return Q

def choose_policy(state):
    """
    Pick the index of the highest value in `state`, breaking ties at random.

    Parameters:
    state -> 1-D NumPy array with one value per action (the callers in this
             file pass a row of the Q matrix).

    Returns the index of a maximal entry; when several entries share the
    maximum, one of them is selected through `rn.shuffle`.
    """
    best_value = np.max(state)
    candidates = np.flatnonzero(state == best_value)
    # Shuffle in place so that the first element is a random maximal index.
    rn.shuffle(candidates)
    return candidates[0]

In [None]:
# Train a tabular Q-Learning agent on FrozenLake and record per-episode
# statistics (Q snapshots, final reward, accumulated reward, step count).
Q = np.zeros((16, 4))  # Q[state, action]
acc = 0
Qs = []  # Snapshot of the Q matrix at the end of each episode
Rs = []  # Final (shaped) reward of each episode
Prs = []  # Accumulated (shaped) reward of each episode
Ts = []  # Steps per episode

#best_policy = [0, 3, 3, 3, 0, 3, 0, 1, 3, 1, 0, 0, 2, 2, 1, 1] 
from gym import wrappers
env = gym.make('FrozenLake-v0')
env = wrappers.Monitor(env, '/tmp/frozenlake-experiment-1', force=True)
try:
    for i_episode in log_progress(range(5000)):
        s_old = env.reset()
        acc_rew = 0

        for t in range(100000):
            #env.render()
            policy = choose_policy(Q[s_old])
            s_new, reward, done, info = env.step(policy)

            # Reward shaping: the step that ends the episode is shifted by
            # -0.5, every other step by -0.07.
            shaped_reward = reward - (0.5 if done else 0.07)
            Q = updateQ(Q, s_old, s_new, policy, shaped_reward)
            acc_rew += shaped_reward

            if done:
                # updateQ mutates Q in place, so store a copy; appending Q
                # itself would leave every entry of Qs aliasing the final
                # matrix.
                Qs.append(Q.copy())
                Rs.append(shaped_reward)
                # t is a zero-based index, so the episode lasted t + 1 steps.
                Ts.append(t + 1)
                Prs.append(acc_rew)
                break

            s_old = s_new
finally:
    # Close the environment even if training is interrupted.
    # NOTE(review): the one-second pause presumably lets the Monitor wrapper
    # finish writing its output before the environment is closed - confirm.
    time.sleep(1)
    env.close()

Rs = np.array(Rs)
Ts = np.array(Ts)
Prs = np.array(Prs)

In [None]:
# Number of entries recorded in Qs (one is appended per finished episode).
len(Qs)

In [None]:
# Final shaped reward of each episode divided by that episode's Ts entry.
fit = np.true_divide(Rs, Ts)
print("Rs %f Ts %f mFit %f" % (np.sum(Rs), np.sum(Ts), np.mean(fit)))

plt.figure(figsize=(15, 5))
plt.plot(fit)

In [None]:
# Accumulated shaped reward of each episode divided by that episode's Ts entry.
fit = np.true_divide(Prs, Ts)
print("mFit %f" % (np.mean(fit)))

plt.figure(figsize=(15, 5))
plt.plot(fit)

In [None]:
# Greedy policy: for each state (row of Q) take the action with the highest value.
learned_policy = Q.argmax(axis=1)
print("Policy learned: {}".format(learned_policy))

## Policies

$\gamma = 0.5,\alpha = 0.01$

**[0 3 3 3 0 x 0 x 3 1 0 x x 2 1 x] <- GA best**  
[0 3 3 3 0 0 2 0 3 1 0 0 0 2 2 0] <- 5000 episodes  
[0 3 3 3 0 0 0 0 3 1 0 0 0 2 2 0] <- 100000 episodes  
[0 3 3 3 0 0 0 0 3 1 0 0 0 2 3 0] <- 150000 episodes  
