In [1]:
import gym
import numpy as np
import time

## SARSA - State–Action–Reward–State–Action

$$
Q(s_t,a_t) = Q(s_t,a_t) + \alpha (r + \gamma Q(s_{t+1}, a_{t+1}) - Q(s_t,a_t))
$$
where<br>
$ \alpha $ - step size (learning rate)<br>
$ \gamma $ - discount factor<br>
$ s_t $ - current state<br>
$ s_{t+1} $ - next state <br>
$ r $ - reward<br>
$ a $ - action

The Q-value depends on the current state of the agent $s_t$, the action the agent chooses $a_t$, the reward $r$ the agent gets for choosing this action, the state $s_{t+1}$ that the agent will now be in after taking that action, and finally the next action $a_{t+1}$ the agent will choose in its new state.

### Comparing Q-learning and SARSA
SARSA
$$
Q(s_t,a_t) = Q(s_t,a_t) + \alpha (r + \gamma Q(s_{t+1}, a_{t+1}) - Q(s_t,a_t))
$$
Q-learning
$$
Q(s_t,a_t) = Q(s_t,a_t) + \alpha (r + \gamma \max_{a} Q(s_{t+1}, a) - Q(s_t,a_t))
$$

The major difference between SARSA and Q-Learning, is that the maximum reward for the next state is not necessarily used for updating the Q-values. Instead, a new action, and therefore reward, is selected using the same policy that determined the original action.

### Example 1:

In [6]:
env = gym.make('FrozenLake-v0')

a = .8 #alpha
y = .95 #gamma
num_episodes = 2000
Q = np.zeros([env.observation_space.n, env.action_space.n])

for i in range(num_episodes):
    
    current_state = env.reset()
    current_action = np.argmax(Q[current_state,:])
    for j in range(100):
        
        next_state, reward, done, _ = env.step(current_action)
        
        next_action = np.argmax(Q[next_state,:] + np.random.randn(1,env.action_space.n)*(1./(i+1)))
        
        Q[current_state, current_action] += a*(reward + y*Q[next_state, next_action] - Q[current_state, current_action])
        
        current_state = next_state
        current_action = next_action
        
        if done == True:
            break

print('Q-table:')
print(Q)

Q-table:
[[  2.96856986e-01   2.40164072e-04   5.95337846e-04   4.86136204e-04]
 [  1.52752606e-05   1.16452456e-03   7.23610020e-04   6.56125509e-02]
 [  1.72521667e-04   7.07783616e-02   1.83614554e-04   8.31396591e-04]
 [  5.09110558e-06   4.00247793e-08   1.20667714e-04   6.38800679e-02]
 [  1.99064756e-01   1.17901438e-03   3.53596104e-04   9.98988802e-05]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  1.71808538e-05   1.04510488e-04   1.50176370e-02   1.35763732e-08]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  2.09330533e-04   1.29270101e-04   4.62964037e-04   1.03072295e-01]
 [  0.00000000e+00   5.52217798e-01   1.78927846e-03   3.71852679e-05]
 [  7.49010963e-02   9.44015147e-05   0.00000000e+00   2.68498075e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00   3.43941442e-01   4.28438328e-0

### Example 2:

In [16]:
env = gym.make('FrozenLake-v0')

a = .8 #alpha
y = .95 #gamma
num_episodes = 2000
Q = np.zeros([env.observation_space.n, env.action_space.n])

for i in range(num_episodes):
    visited_states = [0, ]
    choosed_actions = []
    
    current_state = env.reset()
    current_action = np.argmax(Q[current_state,:])
    for j in range(100):
        
        next_state, reward, done, _ = env.step(current_action)
        
        next_action = np.argmax(Q[next_state,:] + np.random.randn(1,env.action_space.n)*(1./(i+1)))
        
        Q[current_state, current_action] += a*(reward + y*Q[next_state, next_action] - Q[current_state, current_action])
        
        visited_states.append(current_state)
        choosed_actions.append(
        {
            0 : 'l',
            1 : 'd',
            2 : 'r',
            3 : 'u'
        }[current_action])
        
        current_state = next_state
        current_action = next_action
        
        if done == True:
            break

choosed_actions.append('-')
print('Last visited states and actions:')
print(np.array([visited_states, choosed_actions]))
print()
print('Last move:')
env.render()
print()
print('Numbers representing states:')
print(np.arange(0,16).reshape(4,4))
print()
print('Q-table:')
print(Q)

Last visited states and actions:
[['0' '0' '0' '0' '0' '4' '4' '8' '4' '8' '9' '13' '14']
 ['l' 'l' 'l' 'l' 'l' 'l' 'u' 'l' 'u' 'd' 'r' 'r' '-']]

Last move:
  (Right)
SFFF
FHFH
FFFH
HFF[41mG[0m

Numbers representing states:
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]

Q-table:
[[  4.64472731e-01   3.61962284e-05   1.43738625e-04   7.08616356e-05]
 [  1.31054962e-04   7.61097995e-04   2.12605914e-04   1.72647706e-01]
 [  2.06265009e-04   2.34825148e-01   7.40766980e-04   7.07559250e-04]
 [  6.37087438e-05   9.32407378e-04   1.56706556e-04   1.10325793e-01]
 [  6.33070719e-01   1.03994546e-04   2.17606301e-05   1.00200350e-04]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  5.50441975e-05   6.23260959e-08   3.11330846e-02   1.97457558e-07]
 [  0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00]
 [  4.73826368e-04   1.01255101e-03   1.17546810e-03   7.56133544e-01]
 [  2.25749240e-03   5.42136319e-01   1.11496939e-03   6.024673

### References

- http://www.cse.unsw.edu.au/~cs9417ml/RL1/algorithms.html
- https://studywolf.wordpress.com/2013/07/01/reinforcement-learning-sarsa-vs-q-learning/