Pseudo code

<img src="images/sarsa.png" style="float: left;" width="600">

Directions:

1) initialize state S

2) initialize first action A from state S by some exploratory policy (e.g. ϵ-greedy)

For every step of the episode:

3) take action A and observe R and S′

4) choose action A′ from state S′ by some exploratory policy (e.g. ϵ-greedy)

5) do the update Q(S,A) ← Q(S,A) + α(R + γ·Q(S′,A′) − Q(S,A)), where Q(S′,A′) is taken to be 0 when S′ is a terminal state

6) update state and action S=S′, A=A′ and keep looping from step 3 until end of the episode

In [17]:
import numpy as np


class SARSA:
    """
    SARSA (on-policy temporal-difference control) reinforcement learning
    agent for environments with discrete observation and action spaces.

    Arguments:
      epsilon - (float) The probability of randomly exploring the action space
        rather than exploiting the best action.
      discount - (float) The discount factor. Controls the perceived value of
        future reward relative to short-term reward.
      adaptive - (bool) Whether to use an adaptive policy for setting
        values of epsilon during training. When True, epsilon is scaled
        linearly from `epsilon` down towards 0 as training progresses.
    """

    def __init__(self, epsilon=0.2, discount=0.95, adaptive=False):
        self.epsilon = epsilon
        self.discount = discount
        self.adaptive = adaptive

    def fit(self, env, steps=1000):
        """
        Learn state-action values by interacting with `env` for `steps`
        environment steps (episodes are restarted whenever one ends).

        Arguments:
          env - Environment object. This method uses
            `env.observation_space.n`, `env.action_space.n`,
            `env.action_space.sample()`, `env.reset()` (returning the start
            state index) and `env.step(action)` (returning the 4-tuple
            `(observation, reward, done, info)`).
          steps - (int) Total number of environment steps to train for.

        Returns:
          state_action_values - (np.array) Learned values, one row per state
            and one column per action.
          rewards - (np.array) Array of length 100. Every
            `max(1, steps // 100)` steps the average reward per step over
            all steps taken so far is written to the next entry; entries
            that are never reached stay 0.
        """
        n_states = env.observation_space.n
        n_actions = env.action_space.n
        state_action_values = np.zeros((n_states, n_actions))
        N_actions_performed = np.zeros((n_actions, ), dtype=int)
        rewards = np.zeros((100, ))

        # Number of steps between two recorded reward averages. Clamped to 1
        # so that runs shorter than 100 steps still record something.
        block_size = max(1, steps // 100)
        steps_in_block = 0
        reward_sum = 0
        idx = 0

        # Steps 1 and 2 of the algorithm: initial state S and first action A.
        state = env.reset()
        action = env.action_space.sample()
        N_actions_performed[action] += 1

        for step in range(steps):
            # Step 3: take action A and observe R, S'.
            observation, reward, done, info = env.step(action)

            # When the episode ended, the next (S, A) pair belongs to a fresh
            # episode, so the follow-up action is drawn for the reset state.
            next_state = env.reset() if done else observation

            # Step 4: choose A' from S' (not from S) with the epsilon-greedy
            # policy.
            epsilon = self._get_epsilon(step / steps)
            next_action = self._choose_action(
                env, state_action_values[next_state], epsilon)
            N_actions_performed[next_action] += 1

            # Step 5: TD update. A terminal S' has no future value, so the
            # target must not bootstrap across the episode boundary.
            if done:
                future_value = 0.0
            else:
                future_value = state_action_values[next_state][next_action]
            # Step size decays with the number of times A has been selected.
            step_size = 1 / N_actions_performed[action]
            state_action_values[state][action] += step_size * (
                reward + self.discount * future_value -
                state_action_values[state][action])

            # Reward bookkeeping for the returned learning curve.
            reward_sum += reward
            steps_in_block += 1
            if steps_in_block == block_size:
                steps_in_block = 0
                # Guard: when `steps` is not a multiple of 100 there are more
                # than 100 blocks; the surplus ones are simply not recorded.
                if idx < rewards.size:
                    rewards[idx] = reward_sum / (step + 1)
                    idx += 1

            # Step 6: S = S', A = A'.
            state = next_state
            action = next_action

        return state_action_values, rewards

    def predict(self, env, state_action_values):
        """
        Run a single episode in `env`, always taking the greedy action with
        respect to `state_action_values`.

        The loop only stops once `env.step` reports `done`; there is no
        step limit.

        Arguments:
          env - Environment object providing `reset()` and `step(action)`
            (returning `(observation, reward, done, info)`).
          state_action_values - (np.array) Values indexed by
            [state][action], e.g. as returned by `fit`.

        Returns:
          states - (np.array) The observation received after each step.
          actions - (np.array) The action taken at each step.
          rewards - (np.array) The reward received at each step.
        """
        states, actions, rewards = [], [], []

        state = env.reset()

        while True:
            action = np.argmax(state_action_values[state])
            # take action and observe R, S'
            observation, reward, done, info = env.step(action)
            # set next state
            state = observation
            # record data
            states.append(observation)
            actions.append(action)
            rewards.append(reward)

            if done:
                break

        return np.array(states), np.array(actions), np.array(rewards)

    def _choose_action(self, env, action_values, epsilon):
        """
        Epsilon-greedy action selection for a single state.

        Arguments:
          env - Environment whose `action_space.sample()` supplies random
            actions.
          action_values - (np.array) Values of every action in the state.
          epsilon - (float) Probability of exploring.

        Returns:
          The greedy action, or a randomly sampled action when exploring or
          when all action values are equal (so that ties are not always
          resolved in favour of action 0).
        """
        explore = np.random.random() < epsilon
        all_equal = not np.any(action_values != action_values[0])
        if explore or all_equal:
            return env.action_space.sample()
        return np.argmax(action_values)

    def _get_epsilon(self, progress):
        """
        Return the exploration rate to use at `progress`, the fraction of
        training completed (0 at the start, approaching 1 at the end).
        """
        return self._adaptive_epsilon(
            progress) if self.adaptive else self.epsilon

    def _adaptive_epsilon(self, progress):
        """Return `epsilon` scaled linearly by the training that remains."""
        return (1 - progress) * self.epsilon


In [23]:
import gym
import slot_machines
%matplotlib notebook
import matplotlib.pyplot as plt
from learners.multi_armed_bandit import MultiArmedBandit

# Experiment: multi-armed bandit vs. SARSA on the slot-machine environment.
# Each of the 10 trials trains a fresh agent of each kind for 100000 steps on
# the same environment instance and keeps the returned reward curve.
env = gym.make('SlotMachines-v0')
b_trials, q_trials = [], []  # bandit curves, SARSA curves

for i in range(10):
    print(f'beginning trial {i + 1}')

    # Bandit first, then SARSA (same order in every trial).
    bandit_values, bandit_rewards = MultiArmedBandit().fit(env, steps=100000)
    b_trials.append(bandit_rewards)

    sarsa_values, sarsa_rewards = SARSA().fit(env, steps=100000)
    q_trials.append(sarsa_rewards)


## plotting
# Average the curves element-wise across trials before drawing them.
sarsa_curve = np.average(q_trials[:10], axis=0)
bandit_curve = np.average(b_trials[:10], axis=0)

plt.title('SlotMachines-v0')
plt.plot(sarsa_curve, label='10 trials averaged - SARSA')
plt.plot(bandit_curve, label='10 trials averaged - multi bandit')
plt.xlabel('s steps per tick')
plt.ylabel('reward average')
plt.legend()
plt.show()

beginning trial 1
beginning trial 2
beginning trial 3
beginning trial 4
beginning trial 5
beginning trial 6
beginning trial 7
beginning trial 8
beginning trial 9
beginning trial 10


<IPython.core.display.Javascript object>

In [32]:
import gym
%matplotlib notebook
import matplotlib.pyplot as plt
from learners.q_learning import QLearning


# Experiment: SARSA vs. Q-learning on FrozenLake.
# Each of the 10 trials trains a fresh agent of each kind for 100000 steps on
# the same environment instance and keeps the returned reward curve.
env = gym.make('FrozenLake-v0')
b_trials, q_trials = [], []  # SARSA curves, Q-learning curves

for i in range(10):
    print(f'beginning trial {i + 1}')

    # SARSA first, then Q-learning (same order in every trial).
    sarsa_values, sarsa_rewards = SARSA().fit(env, steps=100000)
    b_trials.append(sarsa_rewards)

    q_values, q_rewards = QLearning().fit(env, steps=100000)
    q_trials.append(q_rewards)


## plotting
# Average the curves element-wise across trials before drawing them.
sarsa_curve = np.average(b_trials[:10], axis=0)
q_learning_curve = np.average(q_trials[:10], axis=0)

plt.title('FrozenLake-v0')
plt.plot(sarsa_curve, label='10 trials averaged - SARSA')
plt.plot(q_learning_curve, label='10 trials averaged - Q learning')
plt.xlabel('s steps per tick')
plt.ylabel('reward average')
plt.legend()
plt.show()

beginning trial 1
beginning trial 2
beginning trial 3
beginning trial 4
beginning trial 5
beginning trial 6
beginning trial 7
beginning trial 8
beginning trial 9
beginning trial 10


<IPython.core.display.Javascript object>

In [24]:
import gym
%matplotlib notebook
import matplotlib.pyplot as plt

# Experiment: effect of the exploration rate on SARSA in FrozenLake.
# Three configurations are trained per trial, each for 100000 steps:
#   s_trials - fixed epsilon = 0.01
#   b_trials - fixed epsilon = 0.5
#   g_trials - epsilon = 0.5 with the adaptive schedule enabled
env = gym.make('FrozenLake-v0')
s_trials, b_trials, g_trials = [], [], []

for i in range(10):
    print(f'beginning trial {i + 1}')

    small_values, small_rewards = SARSA(epsilon=0.01).fit(env, steps=100000)
    s_trials.append(small_rewards)

    big_values, big_rewards = SARSA(epsilon=0.5).fit(env, steps=100000)
    b_trials.append(big_rewards)

    decay_values, decay_rewards = SARSA(
        epsilon=0.5, adaptive=True).fit(env, steps=100000)
    g_trials.append(decay_rewards)

## plotting
# Average the curves element-wise across trials before drawing them.
small_curve = np.average(s_trials[:10], axis=0)
big_curve = np.average(b_trials[:10], axis=0)
decay_curve = np.average(g_trials[:10], axis=0)

plt.plot(small_curve, label='10 trials averaged - epsilon = 0.01')
plt.plot(big_curve, label='10 trials averaged - epsilon = 0.5')
plt.plot(decay_curve, label='10 trials averaged - epsilon = 0.5, adaptive = True')
plt.title('FrozenLake-v0')
plt.xlabel('s steps per tick')
plt.ylabel('reward average')
plt.legend()
plt.show()

beginning trial 1
beginning trial 2
beginning trial 3
beginning trial 4
beginning trial 5
beginning trial 6
beginning trial 7
beginning trial 8
beginning trial 9
beginning trial 10


<IPython.core.display.Javascript object>

In [35]:
import gym
import slot_machines
%matplotlib notebook
import matplotlib.pyplot as plt
from learners.q_learning import QLearning

# Experiment: SARSA vs. Q-learning on CliffWalking.
# Each of the 10 trials trains a fresh agent of each kind for 100000 steps on
# the same environment instance and keeps the returned reward curve.
env = gym.make('CliffWalking-v0')
b_trials, q_trials = [], []  # SARSA curves, Q-learning curves

for i in range(10):
    print(f'beginning trial {i + 1}')

    # SARSA first, then Q-learning (same order in every trial).
    sarsa_values, sarsa_rewards = SARSA().fit(env, steps=100000)
    b_trials.append(sarsa_rewards)

    q_values, q_rewards = QLearning().fit(env, steps=100000)
    q_trials.append(q_rewards)


## plotting
# Average the curves element-wise across trials before drawing them.
sarsa_curve = np.average(b_trials[:10], axis=0)
q_learning_curve = np.average(q_trials[:10], axis=0)

plt.title('CliffWalking-v0')
plt.plot(sarsa_curve, label='10 trials averaged - SARSA')
plt.plot(q_learning_curve, label='10 trials averaged - Q learning')
plt.xlabel('s steps per tick')
plt.ylabel('reward average')
plt.legend()
plt.show()

beginning trial 1
beginning trial 2
beginning trial 3
beginning trial 4
beginning trial 5
beginning trial 6
beginning trial 7
beginning trial 8
beginning trial 9
beginning trial 10


<IPython.core.display.Javascript object>