<div class="alert alert-primary alert-info">
    
# Reinforcement Learning - Mountain Car

</div>

---
<div class="alert alert-block alert-success">

- ### SARSA ($\lambda = 0$)

- ### Q-Learning

</div>

---

In [1]:
%config IPCompleter.greedy=True
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Video

import gym

<img src='utils/MountainCar_T0.gif'/>

---

An episode terminates when:
- Car position $\geq$ 0.5 (the goal is reached)
- Episode length reaches 200 steps

---

### SARSA(0)

In [3]:
class SARSA_Agent:
    """Tabular one-step TD agent with an explicitly stored policy table.

    The two-component observation is discretized with ``np.digitize`` against
    evenly spaced bin edges, and action values are kept in a
    ``(num_bins + 1, num_bins + 1, n_actions)`` table.

    Note: ``learn`` bootstraps from the action stored in ``self.policy`` for
    the next state (the current greedy action for every state that has been
    updated at least once), not from the action that the epsilon-greedy
    behaviour will actually take next.
    """

    def __init__(self, env, num_bins, max_episodes):
        """Build the bin edges, the value table and a random initial policy.

        Args:
            env: Environment exposing ``observation_space.low`` / ``.high``
                (at least two components) and ``action_space.n``.
            num_bins: Number of bin edges per observation dimension.
            max_episodes: Number of training episodes; also used to scale the
                per-step epsilon decrement.
        """
        self.env = env

        self.num_bins = num_bins
        self.max_episodes = max_episodes
        self.max_episode_length = 200

        low, high = env.observation_space.low, env.observation_space.high
        self.position_array = np.linspace(low[0], high[0], num=self.num_bins, endpoint=True)
        self.velocity_array = np.linspace(low[1], high[1], num=self.num_bins, endpoint=True)

        # np.digitize yields indices in [0, num_bins], hence num_bins + 1 cells per axis.
        table_shape = (self.num_bins + 1, self.num_bins + 1)
        self.action_state_values = np.zeros(table_shape + (env.action_space.n,))
        self.policy = np.random.randint(low=0, high=env.action_space.n, size=table_shape)

        self.alpha = 0.05  # learning rate
        self.gamma = 0.9   # discount factor

        # Epsilon is decreased by a fixed amount on every select_action call.
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        self.epsilon_decay = 500 * self.epsilon_min / (self.max_episodes * self.max_episode_length)

    def digitize(self, obs):
        """Return the ``(position_bin, velocity_bin)`` table index for ``obs``."""
        return np.digitize(obs[0], self.position_array), np.digitize(obs[1], self.velocity_array)

    def select_action(self, obs):
        """Pick an action epsilon-greedily and decay epsilon by one step.

        With probability ``1 - epsilon`` the action stored in ``self.policy``
        is returned; otherwise a uniformly random action is drawn.
        """
        discrete_obs = self.digitize(obs)
        # Clamp so epsilon never drops below its floor (same form as Q_Agent).
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        if np.random.random() > self.epsilon:
            return self.policy[discrete_obs]
        # Use the agent's own environment rather than a module-level global.
        return np.random.choice(self.env.action_space.n)

    def learn(self, obs, action, reward, next_obs):
        """Apply one TD update for ``(obs, action, reward, next_obs)``.

        The target is ``reward + gamma * Q[next_obs][policy[next_obs]]``.
        After the update the stored policy for ``obs`` is set to the greedy
        action of the updated values.
        """
        discrete_obs = self.digitize(obs)
        discrete_next_obs = self.digitize(next_obs)
        next_action = self.policy[discrete_next_obs]
        td_error = reward + \
                   self.gamma * self.action_state_values[discrete_next_obs][next_action] - \
                   self.action_state_values[discrete_obs][action]
        self.action_state_values[discrete_obs][action] += self.alpha * td_error
        self.policy[discrete_obs] = np.argmax(self.action_state_values[discrete_obs])

---

### Q-Learning

In [4]:
class Q_Agent:
    """Tabular Q-learning agent over a discretized two-component observation.

    Observations are mapped to table indices with ``np.digitize`` against
    evenly spaced bin edges; action values live in a
    ``(num_bins + 1, num_bins + 1, n_actions)`` table.
    """

    def __init__(self, env, num_bins, max_episodes):
        """Build the bin edges and the zero-initialised value table.

        Args:
            env: Environment exposing ``observation_space.low`` / ``.high``
                (at least two components) and ``action_space.n``.
            num_bins: Number of bin edges per observation dimension.
            max_episodes: Number of training episodes; also used to scale the
                per-step epsilon decrement.
        """
        self.env = env

        self.num_bins = num_bins
        self.max_episodes = max_episodes
        self.max_episode_length = 200

        low, high = env.observation_space.low, env.observation_space.high
        self.position_array = np.linspace(low[0], high[0], num=self.num_bins, endpoint=True)
        self.velocity_array = np.linspace(low[1], high[1], num=self.num_bins, endpoint=True)

        # np.digitize yields indices in [0, num_bins], hence num_bins + 1 cells per axis.
        self.action_state_values = np.zeros((self.num_bins + 1, self.num_bins + 1, env.action_space.n))

        self.alpha = 0.05  # learning rate
        self.gamma = 0.9   # discount factor

        # Epsilon is decreased by a fixed amount on every select_action call.
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        self.epsilon_decay = 500 * self.epsilon_min / (self.max_episodes * self.max_episode_length)

    def digitize(self, obs):
        """Return the ``(position_bin, velocity_bin)`` table index for ``obs``."""
        return np.digitize(obs[0], self.position_array), np.digitize(obs[1], self.velocity_array)

    def select_action(self, obs):
        """Pick an action epsilon-greedily and decay epsilon by one step.

        With probability ``1 - epsilon`` the greedy action of the current
        value table is returned; otherwise a uniformly random action is drawn.
        """
        discrete_obs = self.digitize(obs)
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        if np.random.random() > self.epsilon:
            return np.argmax(self.action_state_values[discrete_obs])
        # Use the agent's own environment rather than a module-level global.
        return np.random.choice(self.env.action_space.n)

    def learn(self, obs, action, reward, next_obs):
        """Apply one Q-learning update for ``(obs, action, reward, next_obs)``.

        The target is ``reward + gamma * max_a Q[next_obs][a]``.
        """
        discrete_obs = self.digitize(obs)
        discrete_next_obs = self.digitize(next_obs)
        td_error = reward + \
                   self.gamma * np.max(self.action_state_values[discrete_next_obs]) - \
                   self.action_state_values[discrete_obs][action]
        self.action_state_values[discrete_obs][action] += self.alpha * td_error

---

### Train and Test

In [5]:
def train(agent):
    """Train ``agent`` on ``agent.env`` for ``agent.max_episodes`` episodes.

    Each step asks the agent for an action, applies it to the environment and
    feeds the resulting transition to ``agent.learn``. A progress line is
    printed after every episode.

    Returns:
        The greedy policy table: ``argmax`` of ``agent.action_state_values``
        over its last (action) axis.
    """
    best_reward = np.finfo(np.float64).min
    for episode_number in range(1, agent.max_episodes + 1):
        episode_return = 0.0
        state = agent.env.reset()
        while True:
            chosen_action = agent.select_action(state)
            successor, reward, finished, _ = agent.env.step(chosen_action)
            agent.learn(state, chosen_action, reward, successor)
            state = successor
            episode_return += reward
            if finished:
                break
        # Track the highest episode return seen so far.
        if episode_return > best_reward:
            best_reward = episode_return
        print('Episode: {}, Reward: {}, Best_reward: {}'.format(episode_number, episode_return, best_reward))
    return np.argmax(agent.action_state_values, axis=2)


def test(agent, env, policy):
    done, total_reward = False, 0.0
    obs = env.reset()
    while not done:
        action = policy[agent.digitize(obs)]
        next_obs, reward, done, info = env.step(action)
        obs = next_obs
        total_reward += reward
    return total_reward

---

### Main

In [6]:
if __name__ == "__main__":

    # Candidate hyper-parameter grids; only the first entry of each is used below.
    max_episodes = np.arange(50000, 100000, 10000, dtype=np.int32)
    num_bins = np.arange(20, 40, 5, dtype=np.int32)

    test_episodes = 1000

    # (agent class, directory receiving the Monitor recordings of its test runs)
    experiments = (
        (SARSA_Agent, './sarsa_output'),
        (Q_Agent, './q_output'),
    )

    for agent_class, output_dir in experiments:
        # Every agent trains on its own fresh, unwrapped environment, so no
        # training episode is recorded and no Monitor is ever wrapped inside
        # another Monitor.
        env = gym.make('MountainCar-v0')
        agent = agent_class(env, num_bins[0], max_episodes[0])
        policy = train(agent)

        # Wrap only for evaluation: just the test episodes land in output_dir.
        env = gym.wrappers.Monitor(env, output_dir, force=True)
        try:
            for _ in range(test_episodes):
                test(agent, env, policy)
        finally:
            # Closing the wrapper flushes the recordings and closes the wrapped env.
            env.close()

---

<div class="alert alert-danger" role="alert">

### Q Agent
    
</div>

- #### During training (On the way)

<img src='utils/Q_Agent_Intermediate.gif'/>

- #### After training

<img src='utils/Q_Agent.gif'/>

---

<div class="alert alert-danger" role="alert">

### SARSA Agent
    
</div>

- #### During training (On the way)

<img src='utils/SARSA_Agent_Intermediate.gif'/>

- #### After training

<img src='utils/SARSA_Agent.gif'/>