<div class="alert alert-primary alert-info">
    
# Reinforcement Learning - Mountain Car

</div>

---
<div class="alert alert-block alert-success">

- ### SARSA ($\lambda = 0$)

- ### Q-Learning

</div>

---

In [1]:
%config IPCompleter.greedy=True
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from IPython.display import Video

import gym

import abc

import warnings
warnings.filterwarnings('ignore')

<img src='utils/MountainCar_T0.gif'/>

In [2]:
class Agent(abc.ABC):
    """Base class for tabular, epsilon-greedy agents on a 2-D observation space.

    Each observation component is discretized onto ``num_bins`` evenly spaced
    bin edges spanning the environment's observation bounds. The agent keeps an
    action-value table and a greedy policy table indexed by the resulting pair
    of bin indices. Subclasses supply the update rule by implementing ``learn``.

    Parameters
    ----------
    env : environment object
        Must expose ``observation_space.low`` / ``observation_space.high``
        (indexable, at least two entries) and ``action_space.n``.
    num_bins : int
        Number of bin edges per observation dimension.
    max_episodes : int
        Number of training episodes; used only to size the epsilon decay here.
    """

    def __init__(self, env, num_bins, max_episodes):

        self.env = env

        self.num_bins = num_bins
        self.max_episodes = max_episodes
        # NOTE(review): presumably matches the environment's per-episode step limit -- confirm.
        self.max_episode_length = 200

        # Bin edges for the first (position) and second (velocity) observation components.
        self.position_array = np.linspace(env.observation_space.low[0], env.observation_space.high[0], num=self.num_bins, endpoint=True)
        self.velocity_array = np.linspace(env.observation_space.low[1], env.observation_space.high[1], num=self.num_bins, endpoint=True)

        # np.digitize over `num_bins` edges yields indices in 0..num_bins inclusive,
        # hence `num_bins + 1` slots per dimension.
        self.action_state_values = np.zeros((self.num_bins + 1, self.num_bins + 1, env.action_space.n))
        self.policy = np.random.randint(low=0, high=env.action_space.n, size=(self.num_bins + 1, self.num_bins + 1))

        self.eta = 0.05    # learning rate
        self.gamma = 0.9   # discount factor

        # Linear epsilon schedule: epsilon drops by `epsilon_decay` on every
        # call to `select_action` until it reaches `epsilon_min`.
        self.epsilon = 1.0
        self.epsilon_min = 0.005
        self.epsilon_decay = 500 * self.epsilon_min / (self.max_episodes * self.max_episode_length)

    def digitize(self, obs):
        """Map a continuous observation to a ``(position_bin, velocity_bin)`` index pair."""
        return np.digitize(obs[0], self.position_array), np.digitize(obs[1], self.velocity_array)

    def select_action(self, obs):
        """Pick an action epsilon-greedily and advance the epsilon schedule by one step.

        With probability ``1 - epsilon`` the action stored in the policy table
        for the discretized observation is returned; otherwise an action is
        drawn uniformly from this agent's own action space.
        """
        digitized_obs = self.digitize(obs)
        self.epsilon = max(self.epsilon_min, self.epsilon - self.epsilon_decay)
        if np.random.random() > self.epsilon:
            return self.policy[digitized_obs]
        # Explore using the agent's environment, not a notebook-level global `env`,
        # so the class works wherever it is instantiated.
        return np.random.choice(self.env.action_space.n)

    @abc.abstractmethod
    def learn(self, obs, action, reward, next_obs):
        """Update the value and policy tables from one observed transition."""
        pass

---

### SARSA(0)

In [3]:
class SARSA_Agent(Agent):
    """Tabular agent using a one-step SARSA-style temporal-difference update."""

    def learn(self, obs, action, reward, next_obs):
        """Apply one TD update for the transition ``(obs, action, reward, next_obs)``.

        The bootstrap term uses the action currently stored in the policy table
        for the next state (not a freshly sampled epsilon-greedy action). After
        the value update, the policy entry for the current state is set to the
        greedy action.
        """
        state = self.digitize(obs)
        next_state = self.digitize(next_obs)
        q_table = self.action_state_values

        # Value of the policy table's action in the successor state.
        bootstrap = q_table[next_state][self.policy[next_state]]

        td_error = reward + self.gamma * bootstrap - q_table[state][action]
        q_table[state][action] += self.eta * td_error

        # Keep the policy table greedy with respect to the updated values.
        self.policy[state] = np.argmax(q_table[state])

---

### Q-Learning

In [4]:
class Q_Agent(Agent):
    """Tabular agent using the one-step Q-learning update."""

    def learn(self, obs, action, reward, next_obs):
        """Apply one Q-learning update for ``(obs, action, reward, next_obs)``.

        The bootstrap term is the maximum action value in the successor state.
        After the value update, the policy entry for the current state is set
        to the greedy action.
        """
        state = self.digitize(obs)
        next_state = self.digitize(next_obs)
        q_table = self.action_state_values

        # Off-policy target: best achievable value from the successor state.
        best_next_value = np.max(q_table[next_state])

        td_error = reward + self.gamma * best_next_value - q_table[state][action]
        q_table[state][action] += self.eta * td_error

        # Keep the policy table greedy with respect to the updated values.
        self.policy[state] = np.argmax(q_table[state])

---

### Train and Test

In [5]:
def train(agent, log_every=10_000):
    """Run ``agent.max_episodes`` training episodes and return the greedy policy.

    Parameters
    ----------
    agent : object
        Must provide ``env`` (with ``reset()`` returning an observation and
        ``step(action)`` returning ``(next_obs, reward, done, info)``),
        ``max_episodes``, ``select_action(obs)``, ``learn(obs, action, reward,
        next_obs)`` and a 3-D ``action_state_values`` array.
    log_every : int, optional
        Print a progress line every ``log_every`` episodes (starting with the
        first). A falsy value disables the periodic lines; the final summary
        line is always printed.

    Returns
    -------
    numpy.ndarray
        ``argmax`` of ``agent.action_state_values`` over the action axis.
    """
    best_reward = np.finfo(np.float64).min
    # Bound up front so the summary line below also works when no episode runs.
    reward_per_episode = 0.0
    for episode_idx in range(agent.max_episodes):
        done, reward_per_episode = False, 0.0
        obs = agent.env.reset()
        while not done:
            action = agent.select_action(obs)
            next_obs, reward, done, info = agent.env.step(action)
            agent.learn(obs, action, reward, next_obs)
            obs = next_obs
            reward_per_episode += reward
        best_reward = max(best_reward, reward_per_episode)
        if log_every and episode_idx % log_every == 0:
            print(f'Episode: {episode_idx+1}, Reward: {reward_per_episode}, Best_reward: {best_reward}')
    print(f'Episode: {agent.max_episodes}, Reward: {reward_per_episode}, Best_reward: {best_reward}\n')
    return np.argmax(agent.action_state_values, axis=2)


def test(agent, env, policy):
    done, total_reward = False, 0.0
    obs = env.reset()
    while not done:
        action = policy[agent.digitize(obs)]
        next_obs, reward, done, info = env.step(action)
        obs = next_obs
        total_reward += reward
    return total_reward

---

### Main

In [6]:
if __name__ == "__main__":

    max_episodes = np.arange(10_000, 100_001, 10_000, dtype=np.int32)
    num_bins = np.arange(20, 40, 5, dtype=np.int32)

    # Each agent is trained and evaluated in turn; recordings of the evaluation
    # runs go to the paired output directory.
    for agent_cls, output_dir in ((SARSA_Agent, './sarsa_output'),
                                  (Q_Agent, './q_output')):

        # Train on a fresh, unwrapped environment so that training episodes are
        # never written into another agent's Monitor directory and the
        # environment is never wrapped by two Monitors at once.
        env = gym.make('MountainCar-v0')

        agent = agent_cls(env, num_bins[0], max_episodes[-1])
        policy = train(agent)

        # Only the evaluation rollouts are recorded.
        env = gym.wrappers.Monitor(env, output_dir, force=True)
        try:
            for _ in range(1000):
                test(agent, env, policy)
        finally:
            # Close this agent's recorder before moving on to the next agent.
            env.close()

Episode: 1, Reward: -200.0, Best_reward: -200.0
Episode: 10001, Reward: -200.0, Best_reward: -200.0
Episode: 20001, Reward: -200.0, Best_reward: -159.0
Episode: 30001, Reward: -200.0, Best_reward: -118.0
Episode: 40001, Reward: -156.0, Best_reward: -86.0
Episode: 50001, Reward: -141.0, Best_reward: -86.0
Episode: 60001, Reward: -141.0, Best_reward: -86.0
Episode: 70001, Reward: -147.0, Best_reward: -86.0
Episode: 80001, Reward: -144.0, Best_reward: -86.0
Episode: 90001, Reward: -144.0, Best_reward: -86.0
Episode: 100000, Reward: -140.0, Best_reward: -86.0

Episode: 1, Reward: -200.0, Best_reward: -200.0
Episode: 10001, Reward: -200.0, Best_reward: -200.0
Episode: 20001, Reward: -200.0, Best_reward: -149.0
Episode: 30001, Reward: -153.0, Best_reward: -117.0
Episode: 40001, Reward: -149.0, Best_reward: -88.0
Episode: 50001, Reward: -174.0, Best_reward: -88.0
Episode: 60001, Reward: -189.0, Best_reward: -86.0
Episode: 70001, Reward: -150.0, Best_reward: -86.0
Episode: 80001, Reward: -152.

---

<div class="alert alert-danger" role="alert">

### Q Agent
    
</div>

- #### During training

<img src='utils/Q_Agent_Intermediate.gif'/>

- #### After training

<img src='utils/Q_Agent.gif'/>

---

<div class="alert alert-danger" role="alert">

### SARSA Agent
    
</div>

- #### During training

<img src='utils/SARSA_Agent_Intermediate.gif'/>

- #### After training

<img src='utils/SARSA_Agent.gif'/>