<a href="https://colab.research.google.com/github/jbpacker/deep-rl-class/blob/main/unit2/HuggingFace_Unit_2_%F0%9F%9A%96.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Q-Learning with Taxi-v3 🚕

Adapted from the [Hugging Face Deep RL Class, Unit 2](https://colab.research.google.com/gist/simoninithomas/466c81aa1c2a07dd14793240c6d033c5/q-learning-with-taxi-v3.ipynb#scrollTo=lWYibdun-uDO).

[📜 Read the chapter](https://medium.com/@thomassimonini/an-introduction-to-deep-reinforcement-learning-17a565999c0c?source=friends_link&sk=1b1121ae5d9814a09ca38b47abc7dc61) 

[📹 Watch the chapter](https://youtu.be/q0BiUn5LiBc)

- Q-Learning Part 1

[📜 Read the chapter](https://medium.com/@thomassimonini/q-learning-lets-create-an-autonomous-taxi-part-1-2-3e8f5e764358)

[📹 Watch the chapter](https://youtu.be/230bR2DrbdE)

- Q-Learning Part 2

📜 Read the chapter *(link not available yet)*

In [2]:
%%capture
# Install the rendering-related packages, then start a virtual display.
# %%capture hides the (long) installer output of this cell.
!pip install pyglet==1.5.1 
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

# Virtual display
# The import has to come after the pip install above, which provides pyvirtualdisplay.
from pyvirtualdisplay import Display

# visible=0 requests a non-visible display of 1400x900 pixels.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

In [1]:
%%capture
# Python dependencies used by the cells below; %%capture hides the installer output.
!pip install git+https://github.com/openai/gym.git # We install gym using git since Taxi-v3 "rgb_array version" is not on PyPi release
!pip install pygame
!pip install numpy

# NOTE(review): apart from pyyaml these installs are unpinned, so re-running later may pull different versions.
!pip install huggingface_hub
!pip install pickle5
!pip install pyyaml==6.0 # avoid key error metadata

In [14]:
import numpy as np
import gym
import random
import imageio
import os
import scipy

import pickle5 as pickle

For virtual display

# Taxi

## Step 1: Create the environment 🕹️


In [None]:
env = gym.make("Taxi-v3")
# Reset before rendering: the environment only has a well-defined state to draw
# after reset(), and newer gym releases reject render() calls made before it.
env.reset()
env.render()

+---------+
|R: | : :[35mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

there are 500 possible states
there are 6 possible actions


## Step 2: Create the Q-table and initialize it 🗄️

In [None]:
# Number of discrete states / actions exposed by `env`.
state_space = env.observation_space.n
action_space = env.action_space.n
print("there are {} possible states".format(state_space))
print("there are {} possible actions".format(action_space))
# Q-table: one row per state, one column per action, all values start at zero.
Q = np.zeros((state_space, action_space))

there are 500 possible states
there are 6 possible actions


## Step 3: Define the hyperparameters ⚙️

In [None]:
# Hyperparameters for the Taxi-v3 Q-learning run.
# NOTE: `max_steps` and `gamma` are reassigned later by the Frozen Lake setup cell,
# so re-run this cell before re-running the Taxi cells.
total_episodes = 25000        # Total number of training episodes
total_test_episodes = 100     # Total number of test episodes
max_steps = 200               # Max steps per episode

learning_rate = 0.01          # Learning rate (step size of the Q-table update)
gamma = 0.99                  # Discounting rate

# Exploration parameters
epsilon = 1.0                 # Exploration rate (recomputed every episode by the training loop)
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.001            # Minimum exploration probability
decay_rate = 0.01             # Exponential decay rate for exploration prob

## Step 4: Define the epsilon-greedy policy 🤖

In [None]:
def epsilon_greedy_policy(Q, state, epsilon):
  """Pick an action for `state` with an epsilon-greedy rule.

  A uniform draw from [0, 1] is compared with `epsilon`. When the draw is
  strictly larger, the action with the highest value in `Q[state]` is returned
  (exploit); otherwise a random action is sampled from the action space of the
  notebook-level `env` (explore).
  """
  draw = random.uniform(0, 1)
  exploit = draw > epsilon
  if not exploit:
    return env.action_space.sample()
  return np.argmax(Q[state])

In [None]:
# Tabular Q-learning on Taxi-v3: one pass of this loop is one training episode.
for episode in range(total_episodes):
  state = env.reset()
  # Exploration decays exponentially from max_epsilon towards min_epsilon.
  epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)

  done = False
  step = 0
  running_reward = 0
  # Strict `<` so that an episode never takes more than max_steps steps.
  while not done and step < max_steps:
    action = epsilon_greedy_policy(Q, state, epsilon)
    next_state, reward, done, _ = env.step(action)
    running_reward += reward

    # Q-learning update: move Q[state, action] towards the bootstrapped target
    # reward + gamma * max_a Q[next_state, a].
    q_max_next = np.max(Q[next_state])
    td_target = reward + gamma * q_max_next
    td_error = td_target - Q[state, action]
    Q[state, action] = Q[state, action] + learning_rate * td_error

    state = next_state
    step += 1

  # Report progress at the end of the episode so the printed index matches the
  # episode the statistics belong to.
  if episode % 1000 == 0:
    print("[{}] rew = {}, steps = {}".format(episode, running_reward, step))

# `env` is deliberately left open here: the evaluation cell below reuses it and
# closes it when it is done.


## Step 6: Let's watch our autonomous taxi 🚖 

In [None]:
import time
rewards = []   # total reward of every evaluation episode
frames = []

# Lower this (e.g. to 2) for a quick smoke test; the average below adapts automatically.
episodes = total_test_episodes

for episode in range(episodes):
    state = env.reset()
    total_rewards = 0
    print("****************************************************")
    print("EPISODE ", episode)
    for step in range(max_steps):
        # Take the action (index) that have the maximum expected future reward given that state
        action = np.argmax(Q[state])
        new_state, reward, done, info = env.step(action)
        total_rewards += reward

        if done:
            break
        state = new_state

    # Record every episode, including one that used up max_steps without finishing,
    # so the average is taken over exactly the episodes that were run.
    rewards.append(total_rewards)
    print ("Score", total_rewards)
env.close()
# Divide by the number of recorded episodes (guarded against an empty run).
print ("Score over time: " +  str(sum(rewards)/max(len(rewards), 1)))

# Frozen Lake

## Step 1: Create Env

In [6]:
# FrozenLake on the 4x4 map with slipping disabled (is_slippery=False).
lake_env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=False)

In [8]:
# Inspect the environment: print each space together with one random sample from it.
lake_env.reset()
print("Observation Space", lake_env.observation_space)
print("Sample observation", lake_env.observation_space.sample()) # Get a random observation
print("Action Space", lake_env.action_space)
print("Action Sample", lake_env.action_space.sample()) # Get a random action

Observation Space Discrete(16)
Sample observation 6
Action Space Discrete(4)
Action Sample 2


## Step 2: Create Q Table

In [12]:
# Q-table for FrozenLake: one row per state, one column per action, initialised to zero.
lake_Q = np.zeros((lake_env.observation_space.n, lake_env.action_space.n))
print(lake_Q.shape)
print(lake_Q)

(16, 4)
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


## Step 3: Define policy

In [17]:
def eps_greedy(state, Q, eps):
  """Choose an action, where `eps` is the probability of acting greedily.

  Note that this is the reverse of the usual epsilon-greedy convention: when a
  draw from random.random() is strictly greater than `eps`, a random action is
  sampled from the notebook-level `lake_env`; otherwise the argmax of
  `Q[state]` is returned. So eps=0 is (almost) always random and eps=1 is
  always greedy.
  """
  draw = random.random()
  act_randomly = draw > eps
  if not act_randomly:
    return np.argmax(Q[state])
  return lake_env.action_space.sample()

## Step 4: Initialize the hyperparameters and the Q-table

In [62]:
# `import scipy` alone does not guarantee that the `interpolate` submodule is
# loaded (older SciPy releases need the explicit import), so import it here.
import scipy.interpolate

# `eps` is the probability of taking the GREEDY action (see eps_greedy), so the
# schedule starts fully random (0) and moves linearly towards fully greedy (1).
eps_start = 0
eps_end = 1
max_steps = 400   # max steps per episode (overrides the Taxi value)
epochs = 10000    # number of training episodes

gamma = 0.99      # discount factor
# lr = 0.005
lr = 0.7          # step size of the Q-table update

# Linear schedule: maps an epoch index in [0, epochs] to a value in [eps_start, eps_end].
eps_interp = scipy.interpolate.interp1d([0, epochs], [eps_start, eps_end])

# reset Q table to be certain
lake_Q = np.zeros((lake_env.observation_space.n, lake_env.action_space.n))


## Step 5: Train

In [63]:
def train(epochs, max_steps, eps_func, env, Q):
  """Run tabular Q-learning on `env`, updating `Q` in place and returning it.

  Args:
    epochs: number of training episodes.
    max_steps: maximum number of steps taken in one episode.
    eps_func: callable mapping the epoch index to the `eps` value handed to
      eps_greedy.
    env: environment to train on; it is reset once per epoch and stepped with
      the chosen actions.
    Q: Q-table indexed as Q[state, action].

  Returns:
    The same `Q` object, after the updates.

  The discount factor `gamma` and step size `lr` are read from the notebook
  globals. Random actions are still drawn inside eps_greedy, which samples from
  the notebook-level `lake_env`.
  """
  for epoch in range(epochs):
    eps = eps_func(epoch)
    # Use the environment that was passed in rather than a notebook global.
    state = env.reset()
    for _ in range(max_steps):
      # sample action and step
      action = eps_greedy(state, Q, eps)
      next_state, reward, done, _info = env.step(action)

      # Q-learning update towards reward + gamma * max_a Q[next_state, a]
      td_target = reward + gamma * np.max(Q[next_state])
      td_error = td_target - Q[state, action]
      Q[state, action] = Q[state, action] + lr * td_error

      if done:
        break

      state = next_state
  return Q


In [64]:
# Train on FrozenLake with the linear eps schedule; train() updates and returns the Q-table.
lake_Q = train(epochs, max_steps, eps_interp, lake_env, lake_Q)
print(lake_Q)

[[0.94148015 0.95099005 0.95099005 0.94148015]
 [0.94148015 0.         0.96059601 0.95099005]
 [0.95099005 0.970299   0.95099005 0.96059601]
 [0.96059601 0.         0.95099005 0.95099005]
 [0.95099005 0.96059601 0.         0.94148015]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.         0.96059601]
 [0.         0.         0.         0.        ]
 [0.96059601 0.         0.970299   0.95099005]
 [0.96059601 0.9801     0.9801     0.        ]
 [0.970299   0.99       0.         0.970299  ]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.9801     0.99       0.970299  ]
 [0.9801     0.99       1.         0.9801    ]
 [0.         0.         0.         0.        ]]


## Step 6: Evaluate

In [60]:
def eval_agent(env, max_steps, epochs, Q, seed = None):
  """Evaluate the greedy policy of `Q` on `env`.

  Args:
    env: environment to evaluate on.
    max_steps: maximum number of steps taken in one episode.
    epochs: number of evaluation episodes.
    Q: Q-table; the action taken in a state is the argmax of Q[state].
    seed: optional sequence of seeds, indexed by episode, forwarded to
      env.reset(seed=...). When None or empty the environment is reset
      without a seed.

  Returns:
    (mean_reward, std_reward) over the total reward of every episode.
  """
  # `is not None` plus a length check works for lists and for numpy arrays,
  # whose truth value is ambiguous.
  use_seed = seed is not None and len(seed) > 0

  rewards = []
  for epoch in range(epochs):
    if use_seed:
      state = env.reset(seed=seed[epoch])
    else:
      state = env.reset()

    epoch_rew = 0

    for _ in range(max_steps):
      action = np.argmax(Q[state])
      state, reward, done, _info = env.step(action)

      epoch_rew += reward

      if done:
        break

    # Record every episode, including one cut off by max_steps, so unfinished
    # episodes are not silently dropped from the statistics.
    rewards.append(epoch_rew)

  print(rewards)
  mean_reward = np.mean(rewards)
  std_reward = np.std(rewards)
  return mean_reward, std_reward

In [65]:
# Evaluate the greedy policy for a single episode (epochs=1) and report the reward statistics.
mean, std = eval_agent(lake_env, max_steps, 1, lake_Q)
print("mean: {} std: {}".format(mean, std))

[1.0]
mean: 1.0 std: 0.0


## Step 7: Record a replay video

In [66]:
%%capture
from huggingface_hub import HfApi, HfFolder, Repository
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json

In [67]:
def record_video(env, Qtable, out_directory, fps=1):
  """Roll out the greedy policy for one episode and save the frames as a video.

  The environment is reset with a random seed in [0, 500]. One 'rgb_array'
  frame is rendered after the reset and after every step until the environment
  reports done. The collected frames are then written with imageio.mimsave to
  `out_directory` (which is passed as the output path) at `fps` frames per
  second.
  """
  state = env.reset(seed=random.randint(0,500))
  frames = [env.render(mode='rgb_array')]
  finished = False
  while not finished:
    # Greedy action: index of the largest Q-value for the current state.
    greedy_action = np.argmax(Qtable[state][:])
    # The returned observation directly becomes the current state.
    state, _reward, finished, _info = env.step(greedy_action)
    frames.append(env.render(mode='rgb_array'))
  imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)

In [69]:
# NOTE(review): hard-coded absolute path; presumably the Colab working directory — adjust when running elsewhere.
video_path =  "/content/replay.mp4"
# Record one greedy FrozenLake episode at 1 frame per second.
record_video(lake_env, lake_Q, video_path, 1)

  self._proc.stdin.write(im.tostring())
