In [1]:
%%capture
!pip install pyglet==1.5.1
!apt install python-opengl
!apt install ffmpeg
!sudo apt-get update
!apt install xvfb
!pip3 install pyvirtualdisplay

In [2]:
%%capture
# Python packages used by the notebook. Only gym and pyyaml are pinned here;
# the remaining packages install whatever version pip resolves.
!pip install gym==0.24
!pip install pygame
!pip install numpy
!pip install huggingface_hub
!pip install pickle5
!pip install pyyaml==6.0
!pip install imageio imageio_ffmpeg

In [3]:
# Virtual Display
# Start a 1400x900 pyvirtualdisplay Display with visible=0 before any
# environment is rendered.
from pyvirtualdisplay import Display
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

<pyvirtualdisplay.display.Display at 0x7f3765234890>

In [4]:
import numpy as np
import gym
import random
import imageio
import os

import pickle5 as pickle

In [None]:
# Create the FrozenLake-v1 environment on the 4x4 map with is_slippery disabled.
env = gym.make('FrozenLake-v1', map_name = '4x4', is_slippery = False)

In [None]:
# Reset the environment, then show its observation space and one random sample from it.
env.reset()
print(f'Observation space: {env.observation_space}')
print(f'Sample observation: {env.observation_space.sample()}')

Observation space: Discrete(16)
Sample observation: 7


In [None]:
# Show the action space and one random sample from it.
print(f'Action space: {env.action_space}')
print(f'Sample action: {env.action_space.sample()}')

Action space: Discrete(4)
Sample action: 0


In [None]:
# Number of discrete states / actions; these become the Q-table dimensions.
state_space = env.observation_space.n
action_space = env.action_space.n

print(f'There are {state_space} possible states')
print(f'There are {action_space} possible actions')

There are 16 possible states
There are 4 possible actions


In [11]:
# Create and initialize Q table of size(state_space, action_space)
def initialize_q_table(state_space, action_space):
  """
  Build an all-zero Q-table.

  :param state_space: Number of states (rows of the table)
  :param action_space: Number of actions (columns of the table)
  :return: A NumPy array of zeros with shape ``(state_space, action_space)``
  """
  return np.zeros(shape=(state_space, action_space))


In [None]:
# Zero-initialised Q-table for FrozenLake, sized from the state/action counts above.
Qtable_frozenlake = initialize_q_table(state_space, action_space)

In [7]:
def epsilon_greedy_policy(Qtable, state, epsilon):
  """
  Pick an action for ``state`` with an epsilon-greedy strategy.

  A number is drawn uniformly from [0, 1]. If it is greater than ``epsilon``
  the action with the highest Q-value for ``state`` is exploited; otherwise a
  uniformly random action index is explored.

  :param Qtable: Q-table indexed as ``Qtable[state][action]``
  :param state: Index of the current state
  :param epsilon: Exploration probability
  :return: Index of the selected action
  """
  # Random number greater than epsilon --> exploitation
  if random.uniform(0, 1) > epsilon:
    # Take the action with the highest value for the given state
    return np.argmax(Qtable[state])

  # Else --> exploration. The random action is drawn from the action indices
  # of this Q-table row instead of from a notebook-level `env`, so the policy
  # has no hidden dependency and always matches the table it was given.
  return random.randrange(len(Qtable[state]))

In [8]:
def greedy_policy(Qtable, state):
  """
  Pure exploitation policy.

  :param Qtable: Q-table indexed as ``Qtable[state][action]``
  :param state: Index of the current state
  :return: Index of the action with the highest value for ``state``
  """
  return np.argmax(Qtable[state])

In [None]:
# ---------- Hyperparameters ----------

# Training
n_training_episodes = 100_000   # number of episodes used for training
learning_rate = 0.03            # step size of the Q-value update

# Evaluation
n_eval_episodes = 1_000         # number of episodes used for evaluation

# Environment
env_id = 'FrozenLake-v1'
max_steps = 70                  # cap on the number of steps in one episode
gamma = 0.95                    # discounting rate
eval_seed = []                  # evaluation seeds; empty means reset without a seed

# Exploration
epsilon = 1.0                   # exploration rate
max_epsilon = 1.0               # exploration probability at the start
min_epsilon = 0.05              # minimum exploration probability
decay_rate = 0.0005             # rate of the exponential decay of epsilon

In [9]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable,
          learning_rate=None, gamma=None):
  """
  Train ``Qtable`` in place with tabular Q-learning and return it.

  :param n_training_episodes: Number of episodes to run
  :param min_epsilon: Lower bound of the exploration probability
  :param max_epsilon: Exploration probability used at episode 0
  :param decay_rate: Rate of the exponential decay applied to epsilon per episode
  :param env: Environment whose ``reset()`` returns a state and whose
              ``step(action)`` returns ``(new_state, reward, done, info)``
  :param max_steps: Maximum number of steps per episode
  :param Qtable: Q-table indexed as ``Qtable[state][action]``; updated in place
  :param learning_rate: Step size of the update. When ``None`` (the default) the
                        notebook-level ``learning_rate`` variable is used, as before
  :param gamma: Discounting rate. When ``None`` (the default) the notebook-level
                ``gamma`` variable is used, as before
  :return: The same ``Qtable`` object after training
  """
  # Passing the hyperparameters explicitly avoids training with stale
  # notebook state; the fallback keeps existing 7-argument calls working.
  if learning_rate is None:
    learning_rate = globals()["learning_rate"]
  if gamma is None:
    gamma = globals()["gamma"]

  for episode in range(n_training_episodes):
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    # Reset the environment
    state = env.reset()

    for _ in range(max_steps):
      # Choose the action At using the epsilon-greedy policy
      action = epsilon_greedy_policy(Qtable, state, epsilon)

      # Take the action (a) and observe the outcome state (s') and reward (r)
      new_state, reward, done, info = env.step(action)

      # Update Q(s,a) := Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
      Qtable[state][action] = Qtable[state][action] + learning_rate * (
          reward + gamma * np.max(Qtable[new_state]) - Qtable[state][action])

      # If done, finish the episode
      if done:
        break

      # Our state is the new state
      state = new_state
  return Qtable


In [None]:
# Train the FrozenLake Q-table with the hyperparameters defined above.
# `train` updates the table it is given and returns it.
Qtable_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon,
                          decay_rate, env, max_steps, Qtable_frozenlake)

In [None]:
# Display the trained Q-table (one row per state, one column per action).
Qtable_frozenlake

array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.65958537, 0.        , 0.81450625, 0.61526152],
       [0.61244442, 0.857375  , 0.3524636 , 0.64353844],
       [0.63295379, 0.        , 0.04591696, 0.00735428],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.72637352],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.90249587, 0.95      , 0.85737235],
       [0.9025    , 0.95      , 1.        , 0.9025    ],
       [0.        , 0.        , 0.        , 0.        ]])

In [20]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
  """
  Run the greedy policy derived from ``Q`` for ``n_eval_episodes`` episodes and
  report the mean and standard deviation of the per-episode total reward.

  :param env: The evaluation environment
  :param max_steps: Maximum number of steps taken in one episode
  :param n_eval_episodes: Number of episodes to evaluate the agent
  :param Q: The Q-table, indexed as ``Q[state][action]``
  :param seed: Sequence of per-episode reset seeds; when empty the environment
               is reset without a seed
  :return: Tuple ``(mean_reward, std_reward)``
  """
  returns_per_episode = []
  for episode_idx in range(n_eval_episodes):
    # A non-empty seed sequence fixes the reset of each episode
    state = env.reset(seed=seed[episode_idx]) if seed else env.reset()
    episode_return = 0

    for _ in range(max_steps):
      # Always exploit: highest-valued action for the current state
      best_action = np.argmax(Q[state])
      next_state, reward, done, info = env.step(best_action)
      episode_return += reward
      if done:
        break
      state = next_state

    returns_per_episode.append(episode_return)

  return np.mean(returns_per_episode), np.std(returns_per_episode)


In [21]:
# Evaluate the greedy policy of the trained FrozenLake Q-table.
# NOTE(review): the saved output of this cell is a NameError — presumably it was
# last executed in a kernel where an argument (e.g. Qtable_frozenlake) had not
# been defined; confirm with Restart Kernel -> Run All.
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Qtable_frozenlake, eval_seed)
print(f'Mean reward: {mean_reward} +/- {std_reward}')

NameError: ignored

In [19]:
%%capture
from huggingface_hub import HfApi, HfFolder, Repository
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json

In [22]:
def record_video(env, Qtable, out_directory, fps=1):
  """
  Play one episode with the greedy policy of ``Qtable`` and save the rendered
  frames with ``imageio.mimsave``.

  :param env: Environment to roll out; it is reset with a random seed in [0, 500]
  :param Qtable: Q-table indexed as ``Qtable[state][action]``
  :param out_directory: Output path handed to ``imageio.mimsave``
  :param fps: Frames per second of the saved video
  """
  frames = []
  finished = False
  state = env.reset(seed=random.randint(0,500))
  # First frame: the state right after the reset
  frames.append(env.render(mode='rgb_array'))
  while not finished:
    # Take the action (index) that has the maximum expected future reward given that state
    best_action = np.argmax(Qtable[state][:])
    # The next state directly becomes the current state for the recording loop
    state, reward, finished, info = env.step(best_action)
    frames.append(env.render(mode='rgb_array'))
  imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)

In [23]:
def push_to_hub(repo_id,
                model,
                env,
                video_fps=1,
                local_repo_path="hub",
                commit_message="Push Q-Learning agent to Hub",
                token=None
                ):
  """
  Evaluate a Q-Learning agent, record a replay video, generate a model card and
  push everything to the Hugging Face Hub.

  :param repo_id: Hub repository id in the form ``"<user>/<repo_name>"``
  :param model: Dictionary with the hyperparameters and the Q-table. The keys
                ``env_id``, ``max_steps``, ``n_eval_episodes``, ``eval_seed`` and
                ``qtable`` are read. ``map_name`` / ``slippery`` entries are added
                to it in place when the environment spec provides them.
  :param env: Environment used for the evaluation and for the replay video
  :param video_fps: Frames per second of the replay video
  :param local_repo_path: Local directory under which the repository is cloned
  :param commit_message: Commit message used for the push
  :param token: Hub token. It is forwarded to the repository creation and, when
                given, to the git clone as well; when ``None`` the clone uses
                ``use_auth_token=True`` as before.
  """
  _, repo_name = repo_id.split("/")

  # The model card must describe the model being pushed, so the environment id
  # is taken from `model` and not from a notebook-level `env_id` variable.
  env_id = model["env_id"]

  # Step 1: Clone or create the repo
  # Create the repo (or clone its content if it's nonempty)
  api = HfApi()

  repo_url = api.create_repo(
        repo_id=repo_id,
        token=token,
        private=False,
        exist_ok=True,)

  # Git pull
  repo_local_path = Path(local_repo_path) / repo_name
  # Honour an explicitly supplied token for the clone as well
  repo = Repository(repo_local_path, clone_from=repo_url,
                    use_auth_token=token if token else True)
  repo.git_pull()

  repo.lfs_track(["*.mp4"])

  # Step 2: Save the model
  if env.spec.kwargs.get("map_name"):
    model["map_name"] = env.spec.kwargs.get("map_name")
    if env.spec.kwargs.get("is_slippery", "") == False:
      model["slippery"] = False

  print(model)

  # Pickle the model
  with open(Path(repo_local_path)/'q-learning.pkl', 'wb') as f:
    pickle.dump(model, f)

  # Step 3: Evaluate the model and build JSON
  mean_reward, std_reward = evaluate_agent(env, model["max_steps"], model["n_eval_episodes"], model["qtable"], model["eval_seed"])

  # First get datetime
  eval_datetime = datetime.datetime.now()
  eval_form_datetime = eval_datetime.isoformat()

  evaluate_data = {
        "env_id": model["env_id"],
        "mean_reward": mean_reward,
        "n_eval_episodes": model["n_eval_episodes"],
        "eval_datetime": eval_form_datetime,
  }
  # Write a JSON file
  with open(Path(repo_local_path) / "results.json", "w") as outfile:
      json.dump(evaluate_data, outfile)

  # Step 4: Create the model card
  # Env id, extended with the map name / slippery flag when the spec has them
  env_name = model["env_id"]
  if env.spec.kwargs.get("map_name"):
    env_name += "-" + env.spec.kwargs.get("map_name")

  if env.spec.kwargs.get("is_slippery", "") == False:
    env_name += "-" + "no_slippery"

  metadata = {}
  metadata["tags"] = [
        env_name,
        "q-learning",
        "reinforcement-learning",
        "custom-implementation"
    ]

  # Add metrics (named so that the builtin `eval` is not shadowed)
  eval_metadata = metadata_eval_result(
      model_pretty_name=repo_name,
      task_pretty_name="reinforcement-learning",
      task_id="reinforcement-learning",
      metrics_pretty_name="mean_reward",
      metrics_id="mean_reward",
      metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
      dataset_pretty_name=env_name,
      dataset_id=env_name,
    )

  # Merges both dictionaries
  metadata = {**metadata, **eval_metadata}

  model_card = f"""
  # **Q-Learning** Agent playing **{env_id}**
  This is a trained model of a **Q-Learning** agent playing **{env_id}** .
  """

  model_card += """
  ## Usage
  ```python
  """

  model_card += f"""model = load_from_hub(repo_id="{repo_id}", filename="q-learning.pkl")

  # Don't forget to check if you need to add additional attributes (is_slippery=False etc)
  env = gym.make(model["env_id"])

  evaluate_agent(env, model["max_steps"], model["n_eval_episodes"], model["qtable"], model["eval_seed"])
  """

  model_card +="""
  ```
  """

  # Keep an existing README as is; only a missing one is created from the model card
  readme_path = repo_local_path / "README.md"
  readme = ""
  if readme_path.exists():
      with readme_path.open("r", encoding="utf8") as f:
        readme = f.read()
  else:
    readme = model_card

  with readme_path.open("w", encoding="utf-8") as f:
    f.write(readme)

  # Save our metrics to Readme metadata
  metadata_save(readme_path, metadata)

  # Step 5: Record a video
  video_path =  repo_local_path / "replay.mp4"
  record_video(env, model["qtable"], video_path, video_fps)

  # Step 6: Push everything to hub
  print(f"Pushing repo {repo_name} to the Hugging Face Hub")
  repo.push_to_hub(commit_message=commit_message)

  print(f"Your model is pushed to the hub. You can view your model here: {repo_url}")

In [24]:
# Log in to the Hugging Face Hub from the notebook before pushing.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [None]:
# Bundle the hyperparameters and the trained Q-table. The values are captured
# when this cell runs, so re-run it after changing any of them.
model = dict(
    env_id=env_id,
    max_steps=max_steps,
    n_training_episodes=n_training_episodes,
    n_eval_episodes=n_eval_episodes,
    eval_seed=eval_seed,
    learning_rate=learning_rate,
    gamma=gamma,
    epsilon=epsilon,
    max_epsilon=max_epsilon,
    min_epsilon=min_epsilon,
    decay_rate=decay_rate,
    qtable=Qtable_frozenlake,
)

In [None]:
# Display the model dictionary built above.
model

{'decay_rate': 0.0005,
 'env_id': 'FrozenLake-v1',
 'epsilon': 1.0,
 'eval_seed': [],
 'gamma': 0.95,
 'learning_rate': 0.03,
 'max_epsilon': 1.0,
 'max_steps': 70,
 'min_epsilon': 0.05,
 'n_eval_episodes': 1000,
 'n_training_episodes': 100000,
 'qtable': array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
        [0.65958537, 0.        , 0.81450625, 0.61526152],
        [0.61244442, 0.857375  , 0.3524636 , 0.64353844],
        [0.63295379, 0.        , 0.04591696, 0.00735428],
        [0.77378094, 0.81450625, 0.        , 0.73509189],
        [0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.9025    , 0.        , 0.72637352],
        [0.        , 0.        , 0.        , 0.        ],
        [0.81450625, 0.        , 0.857375  , 0.77378094],
        [0.81450625, 0.9025    , 0.9025    , 0.        ],
        [0.857375  , 0.95      , 0.        , 0.857375  ],
        [0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.  

In [None]:
# Push the FrozenLake agent to the Hub under "<username>/<repo_name>".
username = "spacestar1705" # FILL THIS
repo_name = "q-FrozenLake-v1-4x4-noSlippery"
push_to_hub(
    repo_id=f"{username}/{repo_name}",
    model=model,
    env=env)

{'env_id': 'FrozenLake-v1', 'max_steps': 70, 'n_training_episodes': 100000, 'n_eval_episodes': 1000, 'eval_seed': [], 'learning_rate': 0.03, 'gamma': 0.95, 'epsilon': 1.0, 'max_epsilon': 1.0, 'min_epsilon': 0.05, 'decay_rate': 0.0005, 'qtable': array([[0.73509189, 0.77378094, 0.77378094, 0.73509189],
       [0.65958537, 0.        , 0.81450625, 0.61526152],
       [0.61244442, 0.857375  , 0.3524636 , 0.64353844],
       [0.63295379, 0.        , 0.04591696, 0.00735428],
       [0.77378094, 0.81450625, 0.        , 0.73509189],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.9025    , 0.        , 0.72637352],
       [0.        , 0.        , 0.        , 0.        ],
       [0.81450625, 0.        , 0.857375  , 0.77378094],
       [0.81450625, 0.9025    , 0.9025    , 0.        ],
       [0.857375  , 0.95      , 0.        , 0.857375  ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.    

  self._proc.stdin.write(im.tostring())


Pushing repo q-FrozenLake-v1-4x4-noSlippery to the Hugging Face Hub


Upload file replay.mp4:  12%|#1        | 3.34k/28.0k [00:00<?, ?B/s]

Upload file q-learning.pkl: 100%|##########| 936/936 [00:00<?, ?B/s]

Your model is pushed to the hub. You can view your model here: https://huggingface.co/spacestar1705/q-FrozenLake-v1-4x4-noSlippery


In [5]:
# Rebind `env` to the Taxi-v3 environment; later cells now act on Taxi.
env = gym.make('Taxi-v3')

In [6]:
# Rebind state_space / action_space to the sizes of the Taxi-v3 spaces.
state_space = env.observation_space.n
action_space = env.action_space.n
print(f'There are {state_space} possible states')
print(f'There are {action_space} possible actions')

There are 500 possible states
There are 6 possible actions


In [12]:
# Create Q table with state size rows and action size columns
Qtable_taxi = initialize_q_table(state_space, action_space)
print(Qtable_taxi)
print(f'Q-table shape: {Qtable_taxi.shape}')

[[0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0.]]
Q-table shape: (500, 6)


In [50]:
# ---------- Hyperparameters (Taxi-v3) ----------

# Training
n_training_episodes = 1_000_000  # total training episodes
learning_rate = 0.07             # learning rate

# Evaluation
n_eval_episodes = 100

# Evaluation seeds: one seed per evaluation episode, passed to env.reset(seed=...)
# by evaluate_agent so that the agent is evaluated with the same resets each time.
eval_seed = [
    16, 54, 165, 177, 191, 191, 120, 80, 149, 178,
    48, 38, 6, 125, 174, 73, 50, 172, 100, 148,
    146, 6, 25, 40, 68, 148, 49, 167, 9, 97,
    164, 176, 61, 7, 54, 55, 161, 131, 184, 51,
    170, 12, 120, 113, 95, 126, 51, 98, 36, 135,
    54, 82, 45, 95, 89, 59, 95, 124, 9, 113,
    58, 85, 51, 134, 121, 169, 105, 21, 30, 11,
    50, 65, 12, 43, 82, 145, 152, 97, 106, 55,
    31, 85, 38, 112, 102, 168, 123, 97, 21, 83,
    158, 26, 80, 63, 5, 81, 32, 11, 28, 148,
]

# Environment
env_id = 'Taxi-v3'               # name of the environment
max_steps = 99                   # max steps per episode
gamma = 0.99                     # discount rate

# Exploration
epsilon = 1.0
max_epsilon = 1.0
min_epsilon = 0.05
decay_rate = 0.005



In [43]:
# Train the Taxi Q-table. `train` also reads the notebook-level `learning_rate`
# and `gamma`, so the hyperparameter cell must have been run first.
Qtable_taxi = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, 
                    env, max_steps, Qtable_taxi)

In [51]:
# Evaluate the greedy policy of the Taxi Q-table, resetting each episode with
# the matching entry of eval_seed.
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Qtable_taxi, eval_seed)
print(f'Mean reward: {mean_reward} +/- {std_reward}')

Mean reward: 7.56 +/- 2.706732347314747


In [52]:
# Display the trained Taxi Q-table (one row per state, one column per action).
Qtable_taxi

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 7.44059051,  8.525849  ,  7.44059051,  8.525849  ,  9.6220697 ,
        -0.474151  ],
       [11.84784175, 12.97761793, 11.84784175, 12.97761793, 14.11880599,
         3.97761793],
       ...,
       [11.08809195, 15.2715212 ,  9.86359857,  8.11668608,  1.21611816,
         1.41189811],
       [ 5.01936593, 10.72936333,  5.11763022,  6.69733458, -2.91758995,
        -3.92808987],
       [17.612     , 16.43588   , 17.612     , 18.8       ,  8.612     ,
         8.612     ]])

In [17]:
# Bundle the Taxi hyperparameters and Q-table. The values are captured when this
# cell runs, so re-run it after changing the hyperparameters or retraining.
model = dict(
    env_id=env_id,
    max_steps=max_steps,
    n_training_episodes=n_training_episodes,
    n_eval_episodes=n_eval_episodes,
    eval_seed=eval_seed,
    learning_rate=learning_rate,
    gamma=gamma,
    epsilon=epsilon,
    max_epsilon=max_epsilon,
    min_epsilon=min_epsilon,
    decay_rate=decay_rate,
    qtable=Qtable_taxi,
)

In [26]:
# Push the Taxi agent to the Hub under "<username>/<repo_name>".
username = 'spacestar1705'
repo_name = 'q-Taxi-v3'
push_to_hub(repo_id = f'{username}/{repo_name}',
            model = model,
            env = env)

{'env_id': 'Taxi-v3', 'max_steps': 99, 'n_training_episodes': 100000, 'n_eval_episodes': 100, 'eval_seed': [16, 54, 165, 177, 191, 191, 120, 80, 149, 178, 48, 38, 6, 125, 174, 73, 50, 172, 100, 148, 146, 6, 25, 40, 68, 148, 49, 167, 9, 97, 164, 176, 61, 7, 54, 55, 161, 131, 184, 51, 170, 12, 120, 113, 95, 126, 51, 98, 36, 135, 54, 82, 45, 95, 89, 59, 95, 124, 9, 113, 58, 85, 51, 134, 121, 169, 105, 21, 30, 11, 50, 65, 12, 43, 82, 145, 152, 97, 106, 55, 31, 85, 38, 112, 102, 168, 123, 97, 21, 83, 158, 26, 80, 63, 5, 81, 32, 11, 28, 148], 'learning_rate': 0.05, 'gamma': 0.95, 'epsilon': 1.0, 'max_epsilon': 1.0, 'min_epsilon': 0.05, 'decay_rate': 0.005, 'qtable': array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 2.66427184,  3.77681601,  2.58437289,  3.74561817,  5.20997639,
        -5.03871182],
       [ 7.71223027,  8.99855974,  7.55555803,  9.19908648, 10.9512375 ,
         0.21305317],
       ...,
       [-0.87331685, 12.14374061, 

  self._proc.stdin.write(im.tostring())


Pushing repo q-Taxi-v3 to the Hugging Face Hub




Upload file replay.mp4:   3%|3         | 3.34k/106k [00:00<?, ?B/s]

Upload file q-learning.pkl:  14%|#3        | 3.34k/24.0k [00:00<?, ?B/s]

   b8964b1..24bf94d  main -> main



Your model is pushed to the hub. You can view your model here: https://huggingface.co/spacestar1705/q-Taxi-v3
