In [None]:
# Python dependencies for this notebook, installed through the shell.
# Core packages: the RL environment, Hub client, pygame and numpy.
!pip install gymnasium
!pip install huggingface_hub
!pip install pygame
!pip install numpy

# Serialisation, video writing and progress-bar helpers.
# pyyaml and pyglet are pinned to exact versions; the others are unpinned.
!pip install dill
!pip install pyyaml==6.0
!pip install imageio
!pip install imageio_ffmpeg
!pip install pyglet==1.5.1
!pip install tqdm

In [None]:
!sudo apt-get update
!sudo apt-get install -y python3-opengl
!apt install ffmpeg xvfb
!pip3 install pyvirtualdisplay

In [None]:
# Send signal 9 to this notebook's own process, killing the kernel.
# NOTE(review): presumably done so the runtime restarts and picks up the
# packages installed above. Execution stops here; continue from the next
# cell once the kernel is back.
import os
os.kill(os.getpid(), 9)

In [None]:
# Virtual display
# Start a non-visible 400x300 display via pyvirtualdisplay.
# NOTE(review): presumably required so rendering works on a machine with no
# physical screen (e.g. a hosted notebook) — confirm for your runtime.
from pyvirtualdisplay import Display

virtual_display = Display(visible=0, size=(400, 300))
virtual_display.start()

In [None]:
from __future__ import annotations

from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import random
import gymnasium as gym
import imageio
import os
import tqdm
import dill as pickle

from tqdm.notebook import tqdm

In [None]:
# Shared Blackjack environment used for training, evaluation and recording.
# render_mode="rgb_array" makes env.render() return frames for the replay video.
# NOTE(review): sab=True should select the Sutton & Barto rule set — confirm
# against the Gymnasium Blackjack documentation.
env = gym.make("Blackjack-v1", sab=True, render_mode="rgb_array")

In [None]:
def initialize_q_values(env):
  """
  Build an empty Q-table whose entries are created lazily.

  :param env: environment whose ``action_space.n`` gives the number of actions
  :return: a ``defaultdict`` mapping any unseen state to a fresh zero vector
           with one entry per action
  """
  # Read the action count once and capture only that integer. Closing over
  # ``env`` itself would keep the whole environment alive inside the table and
  # drag it into the pickle when the Q-table is serialised.
  n_actions = int(env.action_space.n)
  return defaultdict(lambda: np.zeros(n_actions))

In [None]:
# Fresh Q-table for the Blackjack environment; entries appear on first access.
q_values = initialize_q_values(env)

In [None]:
def greedy_policy(q_values, state):
  """
  Pick the action with the highest Q-value for ``state``.

  :param q_values: mapping from state to an array of per-action values
  :param state: key into ``q_values``
  :return: index of the maximal entry as a plain ``int`` (first one on ties)
  """
  action_values = q_values[state]
  best_action = np.argmax(action_values)
  return int(best_action)

In [None]:
def epilson_greedy_policy(q_values, state, epsilon):
  """
  Epsilon-greedy action selection.

  Draws one uniform random number; if it is below ``epsilon`` a random action
  is sampled from the module-level ``env``'s action space, otherwise the greedy
  action for ``state`` is returned.

  :param q_values: mapping from state to an array of per-action values
  :param state: current state
  :param epsilon: probability of exploring
  :return: the chosen action
  """
  explore = np.random.random() < epsilon
  if not explore:
    # exploit: act greedily with probability (1 - epsilon)
    return greedy_policy(q_values, state)
  # explore: uniformly sampled action
  return env.action_space.sample()

In [None]:
# Define hyperparameters
# These are notebook-level globals; train() reads learning_rate and
# discount_factor directly rather than receiving them as arguments.

# Training parameters
# Number of episodes train() plays.
n_training_episodes = 500_000
# Step size applied to the temporal-difference error in the Q update.
learning_rate = 0.001
# Weight on the best next-state value in the Q update.
discount_factor = 0.95

# Exploration parameters
# Upper value for epsilon, passed to train().
max_epsilon = 1.0
# Floor that epsilon is clamped to in train().
min_epsilon = 0.05
# Amount subtracted from epsilon at the end of an episode in train().
decay_rate = 0.0001

In [None]:
def train(n_training_episodes, min_epsilon, max_epsion, decay_rate, q_values):
  """
  Train a tabular Q-learning agent with a linearly decaying epsilon.

  Uses the module-level ``env``, ``learning_rate`` and ``discount_factor``.

  :param n_training_episodes: number of episodes to play
  :param min_epsilon: lower bound epsilon is clamped to
  :param max_epsion: starting value of epsilon (parameter name kept as-is,
                     misspelling included, so existing callers keep working)
  :param decay_rate: amount subtracted from epsilon after every episode
  :param q_values: mapping from state to per-action value array; updated in place
  :return: the same ``q_values`` object after training
  """
  # Epsilon must live outside the episode loop: initialising it per episode
  # would throw away the decay applied at the end of the previous one.
  epsilon = max_epsion
  for episode in tqdm(range(n_training_episodes)):
    state, info = env.reset()
    done = False

    # play one episode
    while not done:
      action = epilson_greedy_policy(q_values, state, epsilon)
      next_state, reward, terminated, truncated, info = env.step(action)

      # No bootstrapping from a terminal state: its future value counts as 0.
      future_q_value = (not terminated) * np.max(q_values[next_state])
      temporal_difference = (
          reward + discount_factor * future_q_value - q_values[state][action]
      )

      q_values[state][action] = (
          q_values[state][action] + learning_rate * temporal_difference
      )

      # the episode ends on termination or truncation; otherwise move on
      done = terminated or truncated
      state = next_state

    # linear decay, clamped at the floor
    epsilon = max(min_epsilon, epsilon - decay_rate)
  return q_values

In [None]:
# Run training with the hyperparameters defined above; train() updates and
# returns the Q-table, which is rebound to the same notebook variable.
q_values = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, q_values)

In [None]:
def evaluate_agent(env, n_eval_episodes, q_values):
  """
  Evaluate the greedy policy for ``n_eval_episodes`` episodes and return the
  mean and standard deviation of the per-episode total reward.
  :param env: The evaluation environment
  :param n_eval_episodes: Number of episodes to evaluate the agent
  :param q_values: Mapping from state to per-action values used to pick actions
  :return: ``(mean_reward, std_reward)`` over the evaluated episodes
  """
  episode_rewards = []
  for _ in tqdm(range(n_eval_episodes)):
    state, info = env.reset()
    total_rewards_ep = 0
    done = False

    # play one episode
    while not done:
      # Take the action (index) that has the maximum expected future reward given that state
      action = greedy_policy(q_values, state)
      state, reward, terminated, truncated, info = env.step(action)
      total_rewards_ep += reward
      done = terminated or truncated

    episode_rewards.append(total_rewards_ep)

  mean_reward = np.mean(episode_rewards)
  std_reward = np.std(episode_rewards)

  return mean_reward, std_reward

In [None]:
# Evaluation parameters
n_eval_episodes = 100        # Total number of test episodes

# Evaluate the trained Q-table on the shared environment and report the
# mean and standard deviation of the episode reward.
mean_reward, std_reward = evaluate_agent(env, n_eval_episodes, q_values)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

In [None]:
def record_video(env, q_values, out_directory, fps=1):
    """
    Record the greedy agent for 20 steps and write the frames to a video file.

    The environment is reset with a random seed in [0, 500] at the start and
    again whenever an episode ends inside the 20-step window.

    :param env: environment to render frames from
    :param q_values: q_values of our agent
    :param out_directory: destination handed to ``imageio.mimsave``
    :param fps: frames per second of the written video
    """
    frames = []
    state, _ = env.reset(seed=random.randint(0, 500))
    frames.append(env.render())

    for _ in range(20):
        # Greedy action for the current state
        action = greedy_policy(q_values, state)
        state, _, terminated, truncated, _ = env.step(action)
        frames.append(env.render())
        if terminated or truncated:
            # Episode over: start a new one so recording continues
            state, _ = env.reset(seed=random.randint(0, 500))

    imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)

In [None]:
from huggingface_hub import HfApi, snapshot_download
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json

In [None]:
def push_to_hub(
    repo_id, model, env, video_fps=1, local_repo_path="hub"
):
    """
    Evaluate, Generate a video and Upload a model to Hugging Face Hub.
    This method does the complete pipeline:
    - It evaluates the model
    - It generates the model card
    - It generates a replay video of the agent
    - It pushes everything to the Hub

    :param repo_id: id of the model repository on the Hugging Face Hub,
        in the form ``"<user>/<repo_name>"``
    :param model: dict describing the agent; must contain ``"env_id"``,
        ``"n_eval_episodes"`` and ``"q_values"``. A ``"map_name"`` key is added
        to it when the environment spec carries one.
    :param env: environment used for evaluation and for recording the replay
    :param video_fps: how many frame per seconds to record our video replay
    :param local_repo_path: currently unused; files are written into the
        directory returned by ``snapshot_download``
    """
    _, repo_name = repo_id.split("/")

    api = HfApi()

    # Step 1: Create the repo
    repo_url = api.create_repo(
        repo_id=repo_id,
        exist_ok=True,
    )

    # Step 2: Download files
    repo_local_path = Path(snapshot_download(repo_id=repo_id))

    # Step 3: Save the model
    map_name = env.spec.kwargs.get("map_name")
    if map_name:
        model["map_name"] = map_name

    # Pickle the model (``pickle`` is dill in this notebook)
    with open(repo_local_path / "q-learning.pkl", "wb") as f:
        pickle.dump(model, f)

    # Step 4: Evaluate the model and build JSON with evaluation metrics.
    # Evaluate the Q-table that is actually being uploaded, not whatever
    # happens to be bound to a notebook-level variable. This single evaluation
    # feeds both results.json and the model-card metadata.
    mean_reward, std_reward = evaluate_agent(
        env, model["n_eval_episodes"], model["q_values"]
    )

    evaluate_data = {
        "env_id": model["env_id"],
        "mean_reward": mean_reward,
        "n_eval_episodes": model["n_eval_episodes"],
        "eval_datetime": datetime.datetime.now().isoformat()
    }

    # Write a JSON file called "results.json" that will contain the
    # evaluation results
    with open(repo_local_path / "results.json", "w") as outfile:
        json.dump(evaluate_data, outfile)

    # Step 5: Create the model card
    env_id = model["env_id"]
    env_name = env_id
    if map_name:
        env_name += "-" + map_name

    metadata = {}
    metadata["tags"] = [env_name, "q-learning", "reinforcement-learning", "custom-implementation"]

    # Add metrics
    eval_metadata = metadata_eval_result(
        model_pretty_name=repo_name,
        task_pretty_name="reinforcement-learning",
        task_id="reinforcement-learning",
        metrics_pretty_name="mean_reward",
        metrics_id="mean_reward",
        metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
        dataset_pretty_name=env_name,
        dataset_id=env_name,
    )

    # Merges both dictionaries
    metadata = {**metadata, **eval_metadata}

    model_card = f"""
  # **Q-Learning** Agent playing **{env_id}**
  This is a trained model of a **Q-Learning** agent playing **{env_id}** .

  ## Usage

  ```python

  model = load_from_hub(repo_id="{repo_id}", filename="q-learning.pkl")

  # Don't forget to check if you need to add additional attributes
  env = gym.make(model["env_id"])
  ```
  """

    # Keep an existing README from the repo; only fall back to the generated
    # card when the repo has none yet.
    readme_path = repo_local_path / "README.md"
    if readme_path.exists():
        with readme_path.open("r", encoding="utf-8") as f:
            readme = f.read()
    else:
        readme = model_card

    with readme_path.open("w", encoding="utf-8") as f:
        f.write(readme)

    # Save our metrics to Readme metadata
    metadata_save(readme_path, metadata)

    # Step 6: Record a video
    video_path = repo_local_path / "replay.mp4"
    record_video(env, model["q_values"], video_path, video_fps)

    # Step 7. Push everything to the Hub
    api.upload_folder(
        repo_id=repo_id,
        folder_path=repo_local_path,
        path_in_repo=".",
    )

    print("Your model is pushed to the Hub. You can view your model here: ", repo_url)

In [None]:
# Authenticate this notebook session with the Hugging Face Hub before pushing.
# NOTE(review): the account presumably needs write access to the target
# repository — confirm before running push_to_hub.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
# Everything that is pickled and uploaded with the agent.
# "env_id" uses the same spelling as the gym.make() call that created the
# environment ("Blackjack-v1"), so that the model card's
# `gym.make(model["env_id"])` usage snippet refers to the same environment.
model = {
    "env_id": "Blackjack-v1",
    "n_training_episodes": n_training_episodes,
    "n_eval_episodes": n_eval_episodes,
    "learning_rate": learning_rate,
    "discount_factor": discount_factor,
    "q_values": q_values
}

In [None]:
# Target repository on the Hub is "<username>/<repo_name>".
username = "nzdb70" # FILL THIS
repo_name = "BlackJack-v1"
# Evaluate, build the model card, record the replay and upload everything.
push_to_hub(
    repo_id=f"{username}/{repo_name}",
    model=model,
    env=env)
# The environment is no longer needed once the upload has been issued.
env.close()