# First trial at Q-Learning on the Frozen Lake Environment


In [1]:
%%capture
# Install rendering-related dependencies: pyglet (pinned), the OpenGL bindings,
# ffmpeg, the Xvfb virtual framebuffer and its Python wrapper pyvirtualdisplay.
!pip install pyglet==1.5.1 
!apt install python-opengl
!apt install ffmpeg
!apt install xvfb
!pip3 install pyvirtualdisplay

from pyvirtualdisplay import Display

# Start an invisible 1400x900 virtual display.
# NOTE(review): presumably needed so env.render() works on a machine without a screen — confirm.
virtual_display = Display(visible=0, size=(1400, 900))
virtual_display.start()

In [2]:
%%capture
# Environment and numerical dependencies (gym is pinned to 0.24).
!pip install gym==0.24 
!pip install pygame
!pip install numpy

# Dependencies used for saving the model, building the model card,
# recording the replay video and pushing to the Hugging Face Hub.
!pip install huggingface_hub
!pip install pickle5
!pip install pyyaml==6.0 
!pip install imageio imageio_ffmpeg

In [3]:
import numpy as np
import gym
import random
import imageio
import os

import pickle5 as pickle

In [4]:
# Create the 4x4 FrozenLake environment with the slippery option enabled.
env = gym.make("FrozenLake-v1", map_name="4x4", is_slippery=True)

In [5]:
# Reset the environment, then print its observation space and one random sample from it.
env.reset()
print("_____OBSERVATION SPACE_____ \n")
print("Observation Space", env.observation_space)
print("Sample observation", env.observation_space.sample()) 

_____OBSERVATION SPACE_____ 

Observation Space Discrete(16)
Sample observation 11


In [6]:
# Print the number of discrete actions and one random sample from the action space.
# Note: `env.action_space.n` is the action count, even though the label says "Shape".
print("\n _____ACTION SPACE_____ \n")
print("Action Space Shape", env.action_space.n)
print("Action Space Sample", env.action_space.sample()) 


 _____ACTION SPACE_____ 

Action Space Shape 4
Action Space Sample 1


In [7]:
# Number of discrete states; used as the row count of the Q-table.
state_space = env.observation_space.n
print("There are ", state_space, " possible states")

# Number of discrete actions; used as the column count of the Q-table.
action_space = env.action_space.n
print("There are ", action_space, " possible actions")

There are  16  possible states
There are  4  possible actions


In [8]:
def initialize_q_table(state_space, action_space):
  """Build an all-zero Q-table.

  Args:
    state_space: number of states (rows of the table).
    action_space: number of actions (columns of the table).

  Returns:
    A NumPy array of zeros with shape (state_space, action_space).
  """
  table_shape = (state_space, action_space)
  return np.zeros(table_shape)

In [9]:
# Create the zero-initialised Q-table sized from the environment's state and action counts.
Qtable_frozenlake = initialize_q_table(state_space, action_space)

In [10]:
def epsilon_greedy_policy(Qtable, state, epsilon):
  """Pick an action for `state` with an epsilon-greedy rule.

  With probability `epsilon` a uniformly random action is returned
  (exploration); otherwise the action with the highest Q-value for `state`
  is returned (exploitation).

  Args:
    Qtable: table indexed as Qtable[state], each row holding one value per action.
    state: index of the current state.
    epsilon: exploration probability, expected to lie in [0, 1].

  Returns:
    Index of the chosen action.
  """
  action_values = Qtable[state]

  # random.uniform(0, 1) exceeds epsilon with probability 1 - epsilon.
  if random.uniform(0, 1) > epsilon:
    return np.argmax(action_values)

  # Explore: draw from the row's own width instead of a module-level `env`,
  # so the function only depends on the table it was handed.
  return random.randrange(len(action_values))

In [11]:
def greedy_policy(Qtable, state):
  """Return the index of the highest-valued action for `state`.

  Args:
    Qtable: table indexed as Qtable[state], each row holding one value per action.
    state: index of the state to act in.

  Returns:
    The argmax over the row for `state` (first index on ties, per np.argmax).
  """
  return np.argmax(Qtable[state])

In [12]:
# Training parameters
n_training_episodes = 10000  # number of episodes run by train()
learning_rate = 0.7          # step size applied to the TD error in the Q-table update

# Evaluation parameters
n_eval_episodes = 100        # number of episodes run by evaluate_agent()

# Environment parameters
env_id = "FrozenLake-v1"     # environment name stored in the model dict and shown in the model card
max_steps = 99               # upper bound on steps per episode in training and evaluation
gamma = 0.95                 # discount factor applied to the best next-state Q-value
eval_seed = []               # per-episode evaluation seeds; empty means env.reset() is called unseeded

# Exploration parameters
epsilon = 1.0                 # only stored in the model dict; train() recomputes epsilon every episode
max_epsilon = 1.0             # exploration probability at episode 0
min_epsilon = 0.05             # value the exploration probability decays toward
decay_rate = 0.0005            # exponential decay rate of epsilon per episode

In [13]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
  """Run tabular Q-learning and return the (in-place updated) Q-table.

  Epsilon decays exponentially with the episode index from `max_epsilon`
  toward `min_epsilon`. The update reads `learning_rate` and `gamma` from
  module scope; they are not parameters of this function.

  Args:
    n_training_episodes: number of episodes to run.
    min_epsilon: lower limit of the exploration probability.
    max_epsilon: exploration probability at episode 0.
    decay_rate: exponential decay rate applied per episode.
    env: environment whose reset() returns a state and whose step(action)
      returns (new_state, reward, done, info).
    max_steps: maximum number of steps per episode.
    Qtable: Q-table indexed as Qtable[state][action]; modified in place.

  Returns:
    The same `Qtable` object after training.
  """
  epsilon_span = max_epsilon - min_epsilon

  for episode in range(n_training_episodes):
    # Exploration probability for this episode.
    epsilon = min_epsilon + epsilon_span * np.exp(-decay_rate * episode)
    state = env.reset()

    for _ in range(max_steps):
      action = epsilon_greedy_policy(Qtable, state, epsilon)
      next_state, reward, done, _info = env.step(action)

      # Move Q(s, a) toward reward + gamma * max_a' Q(s', a').
      current_q = Qtable[state][action]
      td_target = reward + gamma * np.max(Qtable[next_state])
      Qtable[state][action] = current_q + learning_rate * (td_target - current_q)

      if done:
        break
      state = next_state

  return Qtable

In [14]:
# train() updates the table in place and returns it, so re-running this cell
# continues training from the already-trained table instead of starting from zeros.
Qtable_frozenlake = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_frozenlake)

In [15]:
# Display the trained Q-table (one row per state, one column per action).
Qtable_frozenlake

array([[0.2054428 , 0.18066286, 0.09083494, 0.09544708],
       [0.01026733, 0.03405225, 0.03828749, 0.17480514],
       [0.02725234, 0.03401946, 0.03285954, 0.03826465],
       [0.00476392, 0.01151476, 0.02072448, 0.03553287],
       [0.20685733, 0.20099057, 0.06459205, 0.19402923],
       [0.        , 0.        , 0.        , 0.        ],
       [0.00283237, 0.00289482, 0.00426329, 0.00255693],
       [0.        , 0.        , 0.        , 0.        ],
       [0.06973753, 0.17168295, 0.11053723, 0.24111151],
       [0.02176616, 0.60530222, 0.07435009, 0.04316953],
       [0.66724786, 0.02494016, 0.01237747, 0.01727673],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.00310105, 0.18592518, 0.90526201, 0.01750653],
       [0.48357099, 0.99345543, 0.53629958, 0.78851211],
       [0.        , 0.        , 0.        , 0.        ]])

In [16]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
  """Run the greedy policy derived from `Q` and summarise the episode returns.

  Args:
    env: environment whose reset() (optionally reset(seed=...)) returns a state
      and whose step(action) returns (new_state, reward, done, info).
    max_steps: maximum number of steps per episode.
    n_eval_episodes: number of evaluation episodes.
    Q: Q-table indexed as Q[state], each row holding one value per action.
    seed: sequence of per-episode seeds; when empty (falsy) the environment
      is reset without a seed.

  Returns:
    Tuple (mean_reward, std_reward) over the per-episode total rewards.
  """
  returns_per_episode = []

  for episode_idx in range(n_eval_episodes):
    state = env.reset(seed=seed[episode_idx]) if seed else env.reset()
    episode_return = 0

    for _ in range(max_steps):
      # Always exploit: take the highest-valued action for the current state.
      best_action = np.argmax(Q[state][:])
      state_after, reward, done, _info = env.step(best_action)
      episode_return += reward

      if done:
        break
      state = state_after

    returns_per_episode.append(episode_return)

  return np.mean(returns_per_episode), np.std(returns_per_episode)

In [17]:
# Evaluate the greedy policy of the trained table; eval_seed is empty here,
# so each evaluation episode is reset without a seed.
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Qtable_frozenlake, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

Mean_reward=0.74 +/- 0.44


In [48]:
%%capture
from huggingface_hub import HfApi, HfFolder, Repository
from huggingface_hub.repocard import metadata_eval_result, metadata_save

from pathlib import Path
import datetime
import json

In [49]:
def record_video(env, Qtable, out_directory, fps=1):
  """Roll out the greedy policy for one episode and save the rendered frames.

  Args:
    env: environment supporting reset(seed=...), step(action) and
      render(mode='rgb_array').
    Qtable: Q-table indexed as Qtable[state], each row holding one value per action.
    out_directory: output path handed to imageio.mimsave (the caller in this
      file passes the path of an .mp4 file).
    fps: frames per second of the written video.
  """
  # Start from a randomly seeded reset and capture the initial frame.
  state = env.reset(seed=random.randint(0,500))
  frames = [env.render(mode='rgb_array')]

  finished = False
  while not finished:
    chosen_action = np.argmax(Qtable[state][:])
    state, _reward, finished, _info = env.step(chosen_action)
    frames.append(env.render(mode='rgb_array'))

  imageio.mimsave(out_directory, [np.array(frame) for frame in frames], fps=fps)

In [50]:
def push_to_hub(repo_id, 
                model,
                env,
                video_fps=1,
                local_repo_path="hub",
                commit_message="Push Q-Learning agent to Hub",
                token= None
                ):
  """Evaluate a Q-learning model, package it and push it to the Hugging Face Hub.

  The function creates (or reuses) the Hub repository, clones it locally, then
  writes into it: the pickled model dict, a results.json with the evaluation
  summary, a README.md model card with metadata, and a replay video. Finally
  it pushes the local repository.

  Args:
    repo_id: repository identifier of the form "<namespace>/<repo name>".
    model: dict with at least the keys "env_id", "max_steps",
      "n_eval_episodes", "eval_seed" and "qtable". It is modified in place:
      "map_name" (and "slippery" when disabled) are copied from the
      environment spec before pickling.
    env: environment used for evaluation and for the replay video; its
      `spec.kwargs` is read for "map_name" and "is_slippery".
    video_fps: frames per second of the replay video.
    local_repo_path: directory under which the repository is cloned.
    commit_message: commit message used for the push.
    token: Hub token. When not given, the locally stored credentials are used.
  """
  _, repo_name = repo_id.split("/")

  # Use the environment id of the model being pushed rather than whatever
  # `env_id` happens to be defined at notebook level.
  env_id = model["env_id"]

  eval_env = env
  api = HfApi()

  # Create the repository on the Hub (no-op if it already exists).
  repo_url = api.create_repo(
        repo_id=repo_id,
        token=token,
        private=False,
        exist_ok=True,)

  # Clone it locally, authenticating with the caller's token when one is
  # provided and falling back to the stored credentials otherwise.
  repo_local_path = Path(local_repo_path) / repo_name
  repo = Repository(repo_local_path, clone_from=repo_url, use_auth_token=token if token else True)
  repo.git_pull()

  repo.lfs_track(["*.mp4"])

  # Record the map configuration inside the saved model.
  env_kwargs = env.spec.kwargs
  if env_kwargs.get("map_name"):
    model["map_name"] = env_kwargs.get("map_name")
    if env_kwargs.get("is_slippery", "") == False:
      model["slippery"] = False

  print(model)

  # Save the model dict (hyperparameters and Q-table).
  with open(Path(repo_local_path)/'q-learning.pkl', 'wb') as f:
    pickle.dump(model, f)

  # Evaluate the model and write the summary to results.json.
  mean_reward, std_reward = evaluate_agent(eval_env, model["max_steps"], model["n_eval_episodes"], model["qtable"], model["eval_seed"])

  eval_datetime = datetime.datetime.now()
  eval_form_datetime = eval_datetime.isoformat()

  evaluate_data = {
        "env_id": env_id, 
        "mean_reward": mean_reward,
        "n_eval_episodes": model["n_eval_episodes"],
        "eval_datetime": eval_form_datetime,
  }

  with open(Path(repo_local_path) / "results.json", "w") as outfile:
      json.dump(evaluate_data, outfile)

  # Build the environment tag, e.g. "<env_id>-<map_name>[-no_slippery]".
  env_name = env_id
  if env_kwargs.get("map_name"):
    env_name += "-" + env_kwargs.get("map_name")

  if env_kwargs.get("is_slippery", "") == False:
    env_name += "-" + "no_slippery"

  metadata = {}
  metadata["tags"] = [
        env_name,
        "q-learning",
        "reinforcement-learning",
        "custom-implementation"
    ]

  # Evaluation metadata for the model card (named to avoid shadowing builtin `eval`).
  eval_metadata = metadata_eval_result(
      model_pretty_name=repo_name,
      task_pretty_name="reinforcement-learning",
      task_id="reinforcement-learning",
      metrics_pretty_name="mean_reward",
      metrics_id="mean_reward",
      metrics_value=f"{mean_reward:.2f} +/- {std_reward:.2f}",
      dataset_pretty_name=env_name,
      dataset_id=env_name,
    )

  metadata = {**metadata, **eval_metadata}

  model_card = f"""
  # **Q-Learning** Agent playing **{env_id}**
  This is a trained model of a **Q-Learning** agent playing **{env_id}** .
  """

  model_card += """
  ## Usage
  ```python
  """

  model_card += f"""model = load_from_hub(repo_id="{repo_id}", filename="q-learning.pkl")

 
  env = gym.make(model["env_id"])

  evaluate_agent(env, model["max_steps"], model["n_eval_episodes"], model["qtable"], model["eval_seed"])
  """

  model_card +="""
  ```
  """

  # Keep an existing README as-is; only fall back to the generated card when
  # the cloned repository has none.
  readme_path = repo_local_path / "README.md"
  readme = ""
  if readme_path.exists():
      with readme_path.open("r", encoding="utf8") as f:
        readme = f.read()
  else:
    readme = model_card

  with readme_path.open("w", encoding="utf-8") as f:
    f.write(readme)

  # Write the metadata into the README.
  metadata_save(readme_path, metadata)

  # Record a replay of the greedy policy next to the model.
  video_path =  repo_local_path / "replay.mp4"
  record_video(env, model["qtable"], video_path, video_fps)

  print(f"Pushing repo {repo_name} to the Hugging Face Hub")
  repo.push_to_hub(commit_message=commit_message)
  print(f"The is pushed to the hub. It can viewed here: {repo_url}")

In [53]:
# Log in to the Hugging Face Hub from the notebook; the recorded output shows
# the token being saved locally.
from huggingface_hub import notebook_login
notebook_login()

Login successful
Your token has been saved to /root/.huggingface/token
[1m[31mAuthenticated through git-credential store but this isn't the helper defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub. Run the following command in your terminal in case you want to set this credential helper as the default

git config --global credential.helper store[0m


In [54]:
# Bundle the hyperparameters and the trained Q-table into the dict that
# push_to_hub() pickles and evaluates.
model = {
    # Environment and evaluation settings
    "env_id": env_id,
    "max_steps": max_steps,
    "n_training_episodes": n_training_episodes,
    "n_eval_episodes": n_eval_episodes,
    "eval_seed": eval_seed,

    # Q-learning update settings
    "learning_rate": learning_rate,
    "gamma": gamma,

    # Exploration schedule settings
    "epsilon": epsilon,
    "max_epsilon": max_epsilon,
    "min_epsilon": min_epsilon,
    "decay_rate": decay_rate,

    # Learned action values
    "qtable": Qtable_frozenlake
}

In [55]:
# Display the model dict that will be uploaded.
model

{'decay_rate': 0.0005,
 'env_id': 'FrozenLake-v1',
 'epsilon': 1.0,
 'eval_seed': [],
 'gamma': 0.95,
 'learning_rate': 0.7,
 'max_epsilon': 1.0,
 'max_steps': 99,
 'min_epsilon': 0.05,
 'n_eval_episodes': 100,
 'n_training_episodes': 10000,
 'qtable': array([[0.2054428 , 0.18066286, 0.09083494, 0.09544708],
        [0.01026733, 0.03405225, 0.03828749, 0.17480514],
        [0.02725234, 0.03401946, 0.03285954, 0.03826465],
        [0.00476392, 0.01151476, 0.02072448, 0.03553287],
        [0.20685733, 0.20099057, 0.06459205, 0.19402923],
        [0.        , 0.        , 0.        , 0.        ],
        [0.00283237, 0.00289482, 0.00426329, 0.00255693],
        [0.        , 0.        , 0.        , 0.        ],
        [0.06973753, 0.17168295, 0.11053723, 0.24111151],
        [0.02176616, 0.60530222, 0.07435009, 0.04316953],
        [0.66724786, 0.02494016, 0.01237747, 0.01727673],
        [0.        , 0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        , 0.     

In [56]:
# Push the trained agent to the Hub under <username>/<repo_name>.
username = "iiShreya" 
# NOTE(review): the repo name says "nonSlippery", but `env` is created with
# is_slippery=True earlier in this notebook — confirm which one is intended.
repo_name = "frozenLake_4x4_nonSlippery"
push_to_hub(
    repo_id=f"{username}/{repo_name}",
    model=model,
    env=env)

{'env_id': 'FrozenLake-v1', 'max_steps': 99, 'n_training_episodes': 10000, 'n_eval_episodes': 100, 'eval_seed': [], 'learning_rate': 0.7, 'gamma': 0.95, 'epsilon': 1.0, 'max_epsilon': 1.0, 'min_epsilon': 0.05, 'decay_rate': 0.0005, 'qtable': array([[0.2054428 , 0.18066286, 0.09083494, 0.09544708],
       [0.01026733, 0.03405225, 0.03828749, 0.17480514],
       [0.02725234, 0.03401946, 0.03285954, 0.03826465],
       [0.00476392, 0.01151476, 0.02072448, 0.03553287],
       [0.20685733, 0.20099057, 0.06459205, 0.19402923],
       [0.        , 0.        , 0.        , 0.        ],
       [0.00283237, 0.00289482, 0.00426329, 0.00255693],
       [0.        , 0.        , 0.        , 0.        ],
       [0.06973753, 0.17168295, 0.11053723, 0.24111151],
       [0.02176616, 0.60530222, 0.07435009, 0.04316953],
       [0.66724786, 0.02494016, 0.01237747, 0.01727673],
       [0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        ],
       [0.0031010

  self._proc.stdin.write(im.tostring())


Upload file replay.mp4:  11%|#         | 3.34k/31.7k [00:00<?, ?B/s]

The is pushed to the hub. It can viewed here: https://huggingface.co/iiShreya/frozenLake_4x4_nonSlippery
