# Installs

In [43]:
!sudo add-apt-repository -y ppa:openjdk-r/ppa
!sudo apt-get purge openjdk-*
!sudo apt-get install openjdk-8-jdk
!sudo apt-get install xvfb
!sudo apt-get install xserver-xephyr
!sudo apt-get install -y python3-opengl
!sudo apt-get install ffmpeg
!pip3 install gym==0.17.3
!pip3 install minerl==0.4.4
!pip3 install pyvirtualdisplay
!pip3 install -U colabgymrender
!sudo apt-get install xvfb
!sudo apt-get install tigervnc-standalone-server
!pip3 install opencv-python
!pip3 install imageio==2.6.1
!pip3 install stable-baselines3~=0.10.0

PPA publishes dbgsym, you may need to include 'main/debug' component
Repository: 'deb https://ppa.launchpadcontent.net/openjdk-r/ppa/ubuntu/ jammy main'
More info: https://launchpad.net/~openjdk-r/+archive/ubuntu/ppa
Adding repository.
Found existing deb entry in /etc/apt/sources.list.d/openjdk-r-ubuntu-ppa-jammy.list
Adding deb entry to /etc/apt/sources.list.d/openjdk-r-ubuntu-ppa-jammy.list
Found existing deb-src entry in /etc/apt/sources.list.d/openjdk-r-ubuntu-ppa-jammy.list
Adding disabled deb-src entry to /etc/apt/sources.list.d/openjdk-r-ubuntu-ppa-jammy.list
Adding key to /etc/apt/trusted.gpg.d/openjdk-r-ubuntu-ppa.gpg with fingerprint DA1A4A13543B466853BAF164EB9B1D8886F44E2A
Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:4 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:5 https://cloud.r-project.org/bin/linux/ubuntu jammy-cr

# Setup

In [44]:
import os
import gym
import minerl
from tqdm.notebook import tqdm
from colabgymrender.recorder import Recorder
from pyvirtualdisplay import Display
import stable_baselines3
import numpy as np
import matplotlib.pyplot as plt

In [45]:
# Launch an invisible 400x300 virtual display; it is needed for saving videos on Colab.
from pyvirtualdisplay import Display
display = Display(size=(400, 300), visible=False)
display.start();

In [46]:
class ExtractPOV(gym.ObservationWrapper):
  """Observation wrapper that exposes only the 'pov' entry of each observation.

  The wrapped environment's observation space and observations are indexed with
  the key 'pov' (the avatar's point of view), which is what stable_baselines
  needs as its observation.
  """

  def __init__(self, env):
    super().__init__(env)
    # Advertise the 'pov' sub-space as this wrapper's entire observation space.
    pov_space = self.env.observation_space['pov']
    self.observation_space = pov_space

  def observation(self, observation):
    """Return the 'pov' entry of an observation produced by the wrapped env."""
    pov_frame = observation['pov']
    return pov_frame

In [47]:
class ReversibleActionWrapper(gym.ActionWrapper):
    """Action wrapper whose action transformation can be inverted.

    Subclasses must implement `reverse_action`, the inverse of the transformation
    the wrapper applies to actions that come into it.
    """

    def wrap_action(self, inner_action):
        """Translate an inner action into this wrapper's action space.

        If the wrapped env also provides `wrap_action`, the inner action is first
        passed through it; the result is then converted with `reverse_action`.
        """
        if hasattr(self.env, 'wrap_action'):
            inner_action = self.env.wrap_action(inner_action)
        return self.reverse_action(inner_action)

    def reverse_action(self, action):
        """Inverse of the wrapper's action transformation; must be overridden.

        Raises:
            NotImplementedError: always, in this base class.
        """
        # The first literal ends with a space so the concatenated message reads
        # "... function that is ..." rather than "... functionthat is ...".
        raise NotImplementedError("In order to use a ReversibleActionWrapper, you need to implement a `reverse_action` function "
                                  "that is the inverse of the transformation performed on an action that comes into the wrapper")


class ActionShaping(ReversibleActionWrapper):
  """Expose a small discrete action space on top of the wrapped env's dict actions.

  Discrete action ids:
    0: attack
    1: forward
    2: forward + jump
    3: camera [-camera_angle, 0]
    4: camera [+camera_angle, 0]
    5: camera [0, +camera_angle]
    6: camera [0, -camera_angle]
    len(self.actions) (7 with the table above): no-op

  Args:
    env: environment to wrap; its action space must provide `noop()`.
    camera_angle: magnitude used for the camera actions.
    always_attack: when True, every action in the table also sets 'attack' to 1
      (the no-op id still returns a plain no-op).
    camera_margin: threshold a camera component must exceed (in absolute value)
      for `reverse_action` to classify a dict action as a camera action.
  """

  def __init__(
      self,
      env: gym.Env,
      camera_angle: int = 10,
      always_attack: bool = True,
      camera_margin: int = 5
  ):

    super().__init__(env)

    self.camera_angle = camera_angle
    self.camera_margin = camera_margin
    self.always_attack = always_attack
    # Each entry lists the (key, value) pairs to set on top of a no-op action.
    self._actions = [
        [('attack',1)],
        [('forward',1)],
        [('forward',1), ('jump',1)],
        [('camera', [-self.camera_angle,0])],
        [('camera', [self.camera_angle,0])],
        [('camera', [0, self.camera_angle])],
        [('camera', [0, -self.camera_angle])]
    ]

    # Pre-build the dict action for every discrete id.
    self.actions = []
    for actions in self._actions:
      act = self.env.action_space.noop()
      for a, v in actions:
        act[a] = v
      if self.always_attack:
        act['attack'] = 1
      self.actions.append(act)

    # One extra id past the end of the table is reserved for the no-op.
    self.action_space = gym.spaces.Discrete(len(self.actions) + 1)

  def action(self, action):
    """Map a discrete action id to the wrapped env's dict action."""
    # The no-op id is derived from the table size instead of being hard-coded.
    if action == len(self.actions):
      return self.env.action_space.noop()
    return self.actions[action]

  def reverse_action(self, action: dict) -> np.ndarray:
    """Map a batch of dict actions back to discrete action ids.

    Args:
      action: dict with 'camera', 'attack', 'forward' and 'jump' entries. The
        camera entry is reshaped to (batch, 2) and the others to (batch,), so a
        batch of one (or a single un-batched action) is handled as well.

    Returns:
      Integer array of shape (batch,). Precedence per row: the camera
      components beyond `camera_margin` first, then forward (+ jump), then
      attack; rows matching nothing get the no-op id.
    """
    # reshape (rather than squeeze) keeps the batch axis even when batch == 1.
    camera = np.asarray(action["camera"]).reshape(-1, 2)
    attack = np.asarray(action["attack"]).reshape(-1)
    forward = np.asarray(action["forward"]).reshape(-1)
    jump = np.asarray(action["jump"]).reshape(-1)

    cam_first, cam_second = camera[:, 0], camera[:, 1]
    moving_forward = forward == 1

    # np.select takes the first matching condition, which preserves the
    # precedence of the original if/elif chain.
    conditions = [
        cam_first < -self.camera_margin,
        cam_first > self.camera_margin,
        cam_second > self.camera_margin,
        cam_second < -self.camera_margin,
        moving_forward & (jump == 1),
        moving_forward,
        attack == 1,
    ]
    choices = [3, 4, 5, 6, 2, 1, 0]
    return np.select(conditions, choices, default=len(self.actions)).astype(int)

# Callback



In [48]:
from stable_baselines3.common import results_plotter
from stable_baselines3.common import monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy
from stable_baselines3.common.callbacks import BaseCallback

In [49]:
class SaveOnBestTrainingRewardCallback(BaseCallback):
  """Periodically check the logged training reward and save the model when it improves.

  Args:
    check_freq: the reward is checked every `check_freq` calls of the callback.
    log_dir: directory passed to `load_results`; the model is saved to
      `<log_dir>/best_model`.
    verbose: when > 0, progress and save messages are printed.
  """

  def __init__(self, check_freq: int, log_dir: str, verbose=1):
    super().__init__(verbose)
    self.check_freq = check_freq
    self.log_dir = log_dir
    self.save_path = os.path.join(log_dir, 'best_model')
    self.best_mean_reward = -np.inf

  def _init_callback(self) -> None:
    # Spelled `_init_callback` so BaseCallback actually invokes this hook
    # (the previous `_init_callack` was never called).
    if self.save_path is not None:
      os.makedirs(self.save_path, exist_ok=True)

  def _on_step(self) -> bool:
    """Check the reward every `check_freq` calls; always returns True."""
    if self.n_calls % self.check_freq != 0:
      return True

    #Retrieve Training Reward
    x, y = ts2xy(load_results(self.log_dir), 'timesteps')
    if len(x) == 0:
      return True

    #Mean training reward over the last 100 episodes
    mean_reward = np.mean(y[-100:])
    if self.verbose > 0:
      print("Num timesteps: {}".format(self.num_timesteps))
      print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(self.best_mean_reward, mean_reward))

    #New best model, save the agent. This check is independent of `verbose`,
    #so the model is also saved when printing is disabled.
    if mean_reward > self.best_mean_reward:
      self.best_mean_reward = mean_reward
      if self.verbose > 0:
        print("Saving new best model to {}".format(self.save_path))
      self.model.save(self.save_path)
    return True

# RL Training

In [50]:
from stable_baselines3 import PPO

# Create the "MineRLTreechop-v0" environment through gym.
environment = gym.make("MineRLTreechop-v0")
env1 = Recorder(environment, './video', fps=60) #Wrap the environment in a Recorder (given './video' and fps=60) so that the run can be turned into a video.

ERROR:minerl.env.malmo.instance.5a9899:[01:22:48] [main/ERROR]: The binary patch set is missing. Either you are in a development environment, or things are not going to work!
ERROR:minerl.env.malmo.instance.5a9899:[01:22:49] [main/ERROR]: FML appears to be missing any signature data. This is not a good thing
ERROR:minerl.env.malmo.instance.5a9899:[01:24:06] [Client thread/INFO]: [STDOUT]: [ERROR] Seed specified was NONE. Expected a long (integer).
ERROR:minerl.env.malmo.instance.5a9899:[01:24:07] [Thread-6/ERROR]: Error in class 'LibraryLWJGLOpenAL'
ERROR:minerl.env.malmo.instance.5a9899:[01:24:07] [Thread-6/ERROR]: Unable to initialize OpenAL.  Probable cause: OpenAL not supported.
ERROR:minerl.env.malmo.instance.5a9899:[01:24:07] [Thread-6/WARN]: ERROR MESSAGE:
ERROR:minerl.env.malmo.instance.5a9899:[01:24:07] [Sound Library Loader/WARN]: ERROR MESSAGE:
ERROR:minerl.env.malmo.instance.5a9899:[01:24:18] [Thread-10/ERROR]: Error in class 'LibraryLWJGLOpenAL'
ERROR:minerl.env.malmo.inst

In [51]:
# NOTE: the string literal below is disabled code (Monitor logging + best-model callback).
# It is only a string expression, so log_dir, monitored_env and callback are NOT defined
# and no callback is passed to model.learn() further down.
"""
#callback addition
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)
monitored_env = monitor.Monitor(env1, log_dir)
callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)
"""
obs_wrapped_treechop = ExtractPOV(env1) #Keep only the 'pov' observation of the avatar, which is needed for stable_baselines
obs_action_wrapped_treechop = ActionShaping(obs_wrapped_treechop) #Action shaping: expose a small discrete action space; each discrete id is mapped to one of the environment's dict actions.
obs = obs_action_wrapped_treechop.reset() #Resetting the wrapped environment



model = PPO(policy="CnnPolicy", env=obs_action_wrapped_treechop, verbose=1) #PPO model with a CnnPolicy. This was just the model used by tutorials, we'll experiment with the best model later
model.learn(total_timesteps=50000) #Train for 50000 timesteps. NOTE: the recorded output shows ~7 fps on CPU and time_elapsed 6531 s at 51200 timesteps, i.e. roughly 1.8 hours, not minutes.
#env1.release() #releasing the recorded environment to actually make a video on Colab.


ERROR:minerl.env.malmo.instance.5a9899:[01:26:29] [Client thread/INFO]: [STDOUT]: STATE ERROR - multiple states in the queue.
ERROR:minerl.env.malmo.instance.5a9899:[01:26:29] [Client thread/INFO]: [STDOUT]: STATE ERROR - multiple states in the queue.


Using cpu device
Wrapping the env in a DummyVecEnv.
Wrapping the env in a VecTransposeImage.
-----------------------------------------
| time/                   |             |
|    fps                  | 6           |
|    iterations           | 1           |
|    time_elapsed         | 311         |
|    total_timesteps      | 2048        |
| train/                  |             |
|    approx_kl            | 0.035318423 |
|    clip_fraction        | 0.339       |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.04       |
|    explained_variance   | 0.352       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.117      |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.0785     |
|    value_loss           | 0.00225     |
-----------------------------------------
------------------------------------------
| time/                   |              |
|    fps                  | 7            |
|    iterations       



-----------------------------------------
| time/                   |             |
|    fps                  | 7           |
|    iterations           | 4           |
|    time_elapsed         | 1133        |
|    total_timesteps      | 8192        |
| train/                  |             |
|    approx_kl            | 0.035145633 |
|    clip_fraction        | 0.37        |
|    clip_range           | 0.2         |
|    entropy_loss         | -2.03       |
|    explained_variance   | 0.221       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.113      |
|    n_updates            | 30          |
|    policy_gradient_loss | -0.0798     |
|    value_loss           | 0.00299     |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 7           |
|    iterations           | 5           |
|    time_elapsed         | 1377        |
|    total_timesteps      | 10240 



-----------------------------------------
| time/                   |             |
|    fps                  | 7           |
|    iterations           | 8           |
|    time_elapsed         | 2173        |
|    total_timesteps      | 16384       |
| train/                  |             |
|    approx_kl            | 0.062750615 |
|    clip_fraction        | 0.482       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.97       |
|    explained_variance   | -1.81       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.113      |
|    n_updates            | 70          |
|    policy_gradient_loss | -0.0768     |
|    value_loss           | 0.00329     |
-----------------------------------------
-----------------------------------------
| time/                   |             |
|    fps                  | 7           |
|    iterations           | 9           |
|    time_elapsed         | 2432        |
|    total_timesteps      | 18432 



----------------------------------------
| time/                   |            |
|    fps                  | 7          |
|    iterations           | 12         |
|    time_elapsed         | 3243       |
|    total_timesteps      | 24576      |
| train/                  |            |
|    approx_kl            | 0.13704538 |
|    clip_fraction        | 0.564      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.72      |
|    explained_variance   | -3.23      |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0715    |
|    n_updates            | 110        |
|    policy_gradient_loss | -0.0725    |
|    value_loss           | 0.00236    |
----------------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 7          |
|    iterations           | 13         |
|    time_elapsed         | 3491       |
|    total_timesteps      | 26624      |
| train/        



-----------------------------------------
| time/                   |             |
|    fps                  | 7           |
|    iterations           | 16          |
|    time_elapsed         | 4225        |
|    total_timesteps      | 32768       |
| train/                  |             |
|    approx_kl            | 0.052652217 |
|    clip_fraction        | 0.497       |
|    clip_range           | 0.2         |
|    entropy_loss         | -1.93       |
|    explained_variance   | 0.138       |
|    learning_rate        | 0.0003      |
|    loss                 | -0.136      |
|    n_updates            | 150         |
|    policy_gradient_loss | -0.0952     |
|    value_loss           | 0.000362    |
-----------------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 7          |
|    iterations           | 17         |
|    time_elapsed         | 4464       |
|    total_timesteps      | 34816      



----------------------------------------
| time/                   |            |
|    fps                  | 7          |
|    iterations           | 20         |
|    time_elapsed         | 5229       |
|    total_timesteps      | 40960      |
| train/                  |            |
|    approx_kl            | 0.10825312 |
|    clip_fraction        | 0.595      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.6       |
|    explained_variance   | 0.0855     |
|    learning_rate        | 0.0003     |
|    loss                 | -0.125     |
|    n_updates            | 190        |
|    policy_gradient_loss | -0.095     |
|    value_loss           | 0.000167   |
----------------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 7          |
|    iterations           | 21         |
|    time_elapsed         | 5484       |
|    total_timesteps      | 43008      |
| train/        



----------------------------------------
| time/                   |            |
|    fps                  | 7          |
|    iterations           | 24         |
|    time_elapsed         | 6293       |
|    total_timesteps      | 49152      |
| train/                  |            |
|    approx_kl            | 0.06912307 |
|    clip_fraction        | 0.598      |
|    clip_range           | 0.2        |
|    entropy_loss         | -1.74      |
|    explained_variance   | 0.1        |
|    learning_rate        | 0.0003     |
|    loss                 | -0.0983    |
|    n_updates            | 230        |
|    policy_gradient_loss | -0.103     |
|    value_loss           | 0.000108   |
----------------------------------------
----------------------------------------
| time/                   |            |
|    fps                  | 7          |
|    iterations           | 25         |
|    time_elapsed         | 6531       |
|    total_timesteps      | 51200      |
| train/        

<stable_baselines3.ppo.ppo.PPO at 0x79703ad4df60>

# Plotting the Training

In [52]:
# NOTE: plotting is disabled — the string literal below is never executed. It refers to
# log_dir, which is only assigned inside the disabled callback setup of the training cell.
"""
results_plotter.plot_results([log_dir], 5000, results_plotter.X_TIMESTEPS, "MineRL RL Training")
plt.show()
"""
# Save the trained model under the path "MineRL_RL_Model".
model.save("MineRL_RL_Model")

'\nresults_plotter.plot_results([log_dir], 5000, results_plotter.X_TIMESTEPS, "MineRL RL Training")\nplt.show()\n'

# RL Model Implementation

In [55]:
# Evaluation budget: number of episodes and the step cap per episode.
N_EVAL_EPISODES = 5
MAX_STEPS_PER_EPISODE = 5000

total_reward = 0  # reward accumulated over all evaluation episodes
try:
  for episode in range(N_EVAL_EPISODES):
    obs = obs_action_wrapped_treechop.reset() #Start every episode from a fresh reset
    for i in range(MAX_STEPS_PER_EPISODE):
      action, _states = model.predict(obs)
      obs, rewards, done, info = obs_action_wrapped_treechop.step(action)
      total_reward += rewards
      if done:
        # The episode is over; stop stepping it and move on to the next reset.
        break
finally:
  # Release the recorder even when the loop is interrupted or fails, so the
  # video on Colab is still produced.
  env1.release()


KeyboardInterrupt: ignored