In [1]:
#dependencies for the project
import gymnasium as gym
import gym_BinPack3D
from gym_BinPack3D.envs import Box, Rotate
import os

%matplotlib inline


In [2]:
# Make the packing game known to Gymnasium under the id 'BinPack3D-v1',
# so that later cells can build it with gym.make('BinPack3D-v1', ...).
gym.envs.register(id='BinPack3D-v1', entry_point='gym_BinPack3D.envs:PackingGame')

In [3]:
# Keyword arguments passed to gym.make('BinPack3D-v1', ...) in the training cells:
#   container_size    - size of the container in 3D
#   boxSeqGenerator   - how the boxes are generated
#   enabled_rotations - which rotations are allowed for the boxes
#   n_foreseeable_box - how many boxes are shown to the agent
#   box_set           - the set of boxes that are used in the environment

# Output folders, all located directly under the current working directory.
_CWD = os.getcwd()
DATA_DIR = os.path.join(_CWD, 'data')          # saved models and GIFs
LOG_DIR = os.path.join(_CWD, 'log')            # training Monitor logs
EVAL_LOG_DIR = os.path.join(_CWD, 'eval_log')  # EvalCallback outputs

In [4]:
import os

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt

from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.callbacks import BaseCallback


class SaveOnBestTrainingRewardCallback(BaseCallback):
    """
    Save the model whenever the mean training reward reaches a new best.

    Every ``check_freq`` calls, the results stored under ``log_dir`` are loaded,
    the rewards of the most recent episodes (at most 100) are averaged, and the
    model is saved if that average beats the best value seen so far
    (in practice, we recommend using ``EvalCallback``).

    :param check_freq: Number of callback calls between two consecutive checks.
    :param log_dir: Path to the folder where the model will be saved.
      It must contain the file created by the ``Monitor`` wrapper.
    :param verbose: Verbosity level: 0 for no output, 1 for info messages, 2 for debug messages
    """

    def __init__(self, check_freq: int, log_dir: str, verbose: int = 1):
        super().__init__(verbose)
        self.log_dir = log_dir
        self.check_freq = check_freq
        # Start below any achievable reward so the first measured mean always wins.
        self.best_mean_reward = -np.inf
        self.save_path = os.path.join(log_dir, "best_model")

    def _init_callback(self) -> None:
        """Create the ``save_path`` folder if needed."""
        if self.save_path is None:
            return
        os.makedirs(self.save_path, exist_ok=True)

    def _on_step(self) -> bool:
        """Check the training reward every ``check_freq`` calls; always returns ``True``."""
        # Only act on every ``check_freq``-th call.
        if self.n_calls % self.check_freq != 0:
            return True

        # Retrieve the training rewards recorded so far.
        timesteps, episode_rewards = ts2xy(load_results(self.log_dir), "timesteps")
        if len(timesteps) == 0:
            return True

        # Mean training reward over the last 100 episodes.
        mean_reward = np.mean(episode_rewards[-100:])
        chatty = self.verbose >= 1
        if chatty:
            print(f"Num timesteps: {self.num_timesteps}")
            print(f"Best mean reward: {self.best_mean_reward:.2f} - Last mean reward per episode: {mean_reward:.2f}")

        # Nothing to save unless the mean strictly improved.
        if not mean_reward > self.best_mean_reward:
            return True

        self.best_mean_reward = mean_reward
        if chatty:
            print("Saving new best model")
        self.model.save(self.save_path)
        return True

In [5]:
import imageio

def create_gif(model, vec_env, path, fps=5, n_steps=100):
    """
    Roll out ``model`` in ``vec_env`` and store the rendered frames as a GIF.

    The environment is reset, rendered and stepped exclusively through
    ``vec_env`` so that observations, actions and frames all come from the
    same environment. The recording contains the frame rendered right after
    the reset plus one frame after every step, i.e. up to ``n_steps + 1`` frames.

    :param model: Object exposing ``predict(obs, deterministic=True)`` that
      returns an ``(action, state)`` pair.
    :param vec_env: Environment exposing ``reset()``, ``step(action)`` and
      ``render(mode="rgb_array")``; ``step`` must return a 4-tuple.
    :param path: Destination file handed to ``imageio.mimsave``.
    :param fps: Frames per second handed to ``imageio.mimsave``.
    :param n_steps: Maximum number of environment steps to record.
    """
    frames = []
    obs = vec_env.reset()
    frame = vec_env.render(mode="rgb_array")

    for _ in range(n_steps):
        if frame is None:
            break
        frames.append(frame)
        action, _state = model.predict(obs, deterministic=True)
        obs, _reward, _done, _info = vec_env.step(action)
        frame = vec_env.render(mode="rgb_array")

    if frame is None:
        # Rendering stopped producing images; keep whatever was collected.
        print("Frame is None!!")
    else:
        # Frame rendered after the last step (or the reset frame when n_steps == 0).
        frames.append(frame)

    if not frames:
        # Nothing was rendered at all, so there is nothing to encode.
        print("No frames rendered, GIF not written")
        return

    imageio.mimsave(path, frames, fps=fps)

In [6]:
#train with ppo
# import gymnasium as gym
import matplotlib.animation as animation
from matplotlib import pyplot as plt

import numpy as np

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.evaluation import evaluate_policy

from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.results_plotter import load_results, ts2xy, plot_results
from stable_baselines3.common.callbacks import EvalCallback

# Create the output directories if they do not exist yet
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(LOG_DIR, exist_ok=True)
os.makedirs(EVAL_LOG_DIR, exist_ok=True)


def _make_ppo_env():
    """Build a fresh BinPack3D environment with the settings of the PPO experiment."""
    return gym.make('BinPack3D-v1',
                    container_size=(8, 4, 4),
                    boxSeqGenerator='fixed',
                    #enabled_rotations = [Rotate.NOOP, Rotate.XY, Rotate.XZ, Rotate.YZ],
                    enabled_rotations=[Rotate.NOOP],
                    n_foreseeable_box=3,
                    box_set=[Box(2, 1, 1), Box(1, 2, 1)])


# Training environment; Monitor writes its episode statistics into LOG_DIR.
env = Monitor(_make_ppo_env(), LOG_DIR)

# Dedicated evaluation environment. Evaluating on the training instance would
# reset and step it in the middle of a rollout and would also add the
# evaluation episodes to the training Monitor log.
eval_env = Monitor(_make_ppo_env())

vec_env = DummyVecEnv([lambda: env])
model = PPO("MultiInputPolicy", vec_env, verbose=1)

# Create the callback: check every 5000 steps
# NOTE(review): this callback is created but not passed to model.learn() below;
# only eval_callback is active during training.
callback = SaveOnBestTrainingRewardCallback(check_freq=5000, log_dir=LOG_DIR)

# Create eval callback that evaluates the agent every 1000 callback steps
# (eval_freq); the number of evaluation episodes is left at the library default.
eval_callback = EvalCallback(eval_env, best_model_save_path=EVAL_LOG_DIR,
                             log_path=EVAL_LOG_DIR, eval_freq=1000,
                             deterministic=True,
                             render=False)

timesteps = 10_000
model.learn(total_timesteps=timesteps, callback=eval_callback)

#plot training process (uncomment below to see training for each timestamp)
#plot_results([LOG_DIR], timesteps, "timesteps", "AirCargo")
#plt.show()

#save the model
model.save(os.path.join(DATA_DIR, "ppo_model"))

#load model and create gif
# PPO_GIF_DIR holds the path of the GIF file itself (name kept for compatibility).
PPO_GIF_DIR = os.path.join(DATA_DIR, "ppo.gif")
ppo_model = PPO.load(os.path.join(DATA_DIR, "ppo_model"), env=vec_env)
create_gif(ppo_model, vec_env, PPO_GIF_DIR, fps=5)

using fixed box sequence
Box to be sampled:
Box: Size 2 1 1 Position 0 0 0
Box: Size 1 2 1 Position 0 0 0
Using cuda device


  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(
  logger.warn(f"{pre} is not within the observation space.")


Eval num_timesteps=1000, episode_reward=-100.00 +/- 0.00
Episode length: 2.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 2        |
|    mean_reward     | -100     |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=2000, episode_reward=-100.00 +/- 0.00
Episode length: 2.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 2        |
|    mean_reward     | -100     |
| time/              |          |
|    total_timesteps | 2000     |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 2        |
|    ep_rew_mean     | -100     |
| time/              |          |
|    fps             | 179      |
|    iterations      | 1        |
|    time_elapsed    | 11       |
|    total_timesteps | 2048     |
----------------------

In [8]:
#train agent using stable baselines3 A2C
from stable_baselines3 import A2C

def _make_a2c_env():
    """Build a fresh BinPack3D environment with the settings of the A2C experiment."""
    return gym.make('BinPack3D-v1',
                    container_size=(8, 4, 4),
                    boxSeqGenerator='fixed',
                    enabled_rotations=[Rotate.NOOP],
                    n_foreseeable_box=1,
                    box_set=[Box(2, 1, 1), Box(2, 1, 1), Box(2, 1, 1), Box(1, 2, 1), Box(1, 2, 1)])


# Training environment
env = _make_a2c_env()
vec_env = DummyVecEnv([lambda: env])

# Dedicated evaluation environment. Evaluating on the training instance would
# reset and step it in the middle of a rollout and desynchronise training.
eval_env = _make_a2c_env()

# Create eval callback that evaluates the agent every 1000 callback steps
# (eval_freq); the number of evaluation episodes is left at the library default.
eval_callback = EvalCallback(eval_env, best_model_save_path=EVAL_LOG_DIR,
                             log_path=EVAL_LOG_DIR, eval_freq=1000,
                             deterministic=True, render=False)

model = A2C("MultiInputPolicy", vec_env, verbose=1)
model.learn(total_timesteps=10_000, callback=eval_callback)

#save the model
model.save(os.path.join(DATA_DIR, "a2c_model"))

#run A2C prediction and create gif
# A2C_GIF_DIR holds the path of the GIF file itself (name kept for compatibility).
A2C_GIF_DIR = os.path.join(DATA_DIR, "a2c.gif")
a2c_model = A2C.load(os.path.join(DATA_DIR, "a2c_model"), env=vec_env)
create_gif(a2c_model, vec_env, A2C_GIF_DIR, fps=5)

using fixed box sequence
Box to be sampled:
Box: Size 2 1 1 Position 0 0 0
Box: Size 2 1 1 Position 0 0 0
Box: Size 2 1 1 Position 0 0 0
Box: Size 1 2 1 Position 0 0 0
Box: Size 1 2 1 Position 0 0 0
Using cuda device
------------------------------------
| time/                 |          |
|    fps                | 199      |
|    iterations         | 100      |
|    time_elapsed       | 2        |
|    total_timesteps    | 500      |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | -0.00945 |
|    learning_rate      | 0.0007   |
|    n_updates          | 99       |
|    policy_loss        | -0       |
|    value_loss         | 7.04e+03 |
------------------------------------
Eval num_timesteps=1000, episode_reward=-100.00 +/- 0.00
Episode length: 2.40 +/- 0.49
------------------------------------
| eval/                 |          |
|    mean_ep_length     | 2.4      |
|    mean_reward        | -100     |
| time/                 |     



------------------------------------
| time/                 |          |
|    fps                | 199      |
|    iterations         | 300      |
|    time_elapsed       | 7        |
|    total_timesteps    | 1500     |
| train/                |          |
|    entropy_loss       | -0       |
|    explained_variance | -0.00134 |
|    learning_rate      | 0.0007   |
|    n_updates          | 299      |
|    policy_loss        | -0       |
|    value_loss         | 5.45e+03 |
------------------------------------
Eval num_timesteps=2000, episode_reward=-100.00 +/- 0.00
Episode length: 2.80 +/- 0.75
-------------------------------------
| eval/                 |           |
|    mean_ep_length     | 2.8       |
|    mean_reward        | -100      |
| time/                 |           |
|    total_timesteps    | 2000      |
| train/                |           |
|    entropy_loss       | -0        |
|    explained_variance | -0.000694 |
|    learning_rate      | 0.0007    |
|    n_updates 