In [1]:
### TRAIN, SAVE, EVALUATE MODEL ###

import gym
import stable_baselines3 as sb
from stable_baselines3.common.callbacks import EvalCallback
from stable_baselines3.common.evaluation import evaluate_policy

# Use separate environment instances for training and for periodic evaluation.
# EvalCallback resets and steps the env it is given; sharing one instance with
# the learner would interrupt the training episode in progress at every
# evaluation and leave the learner holding a stale observation.
env = gym.make('Production-v0')
eval_env = gym.make('Production-v0')

# Callback for best model: evaluates every 1000 steps and writes the best
# checkpoint and the evaluation log under ./callback/.
best_callback = EvalCallback(eval_env, best_model_save_path='./callback/',
                             log_path='./callback/', eval_freq=1000,
                             deterministic=True, render=False)

model = sb.DQN('MlpPolicy', env, tensorboard_log="./tensorboard/", gamma=0.99, learning_rate=0.01)
# total_timesteps is a step count, so pass an int rather than the float 1e6.
model.learn(total_timesteps=1_000_000, tb_log_name="DQN", callback=best_callback)
model.save("DQN_1_model")

# Evaluate the agent (the final model, not the best checkpoint) and report the
# result instead of silently discarding it.
mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print(f"Mean reward over 10 episodes: {mean_reward:.2f} +/- {std_reward:.2f}")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to ./tensorboard/DQN_3
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 100       |
|    ep_rew_mean      | -1.59e+03 |
|    exploration_rate | 0.62      |
| time/               |           |
|    episodes         | 4         |
|    fps              | 2691      |
|    time_elapsed     | 0         |
|    total_timesteps  | 400       |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 100       |
|    ep_rew_mean      | -1.73e+03 |
|    exploration_rate | 0.24      |
| time/               |           |
|    episodes         | 8         |
|    fps              | 2728      |
|    time_elapsed     | 0         |
|    total_timesteps  | 800       |
-----------------------------------




Eval num_timesteps=1000, episode_reward=-1186.20 +/- 139.08
Episode length: 100.00 +/- 0.00
-----------------------------------
| eval/               |           |
|    mean_ep_length   | 100       |
|    mean_reward      | -1.19e+03 |
| rollout/            |           |
|    exploration_rate | 0.051     |
| time/               |           |
|    total_timesteps  | 1000      |
-----------------------------------
New best mean reward!
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 100       |
|    ep_rew_mean      | -1.82e+03 |
|    exploration_rate | 0.05      |
| time/               |           |
|    episodes         | 12        |
|    fps              | 1874      |
|    time_elapsed     | 0         |
|    total_timesteps  | 1200      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 100       |
|    ep_rew_mean      | -1.85e+03 |
|    exploration_rate 

In [2]:
### LOAD MODEL ###
# This cell is meant to be runnable without re-running the training cell, so it
# imports everything it uses, including evaluate_policy.
import gym
import stable_baselines3 as sb
from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy

env = gym.make('Production-v0')
# Best Model (checkpoint written by the EvalCallback during training)
model = DQN.load('./callback/best_model', env = env)
# Last Model (uncomment to evaluate the final training state instead)
#model = DQN.load('DQN_1_model', env = env)

# Evaluate the agent; the (mean_reward, std_reward) tuple is displayed as the
# cell's output.
evaluate_policy(model, model.get_env(), n_eval_episodes=10)

Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


(-1245.3, 53.79042665753824)

In [3]:
### TRY MODEL ###
import pandas as pd

# Roll out a single episode with the deterministic policy and record each step.
# Relies on `env` and `model` from the LOAD MODEL cell and on the classic gym
# API used here: reset() returns the observation, step() returns a 4-tuple.
store = []
obs = env.reset()
done = False
# Initial row: placeholder action 0 and reward 0, since no step has been taken.
# Column mapping: obs[0] -> 'RUL', obs[2] -> 'inventory', obs[1] -> 'next_order'.
store.append([0, obs[0], obs[2], 0, done, obs[1]])
while not done:
    action, _state = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    # predict() hands the action back as a NumPy array; store a plain int so the
    # 'action' column is numeric rather than a mix of ints and array objects.
    store.append([int(action), obs[0], obs[2], reward, done, obs[1]])

store_df = pd.DataFrame(store, columns=['action', 'RUL', 'inventory', 'reward', 'done', 'next_order'])
print("Cumulative reward is: ", store_df['reward'].sum())

Cumulative reward is:  -1232
