In [1]:
### REINFORCEMENT LEARNING I ###
### TRAIN, SAVE, EVALUATE MODEL ###

import gym
import stable_baselines3 as sb
from stable_baselines3.common.callbacks import EvalCallback

total_timesteps = 1e2
env = gym.make('Steel-v0')
# Callback for best model
best_callback = EvalCallback(env, best_model_save_path='./steel/callback/model',
                        log_path='./steel/callback/model', eval_freq=1000,
                        deterministic=True, render=False)
model = sb.PPO('MlpPolicy', env, tensorboard_log="./steel/tensorboard/")
model.learn(total_timesteps=total_timesteps, tb_log_name='model', callback = best_callback)

Eval num_timesteps=1000, episode_reward=57.80 +/- 15.34
Episode length: 100.00 +/- 0.00
New best mean reward!
Eval num_timesteps=2000, episode_reward=55.20 +/- 20.86
Episode length: 100.00 +/- 0.00


<stable_baselines3.ppo.ppo.PPO at 0x1e020102190>

In [1]:
### REINFORCEMENT LEARNING II ###
### TRY AND EVALUATE MY MODEL ###
import pandas as pd
import stable_baselines3 as sb
import gym
import numpy as np

env = gym.make('Steel-v0')
model = sb.PPO.load('./steel/callback/model/best_model', env = env)
# Initilaize Reward
result_df = pd.DataFrame(np.nan, index=range(0,100), columns=['RM', 'PM', 'Quality', 'Reward'])
# Set iterations
iterations = 1
for i in range(iterations):
    # Initialize episode
    store = []
    obs = env.reset()
    done = False
    store.append([0, obs[0], env.breakdown, obs[1], 0, done])
    # Compute one episode
    while not done:
        # Get best action for state
        action, _state = model.predict(obs, deterministic=True)
        # Compute next state
        obs, reward, done, info = env.step(action)
        # Store results of this episode
        store.append([action, obs[0], env.breakdown, obs[1], reward, done])
    eps_df = pd.DataFrame(store, columns=['action', 'health', 'breakdown', 'quality', 'reward', 'done'])
    # Calculate nr. of reactive maintenance interventions by counting health 'resets' and substracting PM actions
    result_df.iloc[i]['RM'] = sum(eps_df['breakdown']==True)
    # Calculate nr. of preventive maintenance interventions
    result_df.iloc[i]['PM'] = sum(eps_df['action']==0)
    # Calculate quality
    result_df.iloc[i]['Quality'] = sum(eps_df['quality'])
    # Calculate reward
    result_df.iloc[i]['Reward'] = sum(eps_df['reward'])


print("The average number of reactive maintenance interventions per episode is: ", result_df['RM'].mean())
print("The average number of preventive maintenance interventions per episode is: ", result_df['PM'].mean())
print("The average sum of non-faulty bars per episode is: ", result_df['Quality'].mean())
print("The average reward per episode is: ", result_df['Reward'].mean())


The average number of reactive maintenance interventions per episode is:  4.0
The average number of preventive maintenance interventions per episode is:  1.0
The average sum of non-faulty bars per episode is:  57.0
The average reward per episode is:  -24.0
