In [None]:
import os

import ray
from ray.rllib import agents
from ray import tune
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.utils import try_import_tf
from gymnasium import spaces
import or_gym
from or_gym.utils import create_env
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Prepare TensorFlow and Ray

In [None]:
# Establish the TensorFlow API connection through RLlib's import helper.
# The helper returns three objects, bound here as tf_api, tf_original and
# tf_version; the model code below uses tf_api for all tensor operations.
tf_api, tf_original, tf_version = try_import_tf(error=True)

# Disable Tune's automatic callback syncer (needed on Windows).
# Tune reads this flag from the process environment, so it must be exported
# as an environment variable; assigning a plain Python variable with the same
# name has no effect on Tune.
os.environ["TUNE_DISABLE_AUTO_CALLBACK_SYNCER"] = "1"

# Knapsack environment with action masking

Class definition: customized TensorFlow 2 model for the OR-Gym knapsack environment with action masking

In [None]:
class KP0ActionMaskModel(TFModelV2):
    """Custom RLlib TensorFlow model that applies an action mask to its logits.

    A fully connected network turns the ``state`` entry of the observation
    into an action embedding. ``forward`` combines that embedding with the
    ``avail_actions`` and ``action_mask`` entries of the same observation so
    that masked actions receive a very large negative logit.
    """

    def __init__(self, obs_space, action_space, num_outputs,
                 model_config, name, true_obs_shape=(11,),
                 action_embed_size=5, *args, **kwargs):
        """Set up the base model and the embedding sub-network.

        Args:
            obs_space, action_space, num_outputs, model_config, name:
                Forwarded unchanged to ``TFModelV2.__init__``.
            true_obs_shape: Shape of the ``state`` vector fed to the embedding
                network (default ``(11,)``, the state size of the reduced
                knapsack problem used in this notebook).
            action_embed_size: Number of outputs of the embedding network
                (default 5, the size of the action space in this notebook).
            *args, **kwargs: Forwarded to ``TFModelV2.__init__``.
        """
        super().__init__(obs_space, action_space, num_outputs,
                         model_config, name, *args, **kwargs)

        # The embedding network only sees the raw state vector, so it gets its
        # own Box observation space with the true state shape.
        embedding_obs_space = spaces.Box(0, 1, shape=true_obs_shape)
        self.action_embed_model = FullyConnectedNetwork(
            embedding_obs_space, action_space, action_embed_size,
            model_config, name + "_action_embedding")

        # Expose the sub-network's variables through this model.
        self.register_variables(self.action_embed_model.variables())

    def forward(self, input_dict, state, seq_lens):
        """Compute masked action logits from the dictionary observation.

        Reads ``avail_actions``, ``action_mask`` and ``state`` from
        ``input_dict["obs"]`` and returns ``(masked_logits, state)``; the
        ``state`` argument is passed through unchanged and ``seq_lens`` is
        not used.
        """
        obs = input_dict["obs"]
        avail_actions = obs["avail_actions"]
        action_mask = obs["action_mask"]

        # Embed the raw state; the second return value is not needed here.
        embedding, _ = self.action_embed_model({"obs": obs["state"]})

        # Insert an axis at position 1, weight by the available actions and
        # sum over axis 1 to obtain the unmasked logits.
        # NOTE(review): assumes avail_actions broadcasts against the expanded
        # embedding as intended -- confirm the tensor shapes.
        intent = tf_api.expand_dims(embedding, 1)
        weighted = avail_actions * intent
        logits = tf_api.reduce_sum(weighted, axis=1)

        # log(mask) is 0 where the mask is 1 and -inf where it is 0 (assuming
        # a 0/1 mask); clamping to float32.min keeps the value finite. After
        # a softmax the masked actions end up with probability ~0.
        log_mask = tf_api.log(action_mask)
        mask_offset = tf_api.maximum(log_mask, tf_api.float32.min)
        return logits + mask_offset, state

    def value_function(self):
        """Return the value-function output of the embedding network."""
        return self.action_embed_model.value_function()

Environment creation

In [None]:
# Name under which OR-Gym knows the knapsack environment.
env_name = 'Knapsack-v0'

# Problem instance handed to the environment: five items with the given
# weights and values, a weight limit of 15, and masking switched on.
env_config = {
    'N': 5,
    'max_weight': 15,
    'item_weights': np.array([1, 12, 2, 1, 4]),
    'item_values': np.array([2, 4, 2, 1, 10]),
    'mask': True,
}

# Build one environment instance directly through OR-Gym and report its size.
env = or_gym.make(env_name, env_config=env_config)

print("Max weight capacity:\t{}kg".format(env.max_weight))
print("Number of items:\t{}".format(env.N))

Create RLlib trainable instance

In [None]:
# Register the custom model so RLlib can look it up by the name 'kp_mask'.
ModelCatalog.register_custom_model('kp_mask', KP0ActionMaskModel)

# Register the environment so that a Trainable instance can be built later.
# ATTENTION: Tune needs an environment *creator*, not an environment instance
# like the one returned by or_gym.make(env_name). create_env() returns the
# environment class, which is wrapped in a creator function below.
env = create_env(env_name)


def _make_knapsack_env(env_context, _env_cls=env):
    """Create a fresh knapsack environment for RLlib.

    The environment class is bound as a default argument at definition time.
    The notebook later rebinds the global name ``env`` to an environment
    instance; a creator that looked ``env`` up at call time would then try to
    call that instance instead of the class.

    ``env_context`` is whatever RLlib passes to the creator; it is forwarded
    positionally, and the notebook-level ``env_config`` is passed as keyword.
    """
    return _env_cls(env_context, env_config=env_config)


tune.register_env(env_name, _make_knapsack_env)

trainer_config = {
    "model": {
        "custom_model": "kp_mask"   # must match the name used in register_custom_model above
        },
    "env_config": env_config,       # env config from (or_)gym
    #"framework" : "tfe"             # tip by rllib to enable TensorFlow eager execution
     }

# ray.shutdown() may be necessary beforehand if a previous instance is blocking.
ray.init(ignore_reinit_error=True)
trainer = agents.ppo.PPOTrainer(env=env_name, config=trainer_config)

Train the agent



In [None]:
# Build a fresh environment through the trainer's registered creator and take
# its current state as the observation to query the policy with.
env = trainer.env_creator('Knapsack-v0')
state = env.state

# Optional: zero an entry of the action mask to forbid the matching action,
# i.e. to keep the item with that index out of the knapsack.
# state['action_mask'][0] = 0

# Ask the policy for an action 10000 times on this same observation.
# NOTE: this cell only samples actions; it performs no training step.
sampled_actions = []
for _ in range(10000):
    sampled_actions.append(trainer.compute_single_action(state))
actions = np.array(sampled_actions)

# If the mask was modified above, check that the blocked action never occurs.
# print(any(actions==0))

# Tuning hyperparameters

In [None]:
# Run a Tune experiment with PPO on the registered knapsack environment.
# NOTE(review): this config does not reference the 'kp_mask' custom model or
# pass an env_config -- confirm that is intended.
tune_config = {'env': 'Knapsack-v0'}

# Stop the experiment once 10000 environment timesteps have been collected.
stop = {'timesteps_total': 10000}

results = tune.run(
    'PPO',  # algorithm to train
    config=tune_config,
    stop=stop,
)

Plot the results

In [None]:
def _find_column(columns, fragment):
    """Return the first column name that contains ``fragment``.

    Raises:
        KeyError: if no column name contains ``fragment``.
    """
    for column in columns:
        if fragment in column:
            return column
    raise KeyError("no column containing {!r}".format(fragment))


colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
df = results.dataframe()
labels = []
fig, ax = plt.subplots(2, 2, figsize=(15, 15), sharex=True)

# Loss panels: which axes shows which loss, identified by a fragment of the
# column name in progress.csv.
loss_panels = [
    (ax[0, 1], 'total_loss'),
    (ax[1, 0], 'policy_loss'),
    (ax[1, 1], 'vf_loss'),
]

# Series.items() replaces iteritems(), which was removed in pandas 2.0.
for trial_idx, (_, path) in enumerate(df['logdir'].items()):
    data = pd.read_csv(path + '/progress.csv')

    # Wrap around the colour cycle so more trials than colours cannot raise.
    color = colors[trial_idx % len(colors)]

    # Label each curve with the identifier that is actually read from the
    # file (the experiment id of the trial).
    label = 'Experiment {}'.format(data['experiment_id'][0])
    labels.append(label)

    ax[0, 0].plot(data['timesteps_total'],
                  data['episode_reward_mean'], c=color,
                  label=label)

    # Resolve each loss column by name in the frame being plotted; column
    # positions in results.dataframe() need not match those in progress.csv.
    for panel, fragment in loss_panels:
        panel.plot(data['timesteps_total'],
                   data[_find_column(data.columns, fragment)], c=color,
                   label=label)

ax[0, 0].set_ylabel('Mean Rewards')
ax[0, 0].set_title('Training Rewards by Time Step')
ax[0, 0].legend(labels=labels, loc='upper center',
        ncol=3, bbox_to_anchor=[0.75, 1.2])

# All panels share the same x data (timesteps_total), so they share the same
# x-axis label.
ax[0, 1].set_title('Total Loss by Time Step')
ax[0, 1].set_ylabel('Total Loss')
ax[0, 1].set_xlabel('Time Step')

ax[1, 0].set_title('Policy Loss by Time Step')
ax[1, 0].set_ylabel('Policy Loss')
ax[1, 0].set_xlabel('Time Step')

ax[1, 1].set_title('Value Loss by Time Step')
ax[1, 1].set_ylabel('Value Loss')
ax[1, 1].set_xlabel('Time Step')

plt.show()