In [None]:
import gymnasium as gym
import or_gym
import numpy as np
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense, Flatten, Reshape
from tensorflow.keras.optimizers import Adam
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

# Binary Knapsack

In [None]:
# Small hand-written knapsack instance: five items, capacity 15.
env_config = dict(
    N=5,                                      # number of items
    max_weight=15,                            # knapsack capacity
    item_weights=np.array([1, 12, 2, 1, 4]),  # weight of item i
    item_values=np.array([2, 4, 2, 1, 10]),   # value of item i
    mask=False,  # NOTE(review): presumably disables the action-mask observation — confirm in or_gym
)

# Build the environment from the config and keep the first observation around.
env = or_gym.make('Knapsack-v0', env_config=env_config)
initial_state = env.reset()

The state variable should be read as follows:

    Observation:
        Type: Tuple, Discrete
        0: list of item weights
        1: list of item values
        2: maximum weight of the knapsack
        3: current weight in knapsack

    Actions:
        Type: Discrete
        0: Place item 0 into knapsack
        1: Place item 1 into knapsack
        2: ...

    Reward:
        Value of item successfully placed into knapsack or 0 if the item
        doesn't fit, at which point the episode ends.

    Starting State:
        Lists of available items and empty knapsack.

    Episode Termination:
        Full knapsack or selection that puts the knapsack over the limit.

In [None]:
# Pull out the two space descriptors that the model and agent cells rely on.
states = env.observation_space  # observation space object
actions = env.action_space.n    # size of the discrete action space

# Display the observation shape as the cell output.
states.shape

Simulate random item selection until the episode ends (each iteration of the loop below is one step of a single episode)

In [None]:
# Roll out a single episode with uniformly random actions, printing what the
# environment returns at every step.
env.reset()
step = 0
done = False
while not done:
    step += 1
    # Every iteration is one step of the same episode (the loop stops as soon
    # as `done` is set), so the counter is labelled as a step, not an episode.
    print(f"Step: {step}")
    action = np.random.randint(actions)  # random item index in [0, actions)
    print(f"Take element number: {action}")
    # step() is unpacked as the 4-tuple (observation, reward, done, info).
    next_state, reward, done, info = env.step(action)
    print(f"Reward: {reward}")
    print(next_state)
    # NOTE(review): if render() prints by itself and returns None, this also
    # prints "None" — confirm against the or_gym implementation.
    print(env.render())

As we can see in the detailed printout of the observations, only the last value changes from step to step. This value is equal to the current total weight of the knapsack. The observation returned by the environment gives no indication of the total value collected, which is instead reported by the render() function.

# Keras model for the knapsack decision environment


In [None]:
# Q-network: takes an observation window and outputs one Q-value per action.
model = Sequential()
# keras-rl passes the network batches shaped (batch, window_length, *obs_shape),
# so the declared input shape needs the leading window axis. The 1 must match
# the window_length given to the agent's SequentialMemory.
model.add(Dense(24, activation='relu', input_shape=(1,) + tuple(states.shape)))
# Collapse the window/observation axes into a single feature vector.
model.add(Flatten())
# Linear output head: unbounded Q-value estimates, one per action.
model.add(Dense(actions, activation='linear'))
model.summary()

In [None]:
# Show the input shape the first layer of the network was built with.
model.get_layer(index=0).get_input_shape_at(0)

# Agent training with Keras RL

In [None]:
# Exploration policy used by the agent to pick actions from the Q-values.
policy = BoltzmannQPolicy()
# Experience replay buffer; window_length=1 means each state handed to the
# network is a single observation (the model's leading input axis must match).
memory = SequentialMemory(limit=50000, window_length=1)
# Assemble the DQN agent around the Q-network defined above.
dqn = DQNAgent(model=model, memory=memory, policy=policy,
               nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
# `lr` is a deprecated alias in tf.keras optimizers; `learning_rate` is the
# supported keyword and keeps the same value.
dqn.compile(Adam(learning_rate=1e-3), metrics=['mae'])
# Train on the knapsack environment; this is the long-running step of the notebook.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=1)