In [18]:
import numpy as np
from sb3_contrib import MaskablePPO
from simulator import Simulator
from gym_env import Environment
from heuristics import random_policy
import torch

config_type = "high_utilization"  # Change this to "low_utilization" or "high_utilization" as needed

# Number of states to collect and the sampling stride (in environment steps).
NUM_STATES = 100
SAMPLE_EVERY = 100

# Load the trained PPO model
model_path = f"models/PPO/{config_type}/{config_type}_best.zip"
model = MaskablePPO.load(model_path)

# Create the environment (matching the training config)
simulator = Simulator(config_type=config_type, nr_cases=1000)
env = Environment(simulator)

# Observations sampled while a random policy drives the environment, plus the
# action mask that was valid in each of those states. The mask has to be
# captured at sampling time: env.action_masks() reflects the environment's
# *current* state, so querying it later would pair every observation with the
# mask of whatever state the environment ended up in.
random_states = []
random_masks = []

step_count = 0
while len(random_states) < NUM_STATES:
    done = False
    env.reset()
    # Stop mid-episode once enough states are collected; otherwise a single
    # long episode could gather far more than NUM_STATES samples.
    while not done and len(random_states) < NUM_STATES:
        action = random_policy(simulator)
        # NOTE(review): assumes the 5-tuple follows the Gymnasium convention
        # (obs, reward, terminated, truncated, info) -- confirm in gym_env.
        obs, reward, terminated, truncated, _ = env.step(action)
        done = bool(terminated) or bool(truncated)
        step_count += 1
        if step_count % SAMPLE_EVERY == 0:
            random_states.append(obs.copy())
            random_masks.append(np.array(env.action_masks(), dtype=bool))

print(random_states)


# Human-readable names for the action indices (kept for reference/lookup).
actions = {0: "spt_policy",
          1: "fifo_policy",
          2: "shortest_queue_policy",
          3: "longest_queue_policy",
          4: "postpone"}
# How often each action index is the policy's most probable choice.
track_actions = {0:0,
                  1:0,
                  2:0,
                  3:0,
                  4:0}

# Predict actions and print probabilities for each random state
for i, (obs, mask) in enumerate(zip(random_states, random_masks)):
    # Add a batch dimension and place the observation on the policy's device.
    obs_tensor = torch.as_tensor(
        obs, dtype=torch.float32, device=model.device
    ).unsqueeze(0)
    # Batch-shaped mask recorded for this exact state.
    mask_batch = mask.reshape(1, -1)
    # Get masked action probabilities from the policy; without the mask the
    # distribution can put probability mass on invalid actions.
    with torch.no_grad():
        dist = model.policy.get_distribution(obs_tensor, action_masks=mask_batch)
        probs = dist.distribution.probs.cpu().numpy().flatten()
    action = int(np.argmax(probs))
    track_actions[action] += 1
    print(f"Sample {i+1}:")
    print("Observation:", obs)
    print("Predicted action:", action)
    print("Probabilities:", probs)
    print("-" * 40)
print("Action counts:", track_actions)

Episode 0 completed. Action counts: {'spt_policy': 0, 'fifo_policy': 0, 'shortest_queue_policy': 0, 'longest_queue_policy': 0, 'postpone': 0}
Total reward: 0. Total cycle time: 0
Episode 1 completed. Action counts: {'spt_policy': 0, 'fifo_policy': 0, 'shortest_queue_policy': 0, 'longest_queue_policy': 0, 'postpone': 0}
Total reward: 106.0719781770076. Total cycle time: 17976.371858905204
Episode 2 completed. Action counts: {'spt_policy': 0, 'fifo_policy': 0, 'shortest_queue_policy': 0, 'longest_queue_policy': 0, 'postpone': 0}
Total reward: 121.94777379255936. Total cycle time: 14264.943788374632
Episode 3 completed. Action counts: {'spt_policy': 0, 'fifo_policy': 0, 'shortest_queue_policy': 0, 'longest_queue_policy': 0, 'postpone': 0}
Total reward: 59.160138906912024. Total cycle time: 31901.13089756142
Episode 4 completed. Action counts: {'spt_policy': 0, 'fifo_policy': 0, 'shortest_queue_policy': 0, 'longest_queue_policy': 0, 'postpone': 0}
Total reward: 68.52315850221399. Total cyc