In [None]:
import gymnasium as gym
import numpy as np
from gymnasium import spaces
from torch.utils.tensorboard.writer import SummaryWriter


class CPUSchedulerEnv(gym.Env):
    def __init__(self, jobs_data):
        super(CPUSchedulerEnv, self).__init__()

        self.jobs_data = jobs_data
        self.num_jobs = len(jobs_data)
        self.current_time = 0
        self.current_job = None
        self.waiting_jobs = []
        self.completed_jobs = []

        # Define action and observation spaces
        self.action_space = spaces.Discrete(
            self.num_jobs + 1
        )  # +1 for "do nothing" action

        # Observation space: [current_time, current_job_features, waiting_job_features]
        self.observation_space = spaces.Box(
            low=0,
            high=np.finfo(np.float32).max,
            shape=(1 + 5 + 5 * self.num_jobs,),
            dtype=np.float32,
        )

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.current_time = 0
        self.current_job = None
        self.waiting_jobs = self.jobs_data.copy()
        self.completed_jobs = []
        return self._get_observation(), {}

    def step(self, action):
        reward = 0
        done = False

        if action == self.num_jobs:  # "do nothing" action
            self.current_time += 1
        elif self.current_job is None and self.waiting_jobs:
            self.current_job = self.waiting_jobs.pop(0)
        elif self.waiting_jobs:
            # Handle preemption if applicable
            if self.current_job["Preemptive"] == 1:
                self.waiting_jobs.append(self.current_job)
                self.current_job = self.waiting_jobs.pop(0)

        # Progress current job
        if self.current_job is not None:
            self.current_job["Burst time"] -= 1
            if self.current_job["Burst time"] <= 0:
                self.completed_jobs.append(self.current_job)
                self.current_job = None

        # Calculate reward (example: negative waiting time)
        reward = -sum(
            [
                max(0, self.current_time - job["Arrival Time"])
                for job in self.waiting_jobs
            ]
        )

        # Check if all jobs are completed
        if len(self.completed_jobs) == self.num_jobs:
            done = True

        self.current_time += 1

        return self._get_observation(), reward, done, False, {}

    def _get_observation(self):
        obs = [float(self.current_time)]

        # Current job features
        if self.current_job is not None:
            obs.extend(
                [
                    float(self.current_job["Burst time"]),
                    float(self.current_job["Arrival Time"]),
                    float(self.current_job["Preemptive"]),
                    float(self.current_job["Resources"]),
                    float(self.current_time - self.current_job["Arrival Time"]),
                ]
            )
        else:
            obs.extend([0.0, 0.0, 0.0, 0.0, 0.0])

        # Waiting jobs features
        for job in self.waiting_jobs:
            obs.extend(
                [
                    float(job["Burst time"]),
                    float(job["Arrival Time"]),
                    float(job["Preemptive"]),
                    float(job["Resources"]),
                    float(self.current_time - job["Arrival Time"]),
                ]
            )

        # Pad with zeros if there are fewer waiting jobs than total jobs
        obs.extend([0.0] * 5 * (self.num_jobs - len(self.waiting_jobs)))

        return np.array(obs, dtype=np.float32)

In [6]:
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import (
    EvalCallback,
    StopTrainingOnRewardThreshold,
)
from stable_baselines3.common.monitor import Monitor
import matplotlib.pyplot as plt

# Assuming CPUSchedulerEnv is defined as in the previous artifact
jobs_data = [
    {
        "Job Id": 247,
        "Burst time": 199,
        "Arrival Time": 0.41,
        "Preemptive": 0,
        "Resources": 8,
    },
    {
        "Job Id": 29,
        "Burst time": 193,
        "Arrival Time": 0.5925,
        "Preemptive": 1,
        "Resources": 2,
    },
    {
        "Job Id": 170,
        "Burst time": 75,
        "Arrival Time": 0.36,
        "Preemptive": 1,
        "Resources": 4,
    },
    {
        "Job Id": 164,
        "Burst time": 42,
        "Arrival Time": 0.9725,
        "Preemptive": 0,
        "Resources": 8,
    },
    {
        "Job Id": 312,
        "Burst time": 257,
        "Arrival Time": 0.6125,
        "Preemptive": 0,
        "Resources": 4,
    },
    {
        "Job Id": 36,
        "Burst time": 131,
        "Arrival Time": 0.6775,
        "Preemptive": 0,
        "Resources": 8,
    },
    {
        "Job Id": 99,
        "Burst time": 192,
        "Arrival Time": 0.8675,
        "Preemptive": 1,
        "Resources": 4,
    },
    {
        "Job Id": 30,
        "Burst time": 92,
        "Arrival Time": 0.435,
        "Preemptive": 1,
        "Resources": 8,
    },
]

# Create and wrap the environment
# env = DummyVecEnv([lambda: CPUSchedulerEnv(jobs_data)])

env = DummyVecEnv([lambda: Monitor(CPUSchedulerEnv(jobs_data))])

eval_env = DummyVecEnv([lambda: Monitor(CPUSchedulerEnv(jobs_data))])

tb_writer = SummaryWriter(log_dir="./tensorboard_logs/")


eval_callback = EvalCallback(
    eval_env,
    best_model_save_path="./logs/",
    log_path="./logs/",
    eval_freq=500,
    deterministic=True,
    render=False,
    callback_on_new_best=None,
    verbose=1,
)

# Initialize the agent
model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=0.0003,
    n_steps=2048,
    batch_size=64,
    n_epochs=10,
    gamma=0.99,
    tensorboard_log="./tensorboard_logs/",
    device="cuda",
)

# Train the agent
total_timesteps = 200000
# model.learn(total_timesteps=total_timesteps)
model.learn(
    total_timesteps=total_timesteps,
    callback=eval_callback,
    tb_log_name="ppo_cpu_scheduler",
)


# Evaluate the trained agent
mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)
print(f"Mean reward: {mean_reward:.2f} +/- {std_reward:.2f}")

# Save the trained model
model.save("ppo_cpu_scheduler")

# Plotting training progress
results = np.load("./logs/evaluations.npz")
x = results["timesteps"]
y = results["results"]

plt.figure(figsize=(10, 5))
plt.plot(x, y)
plt.title("Training Progress")
plt.xlabel("Timesteps")
plt.ylabel("Mean Reward")
plt.savefig("training_progress.png")
plt.close()

# Print final metrics
print(f"Final evaluation over {len(y)} episodes:")
print(f"Mean reward: {y[-1].mean():.2f}")
print(f"Std of reward: {y[-1].std():.2f}")

tb_writer.close()

# To load the model later:
# loaded_model = PPO.load("ppo_cpu_scheduler")

Using cuda device
Logging to ./tensorboard_logs/ppo_cpu_scheduler_4
Eval num_timesteps=500, episode_reward=-160735.97 +/- 321387.55
Episode length: 143.20 +/- 270.40
----------------------------------
| eval/              |           |
|    mean_ep_length  | 143       |
|    mean_reward     | -1.61e+05 |
| time/              |           |
|    total_timesteps | 500       |
----------------------------------
New best mean reward!
Eval num_timesteps=1000, episode_reward=-56.90 +/- 0.00
Episode length: 9.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 9        |
|    mean_reward     | -56.9    |
| time/              |          |
|    total_timesteps | 1000     |
---------------------------------
New best mean reward!
Eval num_timesteps=1500, episode_reward=-124.99 +/- 0.00
Episode length: 12.00 +/- 0.00
---------------------------------
| eval/              |          |
|    mean_ep_length  | 12       |
|    mean_reward     | -125    

In [None]:
from stable_baselines3 import PPO


def evaluate_scheduler(env, model, n_episodes=100):
    total_reward = 0
    for _ in range(n_episodes):
        obs, _ = env.reset()
        done = False
        episode_reward = 0
        while not done:
            action, _ = model.predict(obs, deterministic=True)
            obs, reward, done, _, _ = env.step(action)
            episode_reward += reward
        total_reward += episode_reward
    return total_reward / n_episodes


# Load the trained model
model = PPO.load("ppo_cpu_scheduler")

# Create a new environment for evaluation
eval_env = CPUSchedulerEnv(jobs_data)

# Evaluate the model
avg_reward = evaluate_scheduler(eval_env, model)
print(f"Average reward over 100 episodes: {avg_reward:.2f}")

# You can also implement and evaluate traditional scheduling algorithms here
# for comparison, such as FCFS, SJF, or Round Robin.