In [None]:
# Imports
# Gymnasium imports
import gymnasium as gym 
from gymnasium import Env
from gymnasium.spaces import Discrete, Box, Dict, Tuple, MultiBinary, MultiDiscrete 

import networkx as nx
from networkx.drawing.nx_agraph import graphviz_layout

# Import helpers
import numpy as np
import pandas as pd
import random
import os
import json
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import pickle

from datetime import datetime

from collections import deque

# Import stable baselines
from stable_baselines3 import PPO, A2C, DDPG
from stable_baselines3.common.vec_env import VecFrameStack, DummyVecEnv, VecNormalize
from stable_baselines3.common.env_util import make_vec_env, SubprocVecEnv
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.noise import NormalActionNoise

# Import tensorflow
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import TensorBoard

# Import custom classes
from Environment.env_v9 import *
from Functions.visualization_functions import *

In [None]:
# Configuration of the network.
# The JSON file is kept as raw text (file.read(), not json.load); make_env
# forwards this string unchanged to SS_Mngmt_Env as `network_config`.
with open('./Config/network_config_v1.json') as file:
    network_config = file.read()

EP_LENGTH = 100 # Length of the episode; forwarded to SS_Mngmt_Env as EP_LENGTH in make_env

In [None]:
def load_config(config_file):
    """Read a JSON file and return its parsed content.

    Args:
        config_file: Path of the JSON file to read.

    Returns:
        The deserialized JSON document.
    """
    with open(config_file, "r") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def make_env(config_file="config.json"):
    """Build a monitored SS_Mngmt_Env configured for DDPG.

    The cost and demand parameters are read from ``config_file``; the network
    description and episode length come from the module-level
    ``network_config`` and ``EP_LENGTH``.

    Args:
        config_file: Path of the JSON file holding the environment parameters.
            Every key listed in ``forwarded_keys`` below must be present.

    Returns:
        The environment wrapped in ``MultiDiscreteToBoxWrapper`` and then in
        ``Monitor``.

    Raises:
        KeyError: If one of the forwarded keys is missing from the config.
    """
    settings = load_config(config_file)

    # Parameters copied one-to-one from the config file into the constructor
    forwarded_keys = (
        "stockout_cost",
        "order_cost",
        "item_cost",
        "stock_cost",
        "item_prize",
        "progressive_stock_cost",
        "stock_out_max",
        "order_quantities",
        "demand_mean",
        "demand_std",
        "demand_noise",
        "demand_noise_std",
        "demand_prob",
    )

    base_env = SS_Mngmt_Env(
        network_config=network_config,
        EP_LENGTH=EP_LENGTH,
        render_mode="human",
        model_type="DDPG",
        **{key: settings[key] for key in forwarded_keys},
    )

    # Box-action wrapper first (presumably converts the MultiDiscrete action
    # space to a Box - confirm in the wrapper), then Monitor
    box_env = MultiDiscreteToBoxWrapper(base_env)
    return Monitor(box_env)

# Build one monitored environment and run the Stable-Baselines3 environment
# checker on it. This module-level `env` is also referenced inside
# optimize_hyperparams.
env = make_env("./Config/env_config_v0.json")
check_env(env, warn=True)

In [None]:
# Single-environment vectorized wrapper with observation and reward
# normalization; normalized observations are clipped to 10.0.
vec_env = VecNormalize(
    DummyVecEnv([lambda: make_env("./Config/env_config_v0.json")]),
    norm_obs=True,
    norm_reward=True,
    clip_obs=10.0,
)

# Output locations for the DDPG models and logs
model_path, log_path = (
    os.path.join('Training', 'Models', 'DDPG'),
    os.path.join('Training', 'Logs', 'DDPG'),
)

In [None]:
def optimize_hyperparams(trial):
    """Optuna objective: train a DDPG agent with sampled hyperparameters and score it.

    Samples learning rate, tau, gamma, replay-buffer size, batch size and the
    standard deviation of the Gaussian action noise, trains a DDPG model on the
    module-level ``vec_env`` for 200_000 timesteps and evaluates it for five
    episodes.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean episode reward over the evaluation episodes (to be maximized).
    """
    # Suggest the most important hyperparameters
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    tau = trial.suggest_float("tau", 1e-3, 1e-1, log=True)
    gamma = trial.suggest_float("gamma", 0.9, 0.999)
    buffer_size = trial.suggest_int("buffer_size", 50000, 500000, step=50000)
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128, 256])
    noise_std = trial.suggest_float("noise_std", 0.1, 0.5)

    # Define action noise (one independent Gaussian per action dimension)
    n_actions = env.action_space.shape[0]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=noise_std * np.ones(n_actions))

    # Create the DDPG model with static and optimized parameters
    # NOTE(review): vec_env is shared by every trial, so its running
    # normalization statistics carry over from one trial to the next.
    model = DDPG(
        "MultiInputPolicy",
        vec_env,
        verbose=0,
        learning_rate=lr,  # Optimized
        buffer_size=buffer_size,  # Optimized
        batch_size=batch_size,  # Optimized
        tau=tau,  # Optimized
        gamma=gamma,  # Optimized
        train_freq=(1, "episode"),  # Static
        gradient_steps=1,  # Static
        learning_starts=100,  # Static
        tensorboard_log=log_path,  # Static
        action_noise=action_noise,  # Optimized
    )

    # Train the model
    model.learn(total_timesteps=200_000)

    # Evaluate on the same normalized environment the policy was trained on.
    # Evaluating on the raw `env` would feed the policy un-normalized
    # observations it never saw during training. The statistics are frozen and
    # reward normalization is switched off for the evaluation, then restored
    # so the next trial trains under the original settings.
    previous_training = vec_env.training
    previous_norm_reward = vec_env.norm_reward
    vec_env.training = False
    vec_env.norm_reward = False
    try:
        rewards, _ = evaluate_policy(model, vec_env, n_eval_episodes=5, return_episode_rewards=True)
    finally:
        vec_env.training = previous_training
        vec_env.norm_reward = previous_norm_reward

    # Return the average reward as the objective to maximize
    return sum(rewards) / len(rewards)

# Run the hyperparameter optimization.
# direction="maximize": the objective returns an average episode reward, so higher is better.
study = optuna.create_study(direction="maximize")
# Each of the 25 trials calls optimize_hyperparams, which trains a new DDPG model.
study.optimize(optimize_hyperparams, n_trials=25)

# Print the best parameters
print("Best parameters:", study.best_params)

In [None]:
now = datetime.now()

# Build the timestamp outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
timestamp = now.strftime("%Y-%m-%d_%H_%M")

# Make sure the target directory exists so the dump cannot fail after the
# long optimization run.
os.makedirs("./Training", exist_ok=True)

# Save the study to a file
# NOTE(review): the file name says "PPO" although this study tunes DDPG;
# the name is left unchanged here - confirm whether it should be renamed.
with open(f"./Training/PPO_optuna_study_{timestamp}.pkl", "wb") as f:
    pickle.dump(study, f)