# Run Agent in Environment with different Algorithms

In [1]:
# Basics
import os
import warnings

# Data 
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
import numpy as np

# Logging
import logging
import wandb
from wandb.integration.sb3 import WandbCallback

# Algorithms 
from stable_baselines3 import A2C
from stable_baselines3 import SAC
from stable_baselines3 import DDPG
from stable_baselines3 import PPO
from stable_baselines3 import TD3
from sb3_contrib import RecurrentPPO
from sb3_contrib import TQC
from sb3_contrib import TRPO
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_checker import check_env

import torch
import torch.nn as nn
from rl_zoo3 import linear_schedule

from gym import make
from gym.wrappers import RecordEpisodeStatistics

import subprocess
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

from vpp_gym.vpp_gym.utils.register_env import register_env

from stable_baselines3.common.vec_env import DummyVecEnv, SubprocVecEnv
from stable_baselines3.common.env_util import make_vec_env


## Seed Env

In [2]:
#SEED = 44

#torch.manual_seed(SEED)
#torch.cuda.manual_seed(SEED)
#torch.backends.cudnn.deterministic = True

## Register the Environment 

In [3]:
# File name of the environment configuration. It is passed as `config=` to
# register_env in the "Single Algorithm Training" cell further down.
CONFIG_NAME = "vpp_config_1_training.json"

# Registration is intentionally not done in this cell: the training cell calls
# register_env itself, using the seed chosen there.
#register_env(config=CONFIG_NAME, seed=SEED)

## Test the environment

In [4]:
# It will check your custom environment and output additional warnings if needed
#env_to_check = make('VPPBiddingEnv-TEST-v1', render_mode=None)
#check_env(env_to_check)
#env_to_check.close()

# Stable Baselines

### Train offline and sync the logs afterwards

In [5]:
# SECURITY: never hardcode a W&B API key in the notebook. A key that was ever
# committed must be considered leaked and should be rotated in the W&B account
# settings. Provide the key from outside the notebook instead, e.g.
# `export WANDB_API_KEY=...` before starting Jupyter, or run `wandb login` once.
if not os.environ.get("WANDB_API_KEY"):
    warnings.warn(
        "WANDB_API_KEY is not set; `wandb sync` will only succeed if this "
        "machine is already authenticated (e.g. via `wandb login`)."
    )

# Record runs locally only; they are uploaded afterwards with `wandb sync`.
os.environ["WANDB_MODE"] = "offline"

## Training Function

In [6]:
def train_algorithm(algorithm, seed):
    """Train one algorithm on the fast training environment and sync the W&B run.

    Reads the notebook-level globals ``ALGORITHMS``, ``HYPERPARAMS``,
    ``EXPERIMENT_TAGS`` and ``EXPERIMENT_TIMESTEPS``; the cells defining them
    must have been executed before this function is called.

    Args:
        algorithm: Key into ``ALGORITHMS`` and ``HYPERPARAMS`` (e.g. ``"R_PPO"``).
            It is also added to the W&B run tags.
        seed: Seed passed to the model constructor.

    Returns:
        A tuple ``(completed_process, model)``: the ``subprocess.CompletedProcess``
        of the ``wandb sync`` command (not a bare integer return code) and the
        trained model.

    Raises:
        KeyError: If ``algorithm`` has no entry in ``HYPERPARAMS`` or ``ALGORITHMS``.
    """
    # Resolve the configuration first so that an unknown/unconfigured algorithm
    # fails before an environment is created or a W&B run is started.
    model_params = HYPERPARAMS[algorithm]
    algorithm_cls = ALGORITHMS[algorithm]

    env = make('VPPBiddingEnv-TRAIN-FAST-v1', render_mode="fast_training")
    env = Monitor(env)
    env = RecordEpisodeStatistics(env)

    # The recurrent PPO variant needs the LSTM flavour of the multi-input policy.
    policy = 'MultiInputLstmPolicy' if algorithm == "R_PPO" else 'MultiInputPolicy'

    wandb.init(
        sync_tensorboard=True,
        project="RL-VPP-Training",
        save_code=True,
        entity="jlu237",
        tags=[algorithm] + EXPERIMENT_TAGS,
        job_type="training"
    )
    try:
        model = algorithm_cls(policy, env, verbose=0, seed=seed, **model_params)
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar=True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
    finally:
        # Always close the W&B run, even if model construction or training fails,
        # so a failed attempt does not leave a run open in the kernel.
        wandb.finish()

    # The environment is deliberately left open: the returned model still
    # references it.
    return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)

    return return_code, model


## Evaluate Function

In [7]:
def evaluate_algorithm(algorithm, model, episodes=70):
    """Run a trained model on the evaluation environment and sync the W&B run.

    Every episode consists of one ``reset`` followed by exactly one
    deterministic ``predict``/``step`` pair.
    NOTE(review): this presumably matches one-step episodes in the VPP
    environment - confirm against the environment implementation.

    Reads the notebook-level global ``EXPERIMENT_TAGS``.

    Args:
        algorithm: Algorithm name; used for the W&B tags/config and to select
            the recurrent prediction path when it equals ``"R_PPO"``.
        model: Trained model exposing ``predict``.
        episodes: Number of evaluation episodes to run (default 70).

    Returns:
        The ``subprocess.CompletedProcess`` of the ``wandb sync`` command.
    """
    eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode="human")
    eval_env = Monitor(eval_env)
    eval_env = RecordEpisodeStatistics(eval_env)

    try:
        wandb.init(
            sync_tensorboard=True,
            project="RL-VPP-Evaluation",
            save_code=True,
            entity="jlu237",
            tags=[algorithm] + EXPERIMENT_TAGS,
            job_type="eval",
            config={"algo": algorithm}
        )

        for _ in range(episodes):
            observation = eval_env.reset()
            if algorithm == "R_PPO":
                # A fresh episode: no LSTM state yet, and the episode-start flag
                # tells the recurrent policy to reset its hidden state.
                episode_starts = np.ones((1,), dtype=bool)
                action, _ = model.predict(
                    observation,
                    state=None,
                    episode_start=episode_starts,
                    deterministic=True,
                )
            else:
                action, _ = model.predict(observation, deterministic=True)
            eval_env.step(action)
    finally:
        # Release the environment and close the W&B run even when evaluation
        # fails; same order as before (environment first, then the run).
        try:
            eval_env.close()
        finally:
            wandb.finish()

    return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
    return return_code


In [8]:

# Per-algorithm keyword arguments, unpacked into the model constructor by
# train_algorithm. Keys must match the keys of ALGORITHMS.
HYPERPARAMS = {
    # TD3 - alternative 1
    "TD3": {
        "learning_rate": 0.0001811374579441975,
        "batch_size": 100,
        "buffer_size": 1000000,
        "tau": 0.08,
        "gamma": 0.99,
        "train_freq": 2,
        "gradient_steps": 1,
        "learning_starts": 10,
        "policy_delay": 1,
        "target_policy_noise": 0.3,
        "policy_kwargs": {
            "net_arch": [256, 256],
            "activation_fn": nn.ReLU,
        },
    },

    # TRPO - alternative 1
    "TRPO": {
        "learning_rate": linear_schedule(0.000017406994884083613),
        "n_steps": 10,
        "batch_size": 10,
        "gamma": 0.9999,
        "cg_max_steps": 20,
        "cg_damping": 0.5,
        "line_search_shrinking_factor": 0.7,
        "line_search_max_iter": 5,
        "n_critic_updates": 1,
        "gae_lambda": 0.9,
        "normalize_advantage": False,
        "use_sde": False,
        "target_kl": 0.02,
        "policy_kwargs": {
            "net_arch": [{"pi": [400, 400], "vf": [400, 400]}],
            "ortho_init": False,
            "activation_fn": nn.ReLU,
        },
    },

    # Recurrent PPO - alternative 1
    "R_PPO": {
        "learning_rate": linear_schedule(0.002677762425612007),
        "n_steps": 9,
        "batch_size": 9,
        "n_epochs": 10,
        "gamma": 0.99,
        "gae_lambda": 0.9,
        "clip_range": 0.4,
        "normalize_advantage": True,
        "ent_coef": 0.000002637536156684848,
        "vf_coef": 0.5221493086134156,
        "max_grad_norm": 0.6,
        "target_kl": 0.001,
        "policy_kwargs": {
            "net_arch": [{"pi": [64, 64], "vf": [64, 64]}],
            "full_std": True,
            "activation_fn": nn.Tanh,
            "ortho_init": True,
            "log_std_init": -2.3151490699717723,
        },
        "sde_sample_freq": 3,
    },
}


In [9]:
# Maps the algorithm names used throughout this notebook to the classes that
# implement them; train_algorithm looks the class up by name.
ALGORITHMS = dict(
    A2C=A2C,
    DDPG=DDPG,
    PPO=PPO,
    SAC=SAC,
    TD3=TD3,
    TQC=TQC,
    TRPO=TRPO,
    R_PPO=RecurrentPPO,
)

In [10]:
# Number of timesteps passed to model.learn(total_timesteps=...) in train_algorithm.
EXPERIMENT_TIMESTEPS = 11140  #2785 #5570  #11140 #22280

# Default W&B tags. The timestep tag is derived from EXPERIMENT_TIMESTEPS so the
# two can no longer drift apart when the step budget is changed.
EXPERIMENT_TAGS = [
    "config1_training",
    str(EXPERIMENT_TIMESTEPS),
]


## Train all Algorithms

## Single Algorithm Training (FINAL)

In [11]:
# upload wandb logs 

#!wandb sync wandb/latest-run

## Single Algorithm Eval (FINAL)

## Single Algorithm Training

In [12]:
#algorithm_list = ["R_PPO", "TRPO", "TD3"]

# Algorithm to train (a key of ALGORITHMS / HYPERPARAMS) and the seed for this run.
ALGORITHM = "R_PPO"
seed = 44

# Tags attached to the W&B runs; train_algorithm and evaluate_algorithm read
# this global, so it replaces the default tag list defined earlier.
EXPERIMENT_TAGS = [
    "config1_training",
    f"{EXPERIMENT_TIMESTEPS} ep.",
    f"S{seed}",
]

# Seed torch and register the environment with the same seed before training.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
register_env(config=CONFIG_NAME, seed=seed)

print(f"now training {ALGORITHM}")
return_code, model = train_algorithm(ALGORITHM, seed)
print(f"training finished with : {return_code}")

now training R_PPO


Output()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step_activ_count,▁▁▂▁▇▁▆▆▁▃▁▁▂▆▇█▅▇▃▂▅▂▃▆▂▃▆▆▁▂▃▂▂▇▇▇▇▇▁▇
step_activ_ratio,▁▁█▁█▁██▁█▁▁████████████████▁█████████▁█
step_lost_count,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_not_activ_count,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_not_part_count,▁▁▁▁▁▁▁▁▁▅▅▁▁▁▁▁▅▅▅▅█▅▅▅█▅██▅▅▅▅▅▅▅▅▅▅▅▅
step_not_res_count,▁▇▇█▂█▃▃█▅▇█▇▃▂▁▃▁▅▆▂▆▅▂▅▅▁▁▇▆▅▆▆▁▁▁▁▁▇▁
step_penalties,█▅▆▆▆▃▄▇▁▆▃▇▆▆████▇▆█▇███▇██▆█▇█████████
step_profit,▁▂▁▁▅▁▂▂▃▁▁▁▁█▃▆▁▇▁▁▁▁▂▂▇▂▆▂▁▁▃▁▁▄▂▃▁▃▁▃
step_res_count,▁▁▂▁▇▁▆▆▁▃▁▁▂▆▇█▅▇▃▂▅▂▃▆▂▃▆▆▁▂▃▂▂▇▇▇▇▇▁▇

0,1
global_step,11141.0
step_activ_count,5.0
step_activ_ratio,1.0
step_lost_count,0.0
step_not_activ_count,0.0
step_not_part_count,1.0
step_not_res_count,0.0
step_penalties,0.0
step_profit,1037.41
step_res_count,5.0


Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Training/runs/exb0vmi2 ... done.
training finished with : CompletedProcess(args='wandb sync wandb/latest-run', returncode=0)


## Single Algorithm Evaluation

In [13]:
# Evaluate the model produced by the training cell; `return_code` holds the
# result object of the `wandb sync` call made inside evaluate_algorithm.
return_code = evaluate_algorithm(ALGORITHM, model)
print(f"evaluation finished with : {return_code}")

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step_activ_count,▁▁▁█▁▁▂▇████████████▇▅▁▁▁▂▁▁▁█▁▁▁▇██████
step_activ_ratio,▁▁▁█▁▁████████████████▁▁▁█▁▁▁█▁▁▁███████
step_lost_count,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_not_activ_count,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_not_part_count,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_not_res_count,███▁██▇▂▁▁▁▁▁▁▁▁▁▁▁▁▂▄███▇███▁███▂▁▁▁▁▁▁
step_penalties,▆▆██▇▆▇███████████████▇███▄▅▇█▅▁▄███████
step_profit,▁▁▂▃▂▁▂▄▂▃▃▃▂▂▂▂▂▃▄▅▇▆▁▄▃▄▁▂▄▆▃▁▁█▇▇▄▇▅▄
step_res_count,▁▁▁█▁▁▂▇████████████▇▅▁▁▁▂▁▁▁█▁▁▁▇██████

0,1
global_step,69.0
step_activ_count,5.0
step_activ_ratio,1.0
step_lost_count,0.0
step_not_activ_count,0.0
step_not_part_count,1.0
step_not_res_count,0.0
step_penalties,0.0
step_profit,505.76
step_res_count,5.0


Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Evaluation/runs/38h8st8l ... done.
evaluation finished with : CompletedProcess(args='wandb sync wandb/latest-run', returncode=0)
