# Run Agent in Environment with different Algorithms

In [1]:
# Basics
import os
import warnings

# Data 
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
import numpy as np

# Logging
import logging
import wandb
from wandb.integration.sb3 import WandbCallback

# Algorithms 
from stable_baselines3 import A2C
from stable_baselines3 import SAC
from stable_baselines3 import DDPG
from stable_baselines3 import PPO
from stable_baselines3 import TD3
from sb3_contrib import RecurrentPPO
from sb3_contrib import TQC
from sb3_contrib import TRPO
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.env_checker import check_env

import torch
import torch.nn as nn
from rl_zoo3 import linear_schedule

from gym import make
from gym.wrappers import RecordEpisodeStatistics

import subprocess
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

from vpp_gym.vpp_gym.utils.register_env import register_env


## Seed Env

In [2]:
# Single seed shared by every random number generator seeded in this notebook.
SEED = 42

# Seed NumPy's global RNG too, so code that draws from np.random before a
# model is constructed is repeatable as well.
np.random.seed(SEED)
torch.manual_seed(SEED)
# manual_seed_all covers every visible GPU instead of only the current device;
# it is silently ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# Keep cuDNN's kernel autotuner off: it may pick different kernels per run.
torch.backends.cudnn.benchmark = False

## Register the Environment 

In [3]:
# File name of the JSON configuration handed to register_env.
CONFIG_NAME = "vpp_config_2.json"

# Reuse the notebook-wide seed for the environment registration.
# NOTE(review): presumably this registers the 'VPPBiddingEnv-*' ids used by
# the cells below - confirm in vpp_gym.utils.register_env.
register_env(seed=SEED, config=CONFIG_NAME)

## Test the environment

In [4]:
# Run Stable-Baselines3's environment checker on the custom test environment;
# it outputs additional warnings if the environment needs attention.
env_to_check = make('VPPBiddingEnv-TEST-v1', render_mode=None)
try:
    check_env(env_to_check)
finally:
    # Release the environment even when the checker raises.
    env_to_check.close()

log_step: 0 slot: None logging_step: 0


# Stable Baselines

### Train Offline and Sync the Logs Afterwards

In [5]:
# SECURITY: never store a W&B API key in the notebook. The key that used to be
# hard-coded in this cell must be treated as leaked and revoked in the W&B
# account settings. Supply the key from outside the notebook instead, e.g.
# `export WANDB_API_KEY=...` in the shell that starts Jupyter, or `wandb login`.
if not os.environ.get("WANDB_API_KEY"):
    warnings.warn(
        "WANDB_API_KEY is not set; runs are still recorded offline, but "
        "`wandb sync` needs `wandb login` or an exported key to upload them."
    )

# Record runs locally while training; they are uploaded later via `wandb sync`.
os.environ["WANDB_MODE"] = "offline"

## Training Function

In [6]:
def train_algorithm(algorithm):
    """Train one algorithm on the tuning environment and log the run to W&B.

    Args:
        algorithm: Key into ``ALGORITHMS`` and ``HYPERPARAMS`` (e.g. ``"TRPO"``).
            ``"R_PPO"`` selects ``'MultiInputLstmPolicy'``; every other key
            uses ``'MultiInputPolicy'``.

    Returns:
        tuple: ``(return_code, model)``. ``return_code`` is the
        ``subprocess.CompletedProcess`` of the ``wandb sync`` call (its
        ``returncode`` attribute holds the exit status); ``model`` is the
        trained model instance.

    Raises:
        KeyError: If ``algorithm`` is missing from ``HYPERPARAMS`` or
            ``ALGORITHMS``.
    """
    # Resolve both lookups first so an unknown key fails before an environment
    # or a W&B run has been created.
    model_params = HYPERPARAMS[algorithm]
    algorithm_cls = ALGORITHMS[algorithm]

    if algorithm == "R_PPO":
        policy = 'MultiInputLstmPolicy'
    else:
        policy = 'MultiInputPolicy'

    #env = make('VPPBiddingEnv-TRAIN-v1', render_mode="human")
    env = make('VPPBiddingEnv-TUNING-v1')
    env = Monitor(env)
    env = RecordEpisodeStatistics(env)

    wandb.init(
        sync_tensorboard=True,
        project="RL-VPP-Training",
        save_code=True,
        entity="jlu237",
        tags=[algorithm] + EXPERIMENT_TAGS,
        job_type="training"
    )

    # Assume failure until training has completed, so an exception (including
    # a keyboard interrupt) still closes the run and marks it as failed.
    exit_code = 1
    try:
        model = algorithm_cls(policy, env, verbose=0, seed=SEED, **model_params)
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar=True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
        exit_code = 0
    finally:
        wandb.finish(exit_code=exit_code)

    # Upload the offline run. An argument list avoids spawning a shell for a
    # fixed command line.
    return_code = subprocess.run(["wandb", "sync", "wandb/latest-run"])

    return return_code, model


## Evaluation Function

In [7]:
def evaluate_algorithm(algorithm, model, episodes=140):
    """Run a trained model on the evaluation environment and log the run to W&B.

    Each episode resets the environment and takes exactly one deterministic
    step with the model's predicted action.

    Args:
        algorithm: Algorithm key used as a W&B tag. ``"R_PPO"`` makes the
            prediction pass LSTM state and episode-start flags.
        model: Trained model exposing ``predict``.
        episodes: Number of evaluation episodes; defaults to 140, the value
            that was previously hard-coded.

    Returns:
        subprocess.CompletedProcess: Result of the ``wandb sync`` call; its
        ``returncode`` attribute holds the exit status.
    """
    eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode="human")
    eval_env = Monitor(eval_env)
    eval_env = RecordEpisodeStatistics(eval_env)

    # Assume failure until every episode has run, so the W&B run is marked as
    # failed when an exception interrupts the evaluation.
    exit_code = 1
    try:
        wandb.init(
            sync_tensorboard=True,
            project="RL-VPP-Evaluation",
            save_code=True,
            entity="jlu237",
            tags=[algorithm] + EXPERIMENT_TAGS,
            job_type="eval",
        )

        # NOTE(review): only one step is taken per episode and the `done`
        # flag returned by step() is not consulted - confirm that episodes of
        # this environment are single-step.
        for i_episode in range(episodes):
            observation = eval_env.reset()
            if algorithm == "R_PPO":
                # Fresh LSTM state for every episode; the episode-start flag
                # tells the recurrent policy to reset its hidden state.
                num_envs = 1
                episode_starts = np.ones((num_envs,), dtype=bool)
                action, _lstm_states = model.predict(
                    observation,
                    state=None,
                    episode_start=episode_starts,
                    deterministic=True,
                )
            else:
                action, _states = model.predict(observation, deterministic=True)
            eval_env.step(action)
        exit_code = 0
    finally:
        # Same teardown order as before (environment first, then the W&B
        # run), but now also executed when an exception is raised.
        try:
            eval_env.close()
        finally:
            wandb.finish(exit_code=exit_code)

    # Upload the offline run. An argument list avoids spawning a shell for a
    # fixed command line.
    return_code = subprocess.run(["wandb", "sync", "wandb/latest-run"])
    return return_code


In [8]:
# Keyword arguments per algorithm key, unpacked into the algorithm constructor
# by train_algorithm. An empty dict passes no extra arguments.
#
# Lookup previously used to pick an activation function by name:
# activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU, "elu": nn.ELU, "leaky_relu": nn.LeakyReLU}[activation_fn]
HYPERPARAMS = dict(
    A2C={},
    DDPG={},
    SAC=dict(
        learning_rate=0.010591885782399316,
        batch_size=100,
        buffer_size=100000,
        learning_starts=10,
        train_freq=16,
        gradient_steps=2,
        ent_coef=0.05,
        tau=0.005,
        gamma=0.98,
        policy_kwargs=dict(
            net_arch=[256, 256],
            activation_fn=nn.Tanh,
            log_std_init=-3.4586660996768894,
            use_sde=False,
        ),
        sde_sample_freq=0,
        target_entropy=-10,
        # Gaussian action noise with zero mean and one shared sigma per entry.
        action_noise=NormalActionNoise(
            mean=[0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.],
            sigma=[0.15087106,0.15087106,0.15087106,0.15087106,0.15087106,0.15087106,0.15087106,0.15087106,0.15087106,0.15087106,0.15087106,0.15087106],
        ),
    ),
    PPO={},
    TD3={},
    # "DQN": {},  (disabled)
    # SB3 Contrib
    TQC={},
    TRPO=dict(
        learning_rate=linear_schedule(0.04052181863791221),
        n_steps=2,
        batch_size=2,
        gamma=0.995,
        cg_max_steps=25,
        cg_damping=0.05,
        line_search_shrinking_factor=0.6,
        line_search_max_iter=5,
        n_critic_updates=20,
        gae_lambda=0.8,
        normalize_advantage=False,
        use_sde=False,
        target_kl=0.1,
        policy_kwargs=dict(
            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
            ortho_init=True,
            activation_fn=nn.ELU,
        ),
    ),
    R_PPO=dict(
        learning_rate=0.0018671792141252545,
        n_steps=10,
        batch_size=10,
        n_epochs=5,
        gamma=0.9999,
        gae_lambda=0.9,
        clip_range=0.2,
        normalize_advantage=True,
        ent_coef=3.180891783226621e-06,
        vf_coef=0.8521929103610445,
        max_grad_norm=0.9,
        target_kl=0.001,
        policy_kwargs=dict(
            net_arch=[dict(pi=[64, 64], vf=[64, 64])],
            full_std=False,
            activation_fn=nn.ELU,
            ortho_init=False,
        ),
    ),
)

In [9]:
# Alternative R_PPO hyperparameter set. It is not part of HYPERPARAMS; binding
# it to a name lets it be swapped in with
# `HYPERPARAMS["R_PPO"] = R_PPO_ALT_HYPERPARAMS` instead of copy-pasting.
R_PPO_ALT_HYPERPARAMS = {
    'learning_rate': linear_schedule(0.00047746791329352097),
    'n_steps': 8,
    'batch_size': 8,
    'n_epochs': 10,
    'gamma': 0.98,
    'gae_lambda': 0.9,
    'clip_range': 0.4,
    'normalize_advantage': True,
    'ent_coef': 0.03476154346691902,
    'vf_coef': 0.6589086411755256,
    'max_grad_norm': 5,
    'target_kl': 0.1,
    'policy_kwargs': {
        'net_arch': [{'pi': [64, 64], 'vf': [64, 64]}],
        'full_std': True,
        'activation_fn': nn.Tanh,
        'ortho_init': True,
        'log_std_init': -2.303063874869516},
    'sde_sample_freq': -1}

# Keep the cell output: show the dict as the cell's last expression.
R_PPO_ALT_HYPERPARAMS

{'learning_rate': <function rl_zoo3.utils.linear_schedule.<locals>.func(progress_remaining: float) -> float>,
 'n_steps': 8,
 'batch_size': 8,
 'n_epochs': 10,
 'gamma': 0.98,
 'gae_lambda': 0.9,
 'clip_range': 0.4,
 'normalize_advantage': True,
 'ent_coef': 0.03476154346691902,
 'vf_coef': 0.6589086411755256,
 'max_grad_norm': 5,
 'target_kl': 0.1,
 'policy_kwargs': {'net_arch': [{'pi': [64, 64], 'vf': [64, 64]}],
  'full_std': True,
  'activation_fn': torch.nn.modules.activation.Tanh,
  'ortho_init': True,
  'log_std_init': -2.303063874869516},
 'sde_sample_freq': -1}

In [10]:
# Maps the algorithm key used throughout this notebook to the class that
# train_algorithm instantiates.
ALGORITHMS = dict(
    A2C=A2C,
    DDPG=DDPG,
    PPO=PPO,
    SAC=SAC,
    TD3=TD3,
    # DQN=DQN,  (disabled)
    # SB3 Contrib
    TQC=TQC,
    TRPO=TRPO,
    R_PPO=RecurrentPPO,
)

In [11]:
# Free-form labels attached to every W&B run of this experiment.
EXPERIMENT_TAGS = [
    "^0.4",
    "only_asset_data_FCR_total",
    "config2",
    "rew=1 if not part",
    "H1,2,3 + 0.5 FCR",
    "less_observations",
    "AS: MultiDiscrete",
]

# Training budget in environment steps. Going by the earlier values
# (2785 = 557 * 5 cycles of summer 2022 - summer 2022, 5570 = 10 cycles),
# 11140 corresponds to 20 cycles of 557 steps.
EXPERIMENT_TIMESTEPS = 20 * 557


## Train all Algorithms

## Single Algorithm Training

In [12]:
# Key of the algorithm to train; it is looked up in ALGORITHMS and HYPERPARAMS.
ALGORITHM = "TRPO"

# MultiDiscrete 2: A2C, PPO, R_PPO, TRPO, M_PPO
# MultiDiscrete 1: did not work with A2C and R_PPO, but possibly with PPO, TRPO?

print(f"now training {ALGORITHM}")
return_code, model = train_algorithm(ALGORITHM)
print(f"training finished with : {return_code}")

now training TRPO


Output()

## Single Algorithm Evaluation

In [None]:
# Evaluate the model produced by the training cell and report the result of
# the subsequent `wandb sync` call.
return_code = evaluate_algorithm(ALGORITHM, model)
print(f"evaluation finished with : {return_code}")