# Run Agent in Environment with different Algorithms

In [1]:
# Basics
import os
import warnings

# Data 
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
import numpy as np

# Logging
import logging
import wandb
from wandb.integration.sb3 import WandbCallback
from stable_baselines3.common.monitor import Monitor

# Algorithms 
from stable_baselines3 import A2C
from stable_baselines3 import SAC
from stable_baselines3 import DDPG
from stable_baselines3 import PPO
from stable_baselines3 import TD3
from sb3_contrib import RecurrentPPO
from sb3_contrib import TQC
from sb3_contrib import TRPO
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
import torch.nn as nn
from rl_zoo3 import linear_schedule

from gym import make
from gym.envs.registration import register
from gym.wrappers import RecordEpisodeStatistics
from stable_baselines3.common.env_checker import check_env

import torch
import subprocess
# Silence pandas' SettingWithCopyWarning for the rest of the session so that
# it does not clutter the training/evaluation output.
# NOTE(review): the warning class is imported from `pandas.core.common`, which
# is a private module path - confirm it still exists in the pinned pandas
# version (newer releases expose it under `pandas.errors`).
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

## Register the Environment 

In [2]:
# Register one gym environment per data split. All three variants share the
# same entry point, config file, render mode and episode limit of one step;
# only the id, the `env_type` and the log level differ.
_ENV_VARIANTS = (
    # (id suffix, env_type, log_level)
    ("TRAIN", "training", "DEBUG"),
    ("EVAL", "eval", "DEBUG"),
    ("TEST", "test", "INFO"),
)

for _suffix, _env_type, _log_level in _ENV_VARIANTS:
    register(
        id=f"VPPBiddingEnv-{_suffix}-v1",
        entry_point="vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv",
        max_episode_steps=1,
        kwargs={
            "config_path": "vpp_config_4.json",
            "log_level": _log_level,   # "DEBUG", "INFO" or "WARNING"
            "env_type": _env_type,
            "render_mode": "human",    # "human", "fast_training" or None
        },
    )

## Test the environment

In [3]:
# Sanity-check the custom environment with Stable-Baselines3's `check_env`,
# which outputs additional warnings if needed.
# The TEST variant is instantiated with render_mode=None passed explicitly
# (the registration itself specifies "human").
env_to_check = make('VPPBiddingEnv-TEST-v1', render_mode=None)
check_env(env_to_check)

log_step: initial // slot: initial  log level = info
log_step: 0 slot: None logging_step: 0
log_step: 0 slot: None Bid Submission time (D-1) = 2020-07-02 05:00:00+00:00
log_step: 0 slot: None Gate Closure time (D-1) = 2020-07-02 06:00:00+00:00
log_step: 0 slot: None Historic Data Window: from 2020-07-01 05:00:00+00:00 to 2020-07-02 04:45:00+00:00 
log_step: 0 slot: None Forecast Data Window: from 2020-07-02 22:00:00+00:00 to 2020-07-03 21:45:00+00:00 
log_step: 0 slot: 0 Current Slot Time: (D) = 2020-07-02 22:00:00+00:00
log_step: 0 slot: 0 agents_bid_size = 66
log_step: 0 slot: 0 agents_bid_price = 2293.053
log_step: 0 slot: 0 settlement_price_DE : 16.67
log_step: 0 slot: 0 self.activation_results['slots_won'] = 
log_step: 0 slot: 0
slot won: 	-1 
slot won: 	None 
slot won: 	None 
slot won: 	None 
slot won: 	None 
slot won: 	None
log_step: 0 slot: 0      agents bid_size = 
log_step: 0 slot: 0
size: 	66 
size: 	103 
size: 	28 
size: 	72 
size: 	82 
size: 	87
log_step: 0 slot: 0 self.ac

# Stable Baselines

### Offline training with subsequent log sync

In [4]:
# W&B credentials must come from outside the notebook (shell environment,
# `wandb login`, a secrets manager) - never commit an API key to source.
# NOTE(review): a key used to be hardcoded in this cell; treat it as leaked
# and revoke it in the W&B account settings.
if not os.environ.get("WANDB_API_KEY"):
    warnings.warn(
        "WANDB_API_KEY is not set; `wandb sync` will only succeed if "
        "credentials were stored beforehand (e.g. via `wandb login`)."
    )

# Record runs locally only; they are uploaded afterwards with `wandb sync`.
os.environ["WANDB_MODE"] = "offline"

## Training Function

In [5]:
def train_algo(algo, seed=1):
    """Train one algorithm on the training environment and sync the run to W&B.

    Parameters
    ----------
    algo : str
        Key into the module-level ``ALGOS`` and ``HYPERPARAMS`` dicts
        (e.g. ``"SAC"`` or ``"R_PPO"``). ``"R_PPO"`` selects the LSTM policy,
        every other name the plain multi-input policy.
    seed : int, optional
        Seed forwarded to the model constructor. Defaults to 1, the value
        that used to be hard-coded.

    Returns
    -------
    tuple
        ``(completed_process, model)`` where ``completed_process`` is the
        ``subprocess.CompletedProcess`` of the ``wandb sync`` call (not a
        bare integer) and ``model`` is the trained model.

    Raises
    ------
    KeyError
        If ``algo`` is missing from ``ALGOS`` or ``HYPERPARAMS``. Raised
        before the environment or the W&B run is created.
    """
    # Resolve everything that depends on `algo` first, so that a typo fails
    # fast instead of leaving a dangling W&B run behind.
    algo_cls = ALGOS[algo]
    model_params = HYPERPARAMS[algo]
    policy = 'MultiInputLstmPolicy' if algo == "R_PPO" else 'MultiInputPolicy'

    env = make('VPPBiddingEnv-TRAIN-v1', render_mode="human")
    env = Monitor(env)
    env = RecordEpisodeStatistics(env)  # record stats such as returns

    wandb.init(
        sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
        project="RL-VPP-Training",
        monitor_gym=True,       # automatically upload gym environments' videos
        save_code=True,
        entity="jlu237",
        tags=[algo] + EXPERIMENT_TAGS,
        job_type="training"
    )

    # Always close the W&B run, even when model construction or training
    # raises; otherwise the open run would leak into the next `wandb.init`.
    finished_cleanly = False
    try:
        model = algo_cls(policy, env, verbose=0, seed=seed, **model_params)
        model.learn(total_timesteps=EXPERIMENT_TIMESTEPS,
                    log_interval=1,
                    progress_bar=True,
                    callback=WandbCallback(
                        gradient_save_freq=1,
                        verbose=0))
        finished_cleanly = True
    finally:
        # A non-zero exit code marks the run as failed in W&B.
        wandb.finish(exit_code=None if finished_cleanly else 1)

    # The run was recorded offline; upload it now.
    return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)

    return return_code, model


## Evaluate Function

In [6]:
def evaluate_algo(algo, model, episodes=140):
    """Roll out a trained model on the evaluation environment and sync to W&B.

    Parameters
    ----------
    algo : str
        Algorithm name; used as a W&B tag and to decide whether the model is
        recurrent (``"R_PPO"``) and therefore needs LSTM state handling.
    model
        Trained model exposing ``predict``.
    episodes : int, optional
        Number of evaluation episodes. Defaults to 140, the value that used
        to be hard-coded.

    Returns
    -------
    subprocess.CompletedProcess
        Result of the ``wandb sync`` call.

    Notes
    -----
    Each episode runs until the environment reports ``done``. With the
    registered ``max_episode_steps=1`` this is exactly one step per episode,
    matching the previous fixed single-step loop.
    The code relies on the classic gym API (``reset()`` returns only the
    observation, ``step()`` returns a 4-tuple), as the original did.
    """
    eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode="human")
    eval_env = Monitor(eval_env)
    eval_env = RecordEpisodeStatistics(eval_env)  # record stats such as returns

    wandb.init(
        sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
        project="RL-VPP-Evaluation",
        save_code=True,
        entity="jlu237",
        tags=[algo] + EXPERIMENT_TAGS,
        job_type="eval",
    )

    is_recurrent = algo == "R_PPO"
    num_envs = 1
    finished_cleanly = False
    try:
        for _ in range(episodes):
            observation = eval_env.reset()
            # Fresh LSTM state per episode; the episode-start flag tells the
            # recurrent policy to reset its hidden state.
            lstm_states = None
            episode_starts = np.ones((num_envs,), dtype=bool)
            done = False
            while not done:
                if is_recurrent:
                    action, lstm_states = model.predict(
                        observation,
                        state=lstm_states,
                        episode_start=episode_starts,
                        deterministic=True,
                    )
                else:
                    action, _ = model.predict(observation, deterministic=True)
                observation, _reward, done, _info = eval_env.step(action)
                episode_starts = np.full((num_envs,), bool(done), dtype=bool)
        finished_cleanly = True
    finally:
        # Release the environment and close the W&B run even if a rollout
        # raised, so neither leaks into subsequent cells.
        try:
            eval_env.close()
        finally:
            # A non-zero exit code marks the run as failed in W&B.
            wandb.finish(exit_code=None if finished_cleanly else 1)

    # The run was recorded offline; upload it now.
    return_code = subprocess.run("wandb sync wandb/latest-run", shell=True)
    return return_code


In [7]:
# Per-algorithm keyword arguments that are unpacked into the model
# constructor. An empty dict means no overrides are passed for that algorithm.
# Activation functions are given as torch.nn classes (e.g. nn.Tanh, nn.ReLU,
# nn.ELU, nn.LeakyReLU).

# Gaussian action noise for SAC: zero mean and the same sigma for each of the
# 12 entries.
_SAC_ACTION_NOISE = NormalActionNoise(
    mean=[0.] * 12,
    sigma=[0.15087106] * 12,
)

_SAC_PARAMS = {
    "learning_rate": 0.010591885782399316,
    "batch_size": 100,
    "buffer_size": 100000,
    "learning_starts": 10,
    "train_freq": 16,
    "gradient_steps": 2,
    "ent_coef": 0.05,
    "tau": 0.005,
    "gamma": 0.98,
    "policy_kwargs": {
        "net_arch": [256, 256],
        "activation_fn": nn.Tanh,
        "log_std_init": -3.4586660996768894,
        "use_sde": False,
    },
    "sde_sample_freq": 0,
    "target_entropy": -10,
    "action_noise": _SAC_ACTION_NOISE,
}

_R_PPO_PARAMS = {
    "learning_rate": linear_schedule(0.00047746791329352097),
    "n_steps": 8,
    "batch_size": 8,
    "n_epochs": 10,
    "gamma": 0.98,
    "gae_lambda": 0.9,
    "clip_range": 0.4,
    "normalize_advantage": True,
    "ent_coef": 0.03476154346691902,
    "vf_coef": 0.6589086411755256,
    "max_grad_norm": 5,
    "target_kl": 0.1,
    "policy_kwargs": {
        # Separate 64-64 networks for the policy (pi) and value (vf) heads.
        "net_arch": [{"pi": [64, 64], "vf": [64, 64]}],
        "full_std": True,
        "activation_fn": nn.Tanh,
        "ortho_init": True,
        "log_std_init": -2.303063874869516,
    },
    "sde_sample_freq": -1,
}

HYPERPARAMS = {
    "A2C": {},
    "DDPG": {},
    "SAC": _SAC_PARAMS,
    "PPO": {},
    "TD3": {},
    # "DQN" is currently not enabled.
    # SB3 Contrib
    "TQC": {},
    "TRPO": {},
    "R_PPO": _R_PPO_PARAMS,
}

In [8]:
# Lookup table from the algorithm names used throughout this notebook to the
# model classes that implement them ("DQN" is currently not enabled).
ALGOS = dict(
    # Stable-Baselines3
    A2C=A2C,
    DDPG=DDPG,
    PPO=PPO,
    SAC=SAC,
    TD3=TD3,
    # SB3 Contrib
    TQC=TQC,
    TRPO=TRPO,
    R_PPO=RecurrentPPO,
)

In [9]:
# Extra W&B tags attached to every run, in addition to the algorithm name.
EXPERIMENT_TAGS = ["new_metrics", "new_pipeline"]

# Training budget: one cycle is 557 timesteps and 5 cycles are trained,
# i.e. 2785 timesteps in total (10 cycles would be 5570).
_TIMESTEPS_PER_CYCLE = 557
_TRAINING_CYCLES = 5
EXPERIMENT_TIMESTEPS = _TIMESTEPS_PER_CYCLE * _TRAINING_CYCLES

## Train all Algorithms

## Single Algorithm 

In [10]:
# Algorithm to run in this cell; must be a key of ALGOS / HYPERPARAMS.
ALGORITHM = "R_PPO"

try:
    print(f"now training {ALGORITHM}")
    return_code, model = train_algo(ALGORITHM)
    print(f"training finished with : {return_code}")

    return_code = evaluate_algo(ALGORITHM, model)
    print(f"evaluation finished with : {return_code}")
except AssertionError as err:
    # Some hyperparameter combinations can produce NaN, which surfaces as an
    # AssertionError; report it and let the notebook continue.
    print(err)

now training R_PPO


Output()

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step_activ_count,▁▁▁▁▁▁▁▁▁▁▅▂▂▂▇▄▄▄▇▇▄█▇█▇██▂▇▄██▅███▅██▁
step_activ_ratio,▁▁▁▁▁▁▁▁▁▁▅▃▅▅▇▆▆▆▇▇▆▇▇▇▇▇▇▅▇▆▇▇▆█▇▇██▇▁
step_lost_count,██████▇█▅▅▅▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▂▂▁▁▂▁
step_not_activ_count,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_not_part_count,▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁██▁█
step_not_res_count,▁▁▁▁▁▁▂▁▅▅▁▅▇▅▂▅▅▄▂▂▅▁▂▁▂▁▁▇▂▅▁▁▄▁▁▁▄▁▁█
step_penalties,██████▅█▆▆█▅▃▁▇▅▇▇██▆██████▅████████▇██▆
step_profit,▁▁▁▁▁▁▁▁▁▁▂▁▁▂▂▃▁▁▁▃▁▅▂█▂▂▂▁▂▂▇▄▁▃▂▂▂▂▅▁
step_res_count,▁▁▁▁▁▁▁▁▁▁▅▂▂▂▇▄▄▄▇▇▄█▇█▇██▂▇▄██▅███▅██▁

0,1
global_step,2791.0
step_activ_count,5.0
step_activ_ratio,0.83333
step_lost_count,1.0
step_not_activ_count,0.0
step_not_part_count,0.0
step_not_res_count,0.0
step_penalties,0.0
step_profit,783.04
step_res_count,5.0


Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Training/runs/39lyp9dz ... done.
training finished with : CompletedProcess(args='wandb sync wandb/latest-run', returncode=0)


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step_activ_count,▁▁██▄▁▁▇██████████████▇███▁▁██▄▁▁▇██████
step_activ_ratio,▁▁███▁▁████▇███▇██▇███████▁▁███▁▁████▇██
step_lost_count,▁▁▁▁▁▁▁▁▁▁▁█▁▁▁█▁▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█▁▁
step_not_activ_count,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_not_part_count,███████████▁███▁██▁██████████████████▁██
step_not_res_count,██▁▁▅██▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁██▁▁▅██▂▁▁▁▁▁▁
step_penalties,▆▅██▇▆▇███████████████████▂▄██▆▁▆███████
step_profit,▁▁▃▃▂▁▂▄▂▃▃▃▂▂▂▃▂▃▄▆▇▆▄▅▅▅▁▁▆▆▄▁▄█▇▇▄█▅▄
step_res_count,▁▁██▄▁▁▇██████████████▇███▁▁██▄▁▁▇██████

0,1
global_step,139.0
step_activ_count,5.0
step_activ_ratio,1.0
step_lost_count,0.0
step_not_activ_count,0.0
step_not_part_count,1.0
step_not_res_count,0.0
step_penalties,0.0
step_profit,2956.64
step_res_count,5.0


Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Evaluation/runs/2al7ob4o ... done.
evaluation finished with : CompletedProcess(args='wandb sync wandb/latest-run', returncode=0)
