# Run the Agent in the Environment with Different Algorithms

In [None]:
# Basics
import os
import warnings

# Data 
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
import numpy as np

# Logging
import logging
import wandb
from wandb.integration.sb3 import WandbCallback

from gym import make

## Register the Environment 

In [None]:
from gym.envs.registration import register

# One row per registered variant: (environment id, env_type, log_level).
# log_level may be "DEBUG", "INFO" or "WARNING".
for env_id, env_type, log_level in (
    ("VPPBiddingEnv-TRAIN-v1", "training", "DEBUG"),
    ("VPPBiddingEnv-EVAL-v1", "eval", "DEBUG"),
    ("VPPBiddingEnv-TEST-v1", "test", "INFO"),
):
    register(
        id=env_id,
        entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
        max_episode_steps=1,
        kwargs={
            'config_path': "vpp_config_4.json",
            'log_level': log_level,
            'env_type': env_type,
            'render_mode': "human",  # "human", "fast_training" or None
        },
    )

## Test the environment

In [None]:
from stable_baselines3.common.env_checker import check_env
# Build the registered TEST environment (passing render_mode=None instead of the
# registered "human") and run Stable-Baselines3's environment checker on it.
# The checker inspects the custom environment and outputs additional warnings if needed.
env_to_check = make('VPPBiddingEnv-TEST-v1', render_mode=None)
check_env(env_to_check)

# Stable Baselines

In [None]:
# Tags attached to every W&B run started in this notebook (each run prepends
# its algorithm name to this list).
experiment_tags = [
    "no_render_in_reset",
    "distance_reward_slots",
    "not_participated_reward",
    "new_slot_viz",
    "weighted_step_reward_out_of_loop",
]

# Number of training timesteps used by every algorithm below.
experiment_timesteps = 2785

### Offline training (logs are synced to W&B afterwards)

In [None]:
import logging
import os

# SECURITY: the W&B API key must never be hard-coded in the notebook; anything
# committed here is leaked together with the repository. Supply the key from
# outside the notebook (export WANDB_API_KEY, or run `wandb login`) and revoke
# any key that was previously committed.
if not os.environ.get("WANDB_API_KEY"):
    logging.getLogger(__name__).warning(
        "WANDB_API_KEY is not set; export it or run `wandb login` "
        "before syncing runs with `wandb sync`."
    )

# Run W&B in offline mode; the runs are uploaded later with `wandb sync`.
os.environ["WANDB_MODE"] = "offline"

## SAC

### Train

In [None]:
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped in SB3's Monitor and then in gym's
# RecordEpisodeStatistics (records stats such as returns).
env = RecordEpisodeStatistics(
    Monitor(make('VPPBiddingEnv-TRAIN-v1', render_mode="human"))
)

# Run configuration; it is also handed to wandb.init below.
config = dict(
    policy='MultiInputPolicy',
    total_timesteps=experiment_timesteps,  # original note: 557
)

wandb.init(
    project="RL-VPP-Training",
    entity="jlu237",
    job_type="training",
    tags=["SAC", *experiment_tags],
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
)

model = SAC(config['policy'], env, verbose=0)

# The callback is created after wandb.init, right before training starts.
wandb_callback = WandbCallback(gradient_save_freq=1, verbose=0)
model.learn(
    total_timesteps=config['total_timesteps'],
    log_interval=1,
    callback=wandb_callback,
)
wandb.finish()


In [None]:
!wandb sync wandb/latest-run

### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment, wrapped to record stats such as returns.
eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode="human")
eval_env = RecordEpisodeStatistics(eval_env)

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    save_code=True,
    entity="jlu237",
    tags=["SAC"] + experiment_tags,
    job_type="eval"
)

# One row per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : %s", observation)
        # Evaluate with deterministic actions, as the TQC, TRPO and
        # RecurrentPPO evaluation cells do, so the logged rewards do not
        # include action-sampling noise.
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = eval_env.step(action)
        if done:
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()

# NOTE(review): this reads "total_reward" from the info dict of the last step;
# presumably the environment accumulates it over all episodes - confirm.
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()

In [None]:
!wandb sync wandb/latest-run

## A2C 

### Train

In [None]:
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped in SB3's Monitor and then in gym's
# RecordEpisodeStatistics (records stats such as returns).
env = RecordEpisodeStatistics(
    Monitor(make('VPPBiddingEnv-TRAIN-v1', render_mode="human"))
)

# Run configuration; it is also handed to wandb.init below.
config = dict(
    policy='MultiInputPolicy',
    total_timesteps=experiment_timesteps,
)

wandb.init(
    project="RL-VPP-Training",
    entity="jlu237",
    job_type="training",
    tags=["A2C", *experiment_tags],
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
)

model = A2C(config['policy'], env, verbose=0)

# The callback is created after wandb.init, right before training starts.
wandb_callback = WandbCallback(gradient_save_freq=1, verbose=0)
model.learn(
    total_timesteps=config['total_timesteps'],
    log_interval=1,
    callback=wandb_callback,
)
wandb.finish()


In [None]:
!wandb sync wandb/latest-run

### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment, wrapped to record stats such as returns.
eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode="human")
eval_env = RecordEpisodeStatistics(eval_env)

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    save_code=True,
    entity="jlu237",
    tags=["A2C"] + experiment_tags,
    job_type="eval",
)

# One row per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : %s", observation)
        # Evaluate with deterministic actions, as the TQC, TRPO and
        # RecurrentPPO evaluation cells do, so the logged rewards do not
        # include action-sampling noise.
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = eval_env.step(action)
        if done:
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()

# NOTE(review): this reads "total_reward" from the info dict of the last step;
# presumably the environment accumulates it over all episodes - confirm.
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()

In [None]:
!wandb sync wandb/latest-run

## Contrib packages: TQC

### Train

In [None]:
from sb3_contrib import TQC
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped in SB3's Monitor and then in gym's
# RecordEpisodeStatistics (records stats such as returns).
env = RecordEpisodeStatistics(
    Monitor(make('VPPBiddingEnv-TRAIN-v1', render_mode="human"))
)

# Run configuration; it is also handed to wandb.init below.
config = dict(
    policy='MultiInputPolicy',
    total_timesteps=experiment_timesteps,  # original note: 557
)

wandb.init(
    project="RL-VPP-Training",
    entity="jlu237",
    job_type="training",
    tags=["TQC", *experiment_tags],
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
)

# Critic settings forwarded to the TQC policy.
policy_kwargs = {"n_critics": 2, "n_quantiles": 25}
model = TQC(
    config['policy'],
    env,
    top_quantiles_to_drop_per_net=2,
    verbose=0,
    policy_kwargs=policy_kwargs,
)

# The callback is created after wandb.init, right before training starts.
wandb_callback = WandbCallback(gradient_save_freq=1, verbose=0)
model.learn(
    total_timesteps=config['total_timesteps'],
    log_interval=1,
    callback=wandb_callback,
)
wandb.finish()


In [None]:
!wandb sync wandb/latest-run

### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment, wrapped to record stats such as returns.
eval_env = RecordEpisodeStatistics(
    make('VPPBiddingEnv-EVAL-v1', render_mode="human")
)

wandb.init(
    project="RL-VPP-Evaluation",
    entity="jlu237",
    job_type="eval",
    tags=["TQC", *experiment_tags],
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    save_code=True,
)

# One row per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : %s", observation)
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = eval_env.step(action)
        if not done:
            continue
        # Episode finished: record the bid submission time and the reward.
        tbl.add_data(i_episode, info["bid_submission_time"])
        wandb.log({"episode_reward": reward, "episode": i_episode})
        break
wandb.log({"bid_submission_time": tbl})

eval_env.close()

# Average based on the "total_reward" entry of the last step's info dict.
mean_run_reward = info["total_reward"] / episodes
wandb.run.summary["mean_run_reward"] = mean_run_reward
print(f"Mean Run Reward: {mean_run_reward!s}")
wandb.finish()

In [None]:
!wandb sync wandb/latest-run

## Contrib packages: TRPO

### Train

In [None]:
from sb3_contrib import TRPO
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped in SB3's Monitor and then in gym's
# RecordEpisodeStatistics (records stats such as returns).
env = RecordEpisodeStatistics(
    Monitor(make('VPPBiddingEnv-TRAIN-v1', render_mode="human"))
)

# Run configuration; it is also handed to wandb.init below.
config = dict(
    policy='MultiInputPolicy',
    total_timesteps=experiment_timesteps,  # original note: 557
)

wandb.init(
    project="RL-VPP-Training",
    entity="jlu237",
    job_type="training",
    tags=["TRPO", *experiment_tags],
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
)

model = TRPO(config['policy'], env, verbose=0)

# The callback is created after wandb.init, right before training starts.
wandb_callback = WandbCallback(gradient_save_freq=1, verbose=0)
model.learn(
    total_timesteps=config['total_timesteps'],
    log_interval=1,
    callback=wandb_callback,
)
wandb.finish()


In [None]:
%%time 

!wandb sync wandb/latest-run

### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment, wrapped to record stats such as returns.
eval_env = RecordEpisodeStatistics(
    make('VPPBiddingEnv-EVAL-v1', render_mode="human")
)

wandb.init(
    project="RL-VPP-Evaluation",
    entity="jlu237",
    job_type="eval",
    tags=["TRPO", *experiment_tags],
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    save_code=True,
)

# One row per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : %s", observation)
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = eval_env.step(action)
        if not done:
            continue
        # Episode finished: record the bid submission time and the reward.
        tbl.add_data(i_episode, info["bid_submission_time"])
        wandb.log({"episode_reward": reward, "episode": i_episode})
        break
wandb.log({"bid_submission_time": tbl})

eval_env.close()

# Average based on the "total_reward" entry of the last step's info dict.
mean_run_reward = info["total_reward"] / episodes
wandb.run.summary["mean_run_reward"] = mean_run_reward
print(f"Mean Run Reward: {mean_run_reward!s}")
wandb.finish()

In [None]:
!wandb sync wandb/latest-run

## Contrib packages: RecurrentPPO 

### Train

In [None]:
from sb3_contrib import RecurrentPPO
import numpy as np
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped in SB3's Monitor and then in gym's
# RecordEpisodeStatistics (records stats such as returns).
env = RecordEpisodeStatistics(
    Monitor(make('VPPBiddingEnv-TRAIN-v1', render_mode="human"))
)

# Run configuration; it is also handed to wandb.init below.
config = dict(
    policy='MultiInputLstmPolicy',
    total_timesteps=experiment_timesteps,  # original note: 557
)

wandb.init(
    project="RL-VPP-Training",
    entity="jlu237",
    job_type="training",
    tags=["RecurrentPPO", *experiment_tags],
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
)

model = RecurrentPPO(config['policy'], env, verbose=0)

# The callback is created after wandb.init, right before training starts.
wandb_callback = WandbCallback(gradient_save_freq=1, verbose=0)
model.learn(
    total_timesteps=config['total_timesteps'],
    log_interval=1,
    callback=wandb_callback,
)
wandb.finish()


In [None]:
!wandb sync wandb/latest-run

### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment, wrapped to record stats such as returns.
eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode="human")
eval_env = RecordEpisodeStatistics(eval_env)

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    save_code=True,
    entity="jlu237",
    tags=["RecurrentPPO"] + experiment_tags,
    job_type="eval"
)

# One row per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140
num_envs = 1  # a single, non-vectorized evaluation environment

for i_episode in range(episodes):
    observation = eval_env.reset()
    # Start every episode without a recurrent state.
    lstm_states = None
    # Episode start signals are used to reset the lstm states
    episode_starts = np.ones((num_envs,), dtype=bool)
    for t in range(1):
        eval_env.render()
        logging.debug("observation : %s", observation)
        action, lstm_states = model.predict(
            observation,
            state=lstm_states,
            episode_start=episode_starts,
            deterministic=True,
        )
        # Unpack into `done`, the name tested below. The original unpacked into
        # `dones`, so the check used a stale `done` from a previously run cell
        # (or raised NameError in a fresh kernel).
        observation, reward, done, info = eval_env.step(action)
        # Keep the start mask a (num_envs,) bool array rather than a scalar.
        episode_starts = np.full((num_envs,), bool(done), dtype=bool)
        if done:
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()

# NOTE(review): this reads "total_reward" from the info dict of the last step;
# presumably the environment accumulates it over all episodes - confirm.
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()

In [None]:
!wandb sync wandb/latest-run

## TD3 

### Train

In [None]:
from stable_baselines3 import TD3
from stable_baselines3.common.noise import OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped in SB3's Monitor and then in gym's
# RecordEpisodeStatistics (records stats such as returns).
env = RecordEpisodeStatistics(
    Monitor(make('VPPBiddingEnv-TRAIN-v1', render_mode="human"))
)

# Run configuration; it is also handed to wandb.init below.
config = dict(
    policy='MultiInputPolicy',
    total_timesteps=experiment_timesteps,  # original note: 1 iter = 557
)

# Action noise for TD3, sized by the last dimension of the action space.
# (A NormalActionNoise with sigma 0.1 was the earlier, now disabled, choice.)
n_actions = env.action_space.shape[-1]
action_noise = OrnsteinUhlenbeckActionNoise(
    mean=np.zeros(n_actions),
    sigma=0.34709686 * np.ones(n_actions),
)

wandb.init(
    project="RL-VPP-Training",
    entity="jlu237",
    job_type="training",
    tags=["TD3", *experiment_tags],
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
)

# NOTE(review): the tensorboard directory is "runs/ddpg" although this is the
# TD3 run - confirm whether that is intended.
td3_kwargs = dict(
    verbose=0,
    tensorboard_log="runs/ddpg",
    gamma=0.99,
    batch_size=200,
    buffer_size=1000000,
    learning_rate=0.2456,
    tau=0.001,
    action_noise=action_noise,
    policy_kwargs={'net_arch': [64, 64]},
)
model = TD3(config['policy'], env, **td3_kwargs)

# The callback is created after wandb.init, right before training starts.
wandb_callback = WandbCallback(gradient_save_freq=1, verbose=0)
model.learn(
    total_timesteps=config['total_timesteps'],
    log_interval=1,
    callback=wandb_callback,
)
wandb.finish()


In [None]:
!wandb sync wandb/latest-run

### Eval

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment, wrapped to record stats such as returns.
eval_env = RecordEpisodeStatistics(
    make('VPPBiddingEnv-EVAL-v1', render_mode="human")
)

# A `settings=wandb.Settings(start_method="thread")` argument was tried here
# and is currently disabled.
wandb.init(
    project="RL-VPP-Evaluation",
    entity="jlu237",
    job_type="eval",
    tags=["TD3", *experiment_tags],
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    save_code=True,
)

# One row per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : %s", observation)
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if not done:
            continue
        # Episode finished: record the bid submission time and the reward.
        tbl.add_data(i_episode, info["bid_submission_time"])
        wandb.log({"episode_reward": reward, "episode": i_episode})
        break
wandb.log({"bid_submission_time": tbl})

eval_env.close()

# Average based on the "total_reward" entry of the last step's info dict.
mean_run_reward = info["total_reward"] / episodes
wandb.run.summary["mean_run_reward"] = mean_run_reward
print(f"Mean Run Reward: {mean_run_reward!s}")
wandb.finish()

In [None]:
!wandb sync wandb/latest-run

## PPO

### Train

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped in SB3's Monitor and then in gym's
# RecordEpisodeStatistics (records stats such as returns).
env = RecordEpisodeStatistics(
    Monitor(make('VPPBiddingEnv-TRAIN-v1', render_mode="human"))
)

# Run configuration; it is also handed to wandb.init below.
config = dict(
    policy='MultiInputPolicy',
    total_timesteps=experiment_timesteps,  # original note: 557
)

wandb.init(
    project="RL-VPP-Training",
    entity="jlu237",
    job_type="training",
    tags=["PPO", *experiment_tags],
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
)

model = PPO(config['policy'], env, verbose=0)

# The callback is created after wandb.init, right before training starts.
wandb_callback = WandbCallback(gradient_save_freq=1, verbose=0)
model.learn(
    total_timesteps=config['total_timesteps'],
    log_interval=1,
    callback=wandb_callback,
)
wandb.finish()


In [None]:
!wandb sync wandb/latest-run

### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment, wrapped to record stats such as returns.
eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode="human")
eval_env = RecordEpisodeStatistics(eval_env)

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    save_code=True,
    entity="jlu237",
    tags=["PPO"] + experiment_tags,
    job_type="eval"
)

# One row per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : %s", observation)
        # Evaluate with deterministic actions, as the TQC, TRPO and
        # RecurrentPPO evaluation cells do, so the logged rewards do not
        # include action-sampling noise.
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = eval_env.step(action)
        if done:
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()

# NOTE(review): this reads "total_reward" from the info dict of the last step;
# presumably the environment accumulates it over all episodes - confirm.
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()

In [None]:
!wandb sync wandb/latest-run

## DDPG: Deep Deterministic Policy Gradient

### Train

In [None]:
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

# Training environment, wrapped in SB3's Monitor and then in gym's
# RecordEpisodeStatistics (records stats such as returns).
env = RecordEpisodeStatistics(
    Monitor(make('VPPBiddingEnv-TRAIN-v1', render_mode="human"))
)

# Run configuration; it is also handed to wandb.init below.
config = dict(
    policy='MultiInputPolicy',
    total_timesteps=experiment_timesteps,  # original note: 557
)

# Action noise for DDPG, sized by the last dimension of the action space.
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(
    mean=np.zeros(n_actions),
    sigma=0.1 * np.ones(n_actions),
)

wandb.init(
    project="RL-VPP-Training",
    entity="jlu237",
    job_type="training",
    tags=["DDPG", *experiment_tags],
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    monitor_gym=True,       # automatically upload gym environments' videos
    save_code=True,
)

model = DDPG(
    config['policy'],
    env,
    action_noise=action_noise,
    verbose=0,
    tensorboard_log="runs/ddpg",
)

# The callback is created after wandb.init, right before training starts.
wandb_callback = WandbCallback(gradient_save_freq=1, verbose=0)
model.learn(
    total_timesteps=config['total_timesteps'],
    log_interval=1,
    callback=wandb_callback,
)
wandb.finish()


In [None]:
!wandb sync wandb/latest-run

### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

# Evaluation environment, wrapped to record stats such as returns.
eval_env = RecordEpisodeStatistics(
    make('VPPBiddingEnv-EVAL-v1', render_mode="human")
)

wandb.init(
    project="RL-VPP-Evaluation",
    entity="jlu237",
    job_type="eval",
    tags=["DDPG", *experiment_tags],
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    save_code=True,
)

# One row per finished episode.
tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : %s", observation)
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if not done:
            continue
        # Episode finished: record the bid submission time and the reward.
        tbl.add_data(i_episode, info["bid_submission_time"])
        wandb.log({"episode_reward": reward, "episode": i_episode})
        break
wandb.log({"bid_submission_time": tbl})

eval_env.close()

# Average based on the "total_reward" entry of the last step's info dict.
mean_run_reward = info["total_reward"] / episodes
wandb.run.summary["mean_run_reward"] = mean_run_reward
print(f"Mean Run Reward: {mean_run_reward!s}")
wandb.finish()

In [None]:
!wandb sync wandb/latest-run