# Run Agent in Environment with different Algorithms

In [1]:
# Basics
import os
import warnings

# Data 
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
import numpy as np

# Logging
import logging
import wandb
from wandb.integration.sb3 import WandbCallback

from gym import make

  if not hasattr(tensorboard, '__version__') or LooseVersion(tensorboard.__version__) < LooseVersion('1.15'):


## Register the Environment 

In [2]:
from gym.envs.registration import register
   
register(
    id="VPPBiddingEnv-TRAIN-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "DEBUG", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"training",
           }
)

'''register(
    id="VPPBiddingGoalEnv-TRAIN-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_GoalEnv:VPPBiddingGoalEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "DEBUG", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"training",
           }
)'''

register(
    id="VPPBiddingEnv-EVAL-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "DEBUG", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"eval",
           }
)

register(
    id="VPPBiddingEnv-TEST-v1",
    entry_point='vpp-gym.vpp_gym.envs.vpp_env:VPPBiddingEnv',
    max_episode_steps=1,
    kwargs={'config_path': "vpp_config_4.json",
            'log_level' : "INFO", # "DEBUG" , "INFO" or  "WARNING"
            'env_type' :"test",
           }
)

## Test the environment

In [3]:
from stable_baselines3.common.env_checker import check_env
# It will check your custom environment and output additional warnings if needed
env_to_check = make('VPPBiddingEnv-TEST-v1', render_mode=None)
check_env(env_to_check)

log level = info
Bid Submission time (D-1) = 2020-07-02 05:00:00+00:00
Gate Closure time (D-1) = 2020-07-02 06:00:00+00:00
Historic Data Window: from 2020-07-01 05:00:00+00:00 to 2020-07-02 04:45:00+00:00 
Forecast Data Window: from 2020-07-02 22:00:00+00:00 to 2020-07-03 21:45:00+00:00 
Current Slot Time: (D) = 2020-07-02 22:00:00+00:00
agents_bid_size = 16
agents_bid_price = 3033.846
settlement_price_DE : 16.67
self.delivery_results['slots_won'] = 
slot won: 	0
slot won: 	None
slot won: 	None
slot won: 	None
slot won: 	None
slot won: 	None
     agents bid_size = 
size: 	16
size: 	84
size: 	121
size: 	127
size: 	84
size: 	122
self.delivery_results['slot_settlement_prices_DE'] = 
price: 	16.67
price: 	0.0
price: 	0.0
price: 	0.0
price: 	0.0
price: 	0.0
Current Slot Time: (D) = 2020-07-03 02:00:00+00:00
agents_bid_size = 84
agents_bid_price = 856.40173
settlement_price_DE : 19.06
self.delivery_results['slots_won'] = 
slot won: 	0
slot won: 	0
slot won: 	None
slot won: 	None
slot won: 	N

# Stable Baselines

In [4]:
experiment_tags = ["won/lost slot +/-1000"]
experiment_timesteps = 2785

### Offline Training and later sync logs

In [5]:
import os

os.environ["WANDB_API_KEY"] = "0cea1eee5f42654eca0de365f0acca116367c9b4"
os.environ["WANDB_MODE"] = "offline"

### When Internet access:

In [None]:
!wandb sync wandb/

## SAC

### Train

In [6]:
from stable_baselines3 import SAC
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": experiment_timesteps #557
}

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["SAC"] + experiment_tags, 
    job_type="training"
)

model = SAC(config['policy'], env, verbose=0)


model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=1))
wandb.finish()


  return LooseVersion(v) >= LooseVersion(check)
  from IPython.core.display import display, HTML  # type: ignore


Using cpu device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 1        |
|    fps             | 6        |
|    time_elapsed    | 0        |
|    total_timesteps | 1        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 2        |
|    fps             | 0        |
|    time_elapsed    | 2        |
|    total_timesteps | 2        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 3        |
|    fps             | 0        |
|    time_elapsed    | 3      


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step_profit,▅▅▅▅▅▅▅▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅█▆▅
step_reward,▅▅▅▅▅▅▅▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅█▇▅
total_profit,███▇▅▅▅▅▅▅▄▅▅▅▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▂▃▃▃▃▄▄▄▆▇
total_reward,███▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁

0,1
global_step,2784.0
step_profit,32.0
step_reward,-3736.0
total_profit,-55661.63114
total_reward,-12412996.01401


In [9]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Training/runs/33ln0ptp ... [34m[1mwandb[0m: Network error (TransientError), entering retry loop.
done.


### Eval 

In [7]:
from stable_baselines3.common.monitor import Monitor

eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode=None)
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["SAC"] + experiment_tags, 
    job_type="eval"
)


tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Episode: 0 Info: {'bid_submission_time': '2020-08-02 05:00:00+00:00', 'step_reward': -3760.96, 'step_profit': 19.52, 'total_reward': -3760.96, 'total_profit': 19.52, 'TimeLimit.truncated': False, 'episode': {'r': -3760.96, 'l': 1, 't': 1.485022}}




Episode: 1 Info: {'bid_submission_time': '2020-08-03 05:00:00+00:00', 'step_reward': -3754.0, 'step_profit': 23.0, 'total_reward': -7514.96, 'total_profit': 42.52, 'TimeLimit.truncated': False, 'episode': {'r': -3754.0, 'l': 1, 't': 2.060158}}
Episode: 2 Info: {'bid_submission_time': '2020-08-04 05:00:00+00:00', 'step_reward': -3869.45, 'step_profit': 15.27, 'total_reward': -11384.41, 'total_profit': 57.79, 'TimeLimit.truncated': False, 'episode': {'r': -3869.4543, 'l': 1, 't': 2.348583}}
Episode: 3 Info: {'bid_submission_time': '2020-08-05 05:00:00+00:00', 'step_reward': -3750.0, 'step_profit': 25.0, 'total_reward': -15134.41, 'total_profit': 82.79, 'TimeLimit.truncated': False, 'episode': {'r': -3750.0, 'l': 1, 't': 2.673008}}
Episode: 4 Info: {'bid_submission_time': '2020-08-06 05:00:00+00:00', 'step_reward': -3876.36, 'step_profit': 11.82, 'total_reward': -19010.78, 'total_profit': 94.61, 'TimeLimit.truncated': False, 'episode': {'r': -3876.3608, 'l': 1, 't': 2.920026}}
Episode: 5 


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
episode_reward,▂▂▂▂▂▂▂▂▂▃▃▂▂▂▂▂▄▇▂▃▃▃▂▂▂▃▂▁▃▃▃▂▃▃▂█▂▃▂▂
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step_profit,▂▂▂▂▂▂▂▂▂▃▂▂▂▂▂▂▆▇▂▂▂▂▂▂▂▂▂▂▂▂▂▁▃▃▂█▂▂▂▂
step_reward,▂▂▂▂▂▂▂▂▂▃▃▂▂▂▂▂▄▇▂▃▃▃▂▂▂▃▂▁▃▃▃▂▃▃▂█▂▃▂▂
total_profit,▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▃▃▄▄▄▄▄▄▄▄▄▄▄▅▅▅▅▅▅▆████
total_reward,███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁

0,1
episode,139.0
episode_reward,-3687.2
global_step,139.0
mean_run_reward,-3246.72536
step_profit,56.4
step_reward,-3687.2
total_profit,44701.22648
total_reward,-454541.54703


In [None]:
!wandb sync wandb/latest-run

## TD3 

### Train

In [11]:
from stable_baselines3 import TD3
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": experiment_timesteps # 1 iter = 557
}

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
#action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=0.34709686 * np.ones(n_actions))

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["TD3"] + experiment_tags, 
    job_type="training"
)

model = TD3(config['policy'],
            env,
            verbose=0,
            tensorboard_log=f"runs/ddpg",
            gamma=0.99,
            batch_size=200, 
            buffer_size=1000000,
            learning_rate=0.2456,
            tau=0.001,
            action_noise=action_noise,
            policy_kwargs = {'net_arch': [64, 64]}
           )


model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=1))
wandb.finish()



Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display




Deprecated since Python 3.4. Use importlib.util.find_spec() instead.


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
rollout/ep_len_mean,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
rollout/ep_rew_mean,███▇▆▁▂▅▃▆▇▇▅▂▂▃▃▇▇▇▄▁▃▅▃▇▇▇▅▂▃▃▃▇▇▇▄▁▃▃
step_profit,██▇▇▆▄▄▇▇▆█▆▇▆▇▁█▆▇▇█▇▆▇███▇▅▅▄▃▇▇█▇▆▇█▄
step_reward,██▇▇▇▅▄▇█▆█▆▆▆▇▁█▆▇▇█▇▆▇███▇▅▅▅▄▇██▆▆▇█▅
time/fps,▁█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
total_profit,█████▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁
total_reward,█████▇▇▇▇▇▇▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▂▂▂▂▂▂▁▁
train/actor_loss,▂▁▂▃▇▆▆▆▄▄▆▅▇▅▆▆▇▆▆▆▅▆▅█▅▄▆▅▅▆▆▇▆▅▆▆▆▆▅▆
train/critic_loss,▁▁▁▁▄▄▅▆▇▅▅▅▆▆█▆▆▅▆▅▅█▅▆▆▆▆▅▄▅▅▇▅▇▇▆▅▅▆▇

0,1
global_step,2785.0
rollout/ep_len_mean,1.0
rollout/ep_rew_mean,-249929.29688
step_profit,-135660.81608
step_reward,-275321.63217
time/fps,2.0
total_profit,-282809196.1875
total_reward,-442305097.00798
train/actor_loss,220545.35938
train/critic_loss,37958320128.0


In [14]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Training/runs/1f9n4xgo ... [34m[1mwandb[0m: Network error (TransientError), entering retry loop.
[34m[1mwandb[0m: Network error (TransientError), entering retry loop.
[34m[1mwandb[0m: Network error (TransientError), entering retry loop.
[34m[1mwandb[0m: Network error (TransientError), entering retry loop.
done.


### Eval

In [12]:
from stable_baselines3.common.monitor import Monitor

eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode=None)
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["TD3"] + experiment_tags, 
    job_type="eval",
    #settings=wandb.Settings(start_method="thread")
)


tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Episode: 0 Info: {'bid_submission_time': '2020-08-02 05:00:00+00:00', 'step_reward': -117079.22, 'step_profit': -46084.33, 'total_reward': -117079.22, 'total_profit': -68994.89, 'TimeLimit.truncated': False, 'episode': {'r': -117079.22, 'l': 1, 't': 0.907365}}
Episode: 1 Info: {'bid_submission_time': '2020-08-03 05:00:00+00:00', 'step_reward': -96479.95, 'step_profit': -37944.86, 'total_reward': -213559.17, 'total_profit': -125529.99, 'TimeLimit.truncated': False, 'episode': {'r': -96479.95, 'l': 1, 't': 1.405828}}
Episode: 2 Info: {'bid_submission_time': '2020-08-04 05:00:00+00:00', 'step_reward': -104848.27, 'step_profit': -40686.38, 'total_reward': -318407.44, 'total_profit': -187691.88, 'TimeLimit.truncated': False, 'episode': {'r': -104848.266, 'l': 1, 't': 1.747669}}
Episode: 3 Info: {'bid_submission_time': '2020-08-05 05:00:00+00:00', 'step_reward': -113054.34, 'step_profit': -44538.17, 'total_reward': -431461.78, 'total_profit': -254208.05, 'TimeLimit.truncated': False, 'episod


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
episode_reward,▇▇▇▇▇▇▇▇▇█▇█▇▇▇███▇█▆▇▆▇▅▆▅▆▅▆▅▅▄▇▅▇▁▆▆▃
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step_profit,▇▇▇▇▇▇▇▇▇█▇█▇▇▇███▇█▆▇▆▇▅▆▅▆▅▆▅▅▄▇▆▇▁▆▅▃
step_reward,▇▇▇▇▇▇▇▇▇█▇█▇▇▇███▇█▆▇▆▇▅▆▅▆▅▆▅▅▄▇▅▇▁▆▆▃
total_profit,█████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▃▃▃▃▂▂▁▁
total_reward,█████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▄▄▄▃▃▃▃▂▂▁▁

0,1
episode,139.0
episode_reward,-508718.05669
global_step,139.0
mean_run_reward,-157817.84929
step_profit,-202941.48265
step_reward,-508718.05669
total_profit,-13089142.17415
total_reward,-22094498.90192


In [13]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Evaluation/runs/3m58ee9w ... done.


### Tuning TD3

## A2C 

### Train

In [15]:
from stable_baselines3 import A2C
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": experiment_timesteps
}

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["A2C"] + experiment_tags, 
    job_type="training"
)

model = A2C(config['policy'], env, verbose=0)


model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=1))
wandb.finish()



Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display




Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step_profit,▇▃▆▅▇▇▇▅▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇█▇▇▇▇▇▇▇▇▇▁▇▄█▃
step_reward,▇▁▆▅▇▇▇▅▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▇▇▇▁▇▃█▃
total_profit,█▇▇▇▇▆▆▅▅▅▅▅▅▅▄▄▄▄▄▄▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁
total_reward,███▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁

0,1
global_step,2784.0
step_profit,-22726.92858
step_reward,-49453.85715
total_profit,-6282464.96821
total_reward,-25850183.85075


In [16]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Training/runs/3ucj2ws5 ... done.


### Eval 

In [17]:
from stable_baselines3.common.monitor import Monitor

eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode=None)
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["A2C"] + experiment_tags, 
    job_type="eval",
)


tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Episode: 0 Info: {'bid_submission_time': '2020-08-02 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -6000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 0.902742}}
Episode: 1 Info: {'bid_submission_time': '2020-08-03 05:00:00+00:00', 'step_reward': -7173.61, 'step_profit': -1586.81, 'total_reward': -13173.61, 'total_profit': -1586.81, 'TimeLimit.truncated': False, 'episode': {'r': -7173.6123, 'l': 1, 't': 1.146415}}
Episode: 2 Info: {'bid_submission_time': '2020-08-04 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -19173.61, 'total_profit': -1586.81, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 1.363374}}
Episode: 3 Info: {'bid_submission_time': '2020-08-05 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -25173.61, 'total_profit': -1586.81, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 1.613624}}
Episode: 4 Info: {'bid


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
episode_reward,▄▄▄▄▄▄▄▆▆▁▄▇▄▄▄▄▄█▄▄▄▇▄▄▄▄▄▄▄▄▄▄▄▄▄▆▄▄▄▄
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step_profit,▃▅▅▅▅▅▅▅▆▁▅▆▅▅▅▅▅█▅▅▅▇▅▅▅▅▅▅▅▅▅▅▅▅▅▆▅▅▅▅
step_reward,▄▄▄▄▄▄▄▆▆▁▄▇▄▄▄▄▄█▄▄▄▇▄▄▄▄▄▄▄▄▄▄▄▄▄▆▄▄▄▄
total_profit,████████████▇▇▇▇▇████▇▇▇▇▇▇▇▇▇▇▇▇▇▇██▁▁▁
total_reward,████▇▇▇▇▇▇▇▆▆▆▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▄▃▃▃▃▃▁▁▁

0,1
episode,139.0
episode_reward,-6000.0
global_step,139.0
mean_run_reward,-7042.27986
step_profit,0.0
step_reward,-6000.0
total_profit,-173380.53394
total_reward,-985919.18069


In [18]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Evaluation/runs/376vwecs ... done.


## PPO

### Train

In [19]:
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": experiment_timesteps #557
}

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["PPO"] + experiment_tags, 
    job_type="training"
)

model = PPO(config['policy'], env, verbose=0)


model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=1))
wandb.finish()



Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display




Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step_profit,███▇██▆█▇▆████▁▆▇████████▅█▇███▇▄████▆█▇
step_reward,███▇██▆█▇▇████▁▇▇████████▆█████▇▆████▇██
total_profit,████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁
total_reward,████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁▁

0,1
global_step,4095.0
step_profit,0.0
step_reward,-6000.0
total_profit,-75726859.93853
total_reward,-149741942.14198


In [20]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Training/runs/270z6gl2 ... [34m[1mwandb[0m: Network error (TransientError), entering retry loop.
done.


### Eval 

In [21]:
from stable_baselines3.common.monitor import Monitor

eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode=None)
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["PPO"] + experiment_tags, 
    job_type="eval"
)


tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Episode: 0 Info: {'bid_submission_time': '2020-08-02 05:00:00+00:00', 'step_reward': -50485.4, 'step_profit': -23242.7, 'total_reward': -50485.4, 'total_profit': -23242.7, 'TimeLimit.truncated': False, 'episode': {'r': -50485.4, 'l': 1, 't': 0.952134}}
Episode: 1 Info: {'bid_submission_time': '2020-08-03 05:00:00+00:00', 'step_reward': -40428.91, 'step_profit': -18214.46, 'total_reward': -90914.31, 'total_profit': -41457.16, 'TimeLimit.truncated': False, 'episode': {'r': -40428.91, 'l': 1, 't': 1.190251}}
Episode: 2 Info: {'bid_submission_time': '2020-08-04 05:00:00+00:00', 'step_reward': -43748.86, 'step_profit': -19874.43, 'total_reward': -134663.17, 'total_profit': -61331.58, 'TimeLimit.truncated': False, 'episode': {'r': -43748.855, 'l': 1, 't': 1.449345}}
Episode: 3 Info: {'bid_submission_time': '2020-08-05 05:00:00+00:00', 'step_reward': -38703.43, 'step_profit': -12753.18, 'total_reward': -173366.59, 'total_profit': -85281.83, 'TimeLimit.truncated': False, 'episode': {'r': -3870


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
episode_reward,▅▅▇▆▅▇▅▇▇▇▆▇▇▆▇▇▇▇▇▇▇▇▇▇▂▅▇▇▅▆▁▄▇▇▇█▇▇▇▇
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step_profit,▅▆▇▆▅▇▅▇▇▇▆▇▇▆▇▇▇▇▇▇▇▇▇▇▂▅▇▇▅▆▁▄▇▇▇█▇▇▇▇
step_reward,▅▅▇▆▅▇▅▇▇▇▆▇▇▆▇▇▇▇▇▇▇▇▇▇▂▅▇▇▅▆▁▄▇▇▇█▇▇▇▇
total_profit,██████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▅▅▄▄▃▂▂▂▂▂▂▂▁▁
total_reward,████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▆▆▆▆▆▅▅▄▄▃▂▂▂▂▂▂▂▁▁

0,1
episode,139.0
episode_reward,-6000.0
global_step,139.0
mean_run_reward,-29002.85221
step_profit,0.0
step_reward,-6000.0
total_profit,-1984685.62304
total_reward,-4060399.30603


In [22]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Evaluation/runs/2z66qls8 ... done.


## DDPG: Deep Deterministic Policy Gradient (DDPG) 

### Train

In [None]:
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": experiment_timesteps #557
}

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma= 0.1 * np.ones(n_actions))

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["DDPG"]+ experiment_tags, 
    job_type="training"
)

model = DDPG(config['policy'], env, action_noise=action_noise, verbose=1, tensorboard_log=f"runs/ddpg")

model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=1))
wandb.finish()



Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Using cpu device
Wrapping the env in a DummyVecEnv.




Logging to runs/ddpg/DDPG_165
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 1        |
|    fps             | 2        |
|    time_elapsed    | 0        |
|    total_timesteps | 1        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 2        |
|    fps             | 1        |
|    time_elapsed    | 1        |
|    total_timesteps | 2        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 3        |
|    fps             | 1        |
|    time_elapsed    | 1        |
|    total_timeste

In [27]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Evaluation/runs/1ymqh0sb ... done.


### Eval 

In [30]:
from stable_baselines3.common.monitor import Monitor

eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode=None)
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["DDPG"]+ experiment_tags, 
    job_type="eval"
)


tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Episode: 0 Info: {'bid_submission_time': '2020-08-02 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -6000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 1.006862}}
Episode: 1 Info: {'bid_submission_time': '2020-08-03 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -12000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 1.447718}}
Episode: 2 Info: {'bid_submission_time': '2020-08-04 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -18000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 1.825103}}
Episode: 3 Info: {'bid_submission_time': '2020-08-05 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -24000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 2.115309}}
Episode: 4 Info: {'bid_submission_time': '2020-08-06 


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
episode_reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step_profit,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step_reward,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
total_profit,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
total_reward,███▇▇▇▇▇▇▆▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▁▁▁

0,1
episode,139.0
episode_reward,-6000.0
global_step,139.0
mean_run_reward,-6000.0
step_profit,0.0
step_reward,-6000.0
total_profit,0.0
total_reward,-840000.0


In [31]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Evaluation/runs/1djf25io ... done.


## HER 

- HER is an algorithm that works with off-policy methods (SAC,TQC, TD3 and DDPG)
- HER is no longer a separate algorithm but a replay buffer class HerReplayBuffer that must be passed to an off-policy algorithm when using MultiInputPolicy (to have Dict observation support).
- HER requires the environment to inherits from gym.GoalEnv
- For performance reasons, the maximum number of steps per episodes must be specified. In most cases, it will be inferred if you specify max_episode_steps when registering the environment or if you use a gym.wrappers.TimeLimit (and env.spec is not None). Otherwise, you can directly pass max_episode_length to the model constructor

### Train

### Eval 

## Contrib packages: TQC

### Train

In [32]:
from sb3_contrib import TQC
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": experiment_timesteps #557
}

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["TQC"] + experiment_tags, 
    job_type="training"
)


policy_kwargs = dict(n_critics=2, n_quantiles=25)
model = TQC(config['policy'], env, top_quantiles_to_drop_per_net=2, verbose=1, policy_kwargs=policy_kwargs)


model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=1))
wandb.finish()



Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Using cpu device
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 1        |
|    fps             | 5        |
|    time_elapsed    | 0        |
|    total_timesteps | 1        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 2        |
|    fps             | 4        |
|    time_elapsed    | 0        |
|    total_timesteps | 2        |
---------------------------------
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 1        |
|    ep_rew_mean     | -6000.0  |
| time/              |          |
|    episodes        | 3        |
|    fps             | 4        |
|    time_elapsed    | 0      


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step_profit,███████▄▅██▁██▄█████████████████████████
step_reward,▇▇▇▇▇█▇▄▅▇▇▁▇▇▄█▇█████████▇█████████████
total_profit,█████▆▆▅▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
total_reward,████▇▆▆▅▄▄▄▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
global_step,2784.0
step_profit,73.6
step_reward,-1415.2
total_profit,-6653133.56493
total_reward,-21980410.4159


In [33]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Training/runs/iphvv4pr ... [34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (Con

### Eval 

In [34]:
from stable_baselines3.common.monitor import Monitor

eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode=None)
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["TQC"] + experiment_tags, 
    job_type="eval"
)


tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation, deterministic=True)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Episode: 0 Info: {'bid_submission_time': '2020-08-02 05:00:00+00:00', 'step_reward': -1502.17, 'step_profit': 37.37, 'total_reward': -1502.17, 'total_profit': 60.46, 'TimeLimit.truncated': False, 'episode': {'r': -1502.17, 'l': 1, 't': 1.035047}}
Episode: 1 Info: {'bid_submission_time': '2020-08-03 05:00:00+00:00', 'step_reward': -1487.74, 'step_profit': 43.81, 'total_reward': -2989.91, 'total_profit': 128.91, 'TimeLimit.truncated': False, 'episode': {'r': -1487.74, 'l': 1, 't': 1.332041}}
Episode: 2 Info: {'bid_submission_time': '2020-08-04 05:00:00+00:00', 'step_reward': -1669.25, 'step_profit': 52.09, 'total_reward': -4659.16, 'total_profit': 207.57, 'TimeLimit.truncated': False, 'episode': {'r': -1669.2521, 'l': 1, 't': 1.61975}}
Episode: 3 Info: {'bid_submission_time': '2020-08-05 05:00:00+00:00', 'step_reward': -1689.88, 'step_profit': 43.31, 'total_reward': -6349.04, 'total_profit': 274.39, 'TimeLimit.truncated': False, 'episode': {'r': -1689.8757, 'l': 1, 't': 1.878274}}
Episod


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
episode,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
episode_reward,▃▁▃▃▃▂▃▃▂▃▃▃▂▂▂▃▃▃▃▄▅▅▄▄▄▄▄▂▅▅▅▇▆▆▇█▅▇▄▄
global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
step_profit,▁▁▂▂▂▂▃▂▁▂▁▂▁▁▁▂▁▂▂▄▅▅▃▄▃▄▄▃▄▄▅█▇▆▆█▄█▃▄
step_reward,▃▁▃▃▃▂▃▃▂▃▃▃▂▂▂▃▃▃▃▄▅▅▄▄▄▄▄▂▅▅▅▇▆▆▇█▅▇▄▄
total_profit,▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▆▆▇▇▇██
total_reward,███▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁

0,1
episode,139.0
episode_reward,-1245.0
global_step,139.0
mean_run_reward,-1289.11407
step_profit,135.0
step_reward,-1245.0
total_profit,28169.65072
total_reward,-180475.96838


In [35]:
!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Evaluation/runs/4pfhsrwy ... done.


## Contrib packages: TRPO

### Train

In [36]:
from sb3_contrib import TRPO
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


config = {
    "policy": 'MultiInputPolicy',
    "total_timesteps": experiment_timesteps #557
}

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["TRPO"] + experiment_tags, 
    job_type="training"
)

model = TRPO(config['policy'], env, verbose=1)


model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=1))
wandb.finish()



Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Using cpu device
Wrapping the env in a DummyVecEnv.
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 1         |
|    ep_rew_mean     | -69038.03 |
| time/              |           |
|    fps             | 3         |
|    iterations      | 1         |
|    time_elapsed    | 586       |
|    total_timesteps | 2048      |
----------------------------------
-----------------------------------------
| rollout/                  |           |
|    ep_len_mean            | 1         |
|    ep_rew_mean            | -10044.82 |
| time/                     |           |
|    fps                    | 3         |
|    iterations             | 2         |
|    time_elapsed           | 1160      |
|    total_timesteps        | 4096      |
| train/                    |           |
|    explained_variance     | -2.62e-06 |
|    is_line_search_success | 1         |
|    kl_divergence_loss     | 0.00822   |
|    learning_rate          | 0.001     |
|    n_upd


Passing a schema to Validator.iter_errors is deprecated and will be removed in a future release. Call validator.evolve(schema=new_schema).iter_errors(...) instead.


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
step_profit,████████▄▁▅▆▇▇▂██▇█▄█▇██▂▂▆▆███▁█▆█▄███▇
step_reward,████████▅▁▅▇▇▇▂██▇█▅████▃▄▇▇███▃█▇█▅███▇
total_profit,████▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▁▁▁
total_reward,████▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▁▁▁▁

0,1
global_step,4095.0
step_profit,2736.63
step_reward,6242.86
total_profit,-60092148.54263
total_reward,-125321544.17938


In [37]:
%%time 

!wandb sync wandb/latest-run

Find logs at: /Users/Jan-Lukas.Pflaum/Dev/masterthesis/wandb/debug-cli.Jan-Lukas.Pflaum.log
Syncing: https://wandb.ai/jlu237/RL-VPP-Training/runs/21h3l549 ... [34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (ConnectTimeout), entering retry loop.
[34m[1mwandb[0m: Network error (Con

### Eval 

In [38]:
from stable_baselines3.common.monitor import Monitor

eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode=None)
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["TRPO"], 
    job_type="eval"
)


tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    for t in range(1):
        eval_env.render()
        logging.debug("observation : " + str(observation))
        action, _states = model.predict(observation,  deterministic=True)
        observation, reward, done, info = eval_env.step(action)
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()


Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display



Episode: 0 Info: {'bid_submission_time': '2020-08-02 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -6000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 0.963531}}
Episode: 1 Info: {'bid_submission_time': '2020-08-03 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -12000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 1.228084}}
Episode: 2 Info: {'bid_submission_time': '2020-08-04 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -18000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 1.507595}}
Episode: 3 Info: {'bid_submission_time': '2020-08-05 05:00:00+00:00', 'step_reward': -6000, 'step_profit': 0, 'total_reward': -24000.0, 'total_profit': 0.0, 'TimeLimit.truncated': False, 'episode': {'r': -6000.0, 'l': 1, 't': 1.762765}}
Episode: 4 Info: {'bid_submission_time': '2020-08-06 

KeyboardInterrupt: 

In [None]:
!wandb sync wandb/latest-run

## Contrib packages: RecurrentPPO 

### Train

In [None]:
from sb3_contrib import RecurrentPPO
import numpy as np
from stable_baselines3.common.monitor import Monitor
from gym.wrappers import RecordEpisodeStatistics
from wandb.integration.sb3 import WandbCallback

env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env = Monitor(env) 
env = RecordEpisodeStatistics(env) # record stats such as returns


config = {
    "policy": 'MultiInputLstmPolicy',
    "total_timesteps": experiment_timesteps #557
}

wandb.init(
    config=config,
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Training",
    monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["RecurrentPPO"] + experiment_tags, 
    job_type="training"
)

model = RecurrentPPO(config['policy'], env, verbose=1)


model.learn(total_timesteps=config['total_timesteps'],
            log_interval=1,
            callback=WandbCallback(
                gradient_save_freq=1,
                verbose=1))
wandb.finish()


In [None]:
!wandb sync wandb/latest-run

### Eval 

In [None]:
from stable_baselines3.common.monitor import Monitor

eval_env = make('VPPBiddingEnv-EVAL-v1', render_mode=None)
eval_env = RecordEpisodeStatistics(eval_env) # record stats such as returns

wandb.init(
    sync_tensorboard=True,  # automatically upload SB3's tensorboard metrics to W&B
    project="RL-VPP-Evaluation",
    #monitor_gym=True,       # automatically upload gym environements' videos
    save_code=True,
    entity="jlu237", 
    tags=["RecurrentPPO"] + experiment_tags, 
    job_type="eval"
)


tbl = wandb.Table(columns=["episode", "bid_submission_time"])

episodes = 140

for i_episode in range(episodes):
    observation = eval_env.reset()
    lstm_states = None
    num_envs = 1
    # Episode start signals are used to reset the lstm states
    episode_starts = np.ones((num_envs,), dtype=bool)
    for t in range(1):
        eval_env.render()
        logging.debug("observation : " + str(observation))
        action, lstm_states = model.predict(observation, state=lstm_states, episode_start=episode_starts, deterministic=True)
        observation, reward, done, info = eval_env.step(action)
        episode_starts = dones
        if done:
            print('Episode: {} Info: {}'.format(i_episode, info))
            tbl.add_data(i_episode, info["bid_submission_time"])
            wandb.log({"episode_reward": reward,
                       "episode": i_episode
                      })
            
            break
wandb.log({"bid_submission_time" : tbl})

eval_env.close()
mean_run_reward = info["total_reward"] / episodes

wandb.run.summary["mean_run_reward"] = mean_run_reward
print("Mean Run Reward: " + str(mean_run_reward))
wandb.finish()

In [None]:
!wandb sync wandb/latest-run

## Contrib packages: ARS

### Train

### Eval 

## Tuning DDPG

### Parameters

- policy = "MlpPolicy" , "CnnPolicy" , "MultiInputPolicy"
- **learning_rate** = staic or range(1,0)
- buffer_size (int) – size of the replay buffer
- **learning_starts (int)** – how many steps of the model to collect transitions for before learning starts
    -  For a fixed number of steps at the beginning (set with the start_steps keyword argument), the agent takes actions which are sampled from a uniform random distribution over valid actions. After that, it returns to normal DDPG exploration.
- batch_size (int) – Minibatch size for each gradient update
- **tau (float)** – the soft update coefficient (“Polyak update”, between 0 and 1)
- gamma (float) – the discount factor
- train_freq (Union[int, Tuple[int, str]]) – Update the model every train_freq steps. Alternatively pass a tuple of frequency and unit like (5, "step") or (2, "episode").
- gradient_steps (int) – How many gradient steps to do after each rollout (see train_freq) Set to -1 means to do as many gradient steps as steps done in the environment during the rollout.
- action_noise (Optional[ActionNoise]) – the action noise type (None by default), this can help for hard exploration problem. Cf common.noise for the different action noise type.
    -  uncorrelated, mean-zero Gaussian noise works perfectly well. 
    -  To facilitate getting higher-quality training data, you may reduce the scale of the noise over the course of training. (We do not do this in our implementation, and keep noise scale fixed throughout.)


- replay_buffer_class (Optional[ReplayBuffer]) – Replay buffer class to use (for instance HerReplayBuffer). If None, it will be automatically selected.
- optimize_memory_usage (bool) – Enable a memory efficient variant of the replay buffer at a cost of more complexity. See https://github.com/DLR-RM/stable-baselines3/issues/37#issuecomment-637501195
- create_eval_env (bool) – Whether to create a second environment that will be used for evaluating the agent periodically. (Only available when passing string for the environment)

- seed (Optional[int]) – Seed for the pseudo random generators
- _init_setup_model (bool) – Whether or not to build the network at the creation of the instance





stable_baselines3.ddpg.MlpPolicy Parameters
- lr_schedule (Callable[[float], float]) – Learning rate schedule (could be constant)
- n_critics (int) – Number of critic networks to create.

stable_baselines3.ddpg.MlpPolicy.set_training_mode()
- mode (bool) – if true, set to training mode, else set to evaluation mode

stable_baselines3.ddpg.CnnPolicy

stable_baselines3.ddpg.MultiInputPolicy


In [None]:
# hide all deprecation warnings from tensorflow
#import tensorflow as tf
#tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

import optuna

#from stable_baselines import PPO2
from stable_baselines3 import DDPG
from stable_baselines3 import HerReplayBuffer
from gym.wrappers import RecordEpisodeStatistics
from stable_baselines3.common.noise import NormalActionNoise
#from stable_baselines.common.evaluation import evaluate_policy
#from stable_baselines.common.cmd_util import make_vec_env

# https://colab.research.google.com/github/araffin/rl-tutorial-jnrr19/blob/master/5_custom_gym_env.ipynb
#from custom_env import GoLeftEnv

# The noise objects for DDPG
n_actions = env.action_space.shape[-1]
normal_action_noise = NormalActionNoise(mean=np.zeros(n_actions), sigma=0.1 * np.ones(n_actions))


def optimize_ddpg(trial):
    """ Learning hyperparamters we want to optimise"""
    
    replay_buffer_class = trial.suggest_categorical("replay_buffer_class", ["HER", "None"])
    replay_buffer_class = {"HER": HerReplayBuffer, "None": None}[replay_buffer_class]
    
    action_noise = trial.suggest_categorical("action_noise", ["action_noise", "None"])
    action_noise = {"action_noise": normal_action_noise, "None": None}[action_noise]
    
    params =  {
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.0001, 1.0), #default: 0.001
        'learning_starts': int(trial.suggest_int('learning_starts', 0, 200, 10)),  #default: 100
        'batch_size': int(trial.suggest_int('batch_size', 0, 200,10)),  #default: 100
        'tau': trial.suggest_loguniform('tau', 0.001, 1.0), #default: 0.005
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999), # default: gamma=0.99
        'replay_buffer_class' : replay_buffer_class,
        'action_noise' : action_noise
    }
    
    return params
        



def optimize_agent(trial):
    """ Train the model and optimize
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    
    model_params = optimize_ddpg(trial)
    
    # init tracking experiment.
    # hyper-parameters, trial id are stored.
    config = dict(trial.params)
    config["trial.number"] = trial.number
    wandb.init(
        project="RL-optuna",
        entity="jlu237", 
        sync_tensorboard=True,
        config=config,
        reinit=True
    )
    
    env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
    env = Monitor(env) 
    env = RecordEpisodeStatistics(env) # record stats such as returns


    model = DDPG('MultiInputPolicy', env, verbose=0, tensorboard_log=f"runs/ddpg", seed = 1, **model_params)
    model.learn(total_timesteps=557, log_interval=1)
    
    wandb.finish()
    
study = optuna.create_study()
try:
    study.optimize(optimize_agent, n_trials=20)
except KeyboardInterrupt:
    print('Interrupted by keyboard.')

In [None]:
env = make('VPPBiddingEnv-TRAIN-v1', render_mode=None)
env.observation_space.spaces["observation"]

In [None]:
print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


In [None]:
model.get_parameters()["critic.optimizer"]["param_groups"]

In [None]:
model.get_parameters()["actor.optimizer"]["param_groups"]

In [None]:
# !apt-get install swig cmake ffmpeg freeglut3-dev xvfb

In [None]:
# Alternative from araffin for optuna from: https://github.com/optuna/optuna-examples/blob/52ed3aff3e3e936be3873b5acc6ee3ccdadea914/rl/sb3_simple.py#L60

""" Optuna example that optimizes the hyperparameters of
a reinforcement learning agent using A2C implementation from Stable-Baselines3
on a OpenAI Gym environment.

This is a simplified version of what can be found in https://github.com/DLR-RM/rl-baselines3-zoo.

You can run this example as follows:
    $ python sb3_simple.py

"""
from typing import Any
from typing import Dict

import gym
import optuna
from optuna.pruners import MedianPruner
from optuna.samplers import TPESampler
from stable_baselines3 import A2C
from stable_baselines3.common.callbacks import EvalCallback
import torch
import torch.nn as nn


N_TRIALS = 100
N_STARTUP_TRIALS = 5
N_EVALUATIONS = 2
N_TIMESTEPS = int(2e4)
EVAL_FREQ = int(N_TIMESTEPS / N_EVALUATIONS)
N_EVAL_EPISODES = 3

ENV_ID = "CartPole-v1"

DEFAULT_HYPERPARAMS = {
    "policy": "MlpPolicy",
    "env": ENV_ID,
}


def sample_a2c_params(trial: optuna.Trial) -> Dict[str, Any]:
    """Sampler for A2C hyperparameters."""
    gamma = 1.0 - trial.suggest_float("gamma", 0.0001, 0.1, log=True)
    max_grad_norm = trial.suggest_float("max_grad_norm", 0.3, 5.0, log=True)
    gae_lambda = 1.0 - trial.suggest_float("gae_lambda", 0.001, 0.2, log=True)
    n_steps = 2 ** trial.suggest_int("exponent_n_steps", 3, 10)
    learning_rate = trial.suggest_float("lr", 1e-5, 1, log=True)
    ent_coef = trial.suggest_float("ent_coef", 0.00000001, 0.1, log=True)
    ortho_init = trial.suggest_categorical("ortho_init", [False, True])
    net_arch = trial.suggest_categorical("net_arch", ["tiny", "small"])
    activation_fn = trial.suggest_categorical("activation_fn", ["tanh", "relu"])

    # Display true values
    trial.set_user_attr("gamma_", gamma)
    trial.set_user_attr("gae_lambda_", gae_lambda)
    trial.set_user_attr("n_steps", n_steps)

    net_arch = [
        {"pi": [64], "vf": [64]} if net_arch == "tiny" else {"pi": [64, 64], "vf": [64, 64]}
    ]

    activation_fn = {"tanh": nn.Tanh, "relu": nn.ReLU}[activation_fn]

    return {
        "n_steps": n_steps,
        "gamma": gamma,
        "gae_lambda": gae_lambda,
        "learning_rate": learning_rate,
        "ent_coef": ent_coef,
        "max_grad_norm": max_grad_norm,
        "policy_kwargs": {
            "net_arch": net_arch,
            "activation_fn": activation_fn,
            "ortho_init": ortho_init,
        },
    }


class TrialEvalCallback(EvalCallback):
    """Callback used for evaluating and reporting a trial."""

    def __init__(
        self,
        eval_env: gym.Env,
        trial: optuna.Trial,
        n_eval_episodes: int = 5,
        eval_freq: int = 10000,
        deterministic: bool = True,
        verbose: int = 0,
    ):

        super().__init__(
            eval_env=eval_env,
            n_eval_episodes=n_eval_episodes,
            eval_freq=eval_freq,
            deterministic=deterministic,
            verbose=verbose,
        )
        self.trial = trial
        self.eval_idx = 0
        self.is_pruned = False

    def _on_step(self) -> bool:
        if self.eval_freq > 0 and self.n_calls % self.eval_freq == 0:
            super()._on_step()
            self.eval_idx += 1
            self.trial.report(self.last_mean_reward, self.eval_idx)
            # Prune trial if need
            if self.trial.should_prune():
                self.is_pruned = True
                return False
        return True


def objective(trial: optuna.Trial) -> float:

    kwargs = DEFAULT_HYPERPARAMS.copy()
    # Sample hyperparameters
    kwargs.update(sample_a2c_params(trial))
    # Create the RL model
    model = A2C(**kwargs)
    # Create env used for evaluation
    eval_env = gym.make(ENV_ID)
    # Create the callback that will periodically evaluate
    # and report the performance
    eval_callback = TrialEvalCallback(
        eval_env, trial, n_eval_episodes=N_EVAL_EPISODES, eval_freq=EVAL_FREQ, deterministic=True
    )

    nan_encountered = False
    try:
        model.learn(N_TIMESTEPS, callback=eval_callback)
    except AssertionError as e:
        # Sometimes, random hyperparams can generate NaN
        print(e)
        nan_encountered = True
    finally:
        # Free memory
        model.env.close()
        eval_env.close()

    # Tell the optimizer that the trial failed
    if nan_encountered:
        return float("nan")

    if eval_callback.is_pruned:
        raise optuna.exceptions.TrialPruned()

    return eval_callback.last_mean_reward


if __name__ == "__main__":
    # Set pytorch num threads to 1 for faster training
    torch.set_num_threads(1)

    sampler = TPESampler(n_startup_trials=N_STARTUP_TRIALS)
    # Do not prune before 1/3 of the max budget is used
    pruner = MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_EVALUATIONS // 3)

    study = optuna.create_study(sampler=sampler, pruner=pruner, direction="maximize")
    try:
        study.optimize(objective, n_trials=N_TRIALS, timeout=600)
    except KeyboardInterrupt:
        pass

    print("Number of finished trials: ", len(study.trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

    print("  User attrs:")
    for key, value in trial.user_attrs.items():
        print("    {}: {}".format(key, value))

In [None]:
# code from https://github.com/DLR-RM/rl-baselines3-zoo/blob/master/utils/hyperparams_opt.py#L340

def sample_ddpg_params(trial: optuna.Trial) -> Dict[str, Any]:
    """
    Sampler for DDPG hyperparams.
    :param trial:
    :return:
    """
    gamma = trial.suggest_categorical("gamma", [0.9, 0.95, 0.98, 0.99, 0.995, 0.999, 0.9999])
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 1)
    batch_size = trial.suggest_categorical("batch_size", [16, 32, 64, 100, 128, 256, 512, 1024, 2048])
    buffer_size = trial.suggest_categorical("buffer_size", [int(1e4), int(1e5), int(1e6)])
    # Polyak coeff
    tau = trial.suggest_categorical("tau", [0.001, 0.005, 0.01, 0.02, 0.05, 0.08])

    train_freq = trial.suggest_categorical("train_freq", [1, 4, 8, 16, 32, 64, 128, 256, 512])
    gradient_steps = train_freq

    noise_type = trial.suggest_categorical("noise_type", ["ornstein-uhlenbeck", "normal", None])
    noise_std = trial.suggest_uniform("noise_std", 0, 1)

    # NOTE: Add "verybig" to net_arch when tuning HER (see TD3)
    net_arch = trial.suggest_categorical("net_arch", ["small", "medium", "big"])
    # activation_fn = trial.suggest_categorical('activation_fn', [nn.Tanh, nn.ReLU, nn.ELU, nn.LeakyReLU])

    net_arch = {
        "small": [64, 64],
        "medium": [256, 256],
        "big": [400, 300],
    }[net_arch]

    hyperparams = {
        "gamma": gamma,
        "tau": tau,
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "buffer_size": buffer_size,
        "train_freq": train_freq,
        "gradient_steps": gradient_steps,
        "policy_kwargs": dict(net_arch=net_arch),
    }

    if noise_type == "normal":
        hyperparams["action_noise"] = NormalActionNoise(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )
    elif noise_type == "ornstein-uhlenbeck":
        hyperparams["action_noise"] = OrnsteinUhlenbeckActionNoise(
            mean=np.zeros(trial.n_actions), sigma=noise_std * np.ones(trial.n_actions)
        )

    if trial.using_her_replay_buffer:
        hyperparams = sample_her_params(trial, hyperparams)

    return hyperparams


In [None]:
!git clone --recursive https://github.com/DLR-RM/rl-baselines3-zoo

In [None]:
#!cd rl-baselines3-zoo/

In [None]:
!pip install -r rl-baselines3-zoo/requirements.txt

In [None]:
!python rl-baselines3-zoo/train.py --algo ddpg --env VPPBiddingEnv-TRAIN-v1 -n 697 -optimize --n-trials 5 --n-jobs -1 \
  --sampler tpe --pruner median

In [None]:
!python rl-baselines3-zoo/scripts/parse_study.py -i path/to/study.pkl --print-n-best-trials 10 --save-n-best-hyperparameters 10


### PPO - Proximal Policy Optimization algorithm 

#### Train the agent

#### Evaluate Agent

## A2C - synchronous, deterministic variant of Asynchronous Advantage Actor Critic (A3C)

#### Training

#### Eval

## Other Algorithm 

In [None]:
# todo

## DQN -- needs Discrete Action Space. 

# Testing

#### Run Episodes

### Check the Environment

# Keras 

## 2. Create a Deep Learning Model with Keras

## 3. Build Agent with Keras-RL


## 4. Reloading Agent from Memory


# Archive


### 2. create test list , check if date is in test list, if yes, skip day 

1. data set start date = 01.07.2020
2. training start date = 02.07.2020 
3. first slot lower boundary = 02.07.2020 22.00
4. make test set 
    - take time_features_df
    - substract 2 hours from each timestamp = start of slot 
    - iterate over df and get date every 5 days, add to list = test list. 
5. in training mode -> skip dates in list. 
6. in test mode -> take only dates from test list. 
7. unterschiedliche testsets erstellen? izzy meinte, zusammenhängende woche wäre gut (seasonality)

Vorgehen:
Ab der ersten Vollen Woche: Woche nehmen und Testset-Liste hinzufügen, 
dann skip 5 wochen , dann 1 woche test woche 


### scaler for observations


In [None]:
# scaler for observations

scaler = MinMaxScaler(feature_range=(-1,1))

a_raw = asset_data_historic
print("a_raw")

print(a_raw)

scaler.fit(np.array(a_raw).reshape(-1, 1))

b_transformed = scaler.transform((a_raw.reshape(-1, 1)))
print("b_transformed")

print(b_transformed)

# convert from array to list
c_list = [x for xs in list(b_transformed) for x in xs]
print("c_list")

print(c_list)

# transform back to 

d_transformed_back = (scaler.inverse_transform(np.array(c_list).reshape(-1, 1)))
print("d_transformed_back")
print(d_transformed_back)

print("e_array")
e_array = d_transformed_back.flatten()
print(e_array)


print("f_list")
f_list = [x for xs in list(d_transformed_back) for x in xs]

print(f_list)