In [1]:
!pip install stable-baselines3==2.0.0 
!pip install numpy
!pip install tensorflow
%load_ext tensorboard
import tensorflow as tf



In [2]:
import numpy as np
import sys
import matplotlib.pyplot as plt
from IPython.display import clear_output
from stable_baselines3 import DQN
import requests

In [3]:
clear_output(wait=True)

In [4]:
url_api = 'https://api.boptest.net'
url = "http://localhost:80"

In [5]:
sys.path.insert(0,'boptestGym')
from boptestGymEnv import BoptestGymEnv

In [7]:
class BoptestGymEnvCustomReward(BoptestGymEnv):
    def get_reward(self):
        # Compute BOPTEST core kpis
        kpis = requests.get('{0}/kpi/{1}'.format(self.url, self.testid)).json()['payload']

        ener_coef = 1.0
        temp_coef = 1.0
        
        # todo: search for best reward function
        reward = - ((kpis['tdis_tot']*temp_coef) + (kpis['ener_tot']*ener_coef))

        # Record current objective integrand for next evaluation
        self.objective_integrand = reward
        return reward

In [8]:
feb15 = 47 * 24*3600                    # Jan 1 → Feb 15
episode_length = 7 * 24*3600            # 1 week episodes
max_start = feb15 - episode_length      # last valid start time

In [9]:
# Instantiate environment
env = BoptestGymEnvCustomReward(url       = url,
                    testcase              = 'bestest_hydronic_heat_pump',
                    actions               = ['oveTSet_u'],
                    observations          = {'reaTZon_y':(280.,310.),
                                             'weaSta_reaWeaTDryBul_y':(265.,303.),
                                             'weaSta_reaWeaHDirNor_y':(0.,862.)
                                            },
                    random_start_time     = True,
                    start_time            = 1*24*3600,
                    max_episode_length    = 7 * 24*3600 ,
                    excluding_periods     = [(max_start, 365*24*3600)],
                    warmup_period         = 24*3600,
                    predictive_period     = 0,
                    regressive_period     = 4*1800,
                    step_period           = 1800)

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [18]:
from boptestGymEnv import NormalizedObservationWrapper
from boptestGymEnv import DiscretizedActionWrapper
env = NormalizedObservationWrapper(env)
env = DiscretizedActionWrapper(env,n_bins_act=2)

In [20]:
import os
log_path = os.path.join( "local_files", "Logs")

In [22]:
import torch

In [24]:
policy_kwargs = dict(
    net_arch=[64, 8],  
    activation_fn=torch.nn.ReLU
)

model = DQN('MlpPolicy',
            env,
            verbose=1,
            gamma=0.99,
            learning_rate=0.01,
            batch_size=64,
            buffer_size=20000,
            learning_starts=0,
            train_freq=1,
            target_update_interval=1000,
            tau=1.0,
            gradient_steps=1,
            exploration_fraction=0.1,          #ε-greedy?
            exploration_initial_eps=1.0,
            exploration_final_eps=0.05,
            policy_kwargs=policy_kwargs,
            tensorboard_log= log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [26]:
episodes_no = 25 #can be 25 or 50 as the paper 
total_timesteps = 336 * episodes_no #336 = episode_length / step_period (604800 / 1800 = 336)
model.learn(total_timesteps= total_timesteps) 

Logging to local_files\Logs\DQN_9
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 336       |
|    ep_rew_mean      | -2.17e+04 |
|    exploration_rate | 0.05      |
| time/               |           |
|    episodes         | 4         |
|    fps              | 12        |
|    time_elapsed     | 105       |
|    total_timesteps  | 1344      |
| train/              |           |
|    learning_rate    | 0.01      |
|    loss             | 70.3      |
|    n_updates        | 1319      |
-----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 336       |
|    ep_rew_mean      | -3.32e+04 |
|    exploration_rate | 0.05      |
| time/               |           |
|    episodes         | 8         |
|    fps              | 13        |
|    time_elapsed     | 206       |
|    total_timesteps  | 2688      |
| train/              |           |
|    learning_rate    | 0.01  

<stable_baselines3.dqn.dqn.DQN at 0x2d78eb480b0>

In [28]:
model.save("dqn_baseline")

In [30]:
env.stop()

In [34]:
%tensorboard --logdir ./local_files/Logs --port 6007