In [1]:
!pip install stable-baselines3==2.0.0 
!pip install numpy
!pip install tensorflow
%load_ext tensorboard
import tensorflow as tf



In [2]:
import numpy as np
import sys
import matplotlib.pyplot as plt
from IPython.display import clear_output
from stable_baselines3 import DQN
import requests

In [3]:
clear_output(wait=True)

In [4]:
url_api = 'https://api.boptest.net'
url = "http://localhost:80"

In [5]:
sys.path.insert(0,'boptestGym')
from boptestGymEnv import BoptestGymEnv

In [6]:
class BoptestGymEnvCustomRewardEnerBrain(BoptestGymEnv):
    def get_reward(self):
        '''
        'tdis_tot': temp discomfort
         'idis_tot': 0,
         'ener_tot': total energy
         'cost_tot': total cost
         'emis_tot': total emission
         'pele_tot': defines the HVAC peak electrical demand.
         'pgas_tot': defines the HVAC peak gas demand.
         'pdih_tot': defines the HVAC peak district heating demand.
         'time_rat': defines the average ratio between the controller computation time and the test simulation control step. 
         The controller computation time is measured as the time between two emulator advances
        '''
        # Compute BOPTEST core kpis
        kpis = requests.get('{0}/kpi/{1}'.format(self.url, self.testid)).json()['payload']

        ener_rew = 0
        if 0< kpis['ener_tot'] <1:
            ener_rew = 1 - (kpis['ener_tot']*kpis['ener_tot'])
        else:
            ener_rew = - kpis['ener_tot'] * kpis['ener_tot']
        # todo: search for best reward function
        reward = - (kpis['tdis_tot'] / 1000) + ener_rew

        # Record current objective integrand for next evaluation
        self.objective_integrand = reward
        return reward

In [7]:
class BoptestGymEnvCustomReward(BoptestGymEnv):
    def get_reward(self):
        # Compute BOPTEST core kpis
        kpis = requests.get('{0}/kpi/{1}'.format(self.url, self.testid)).json()['payload']

        ener_coef = 1.0
        temp_coef = 1.0
        
        # todo: search for best reward function
        reward = - ((kpis['tdis_tot']*temp_coef) + (kpis['ener_tot']*ener_coef))

        # Record current objective integrand for next evaluation
        self.objective_integrand = reward
        return reward

In [8]:
import gymnasium as gym
from gymnasium.spaces import Box

class AddPriceWrapper(gym.ObservationWrapper):
    def __init__(self, env, price_value: float = 1.0):
        super().__init__(env)
        self.price_value = float(price_value)

        old_space: Box = env.observation_space
        assert isinstance(old_space, Box), f"Expected Box, got {type(old_space)}"

        # Extend low/high by one dimension for the price feature
        low  = np.concatenate([old_space.low,  np.array([0.0], dtype=np.float32)])
        high = np.concatenate([old_space.high, np.array([10.0], dtype=np.float32)])

        self.observation_space = Box(
            low=low,
            high=high,
            dtype=np.float32
        )

    def observation(self, obs: np.ndarray) -> np.ndarray:
        """
        This is called automatically by ObservationWrapper on both reset() and step().
        """
        price = np.array([self.price_value], dtype=np.float32)
        return np.concatenate([obs, price], axis=-1)


In [9]:
feb15 = 47 * 24*3600                    # Jan 1 → Feb 15
episode_length = 7 * 24*3600            # 1 week episodes
max_start = feb15 - episode_length      # last valid start time

In [10]:
print(type(feb15))
print(type(episode_length))
print(type(max_start))

<class 'int'>
<class 'int'>
<class 'int'>


In [11]:
# Instantiate environment
env = BoptestGymEnvCustomReward(url       = url,
                    testcase              = 'bestest_hydronic_heat_pump',
                    actions               = ['oveHeaPumY_u'],
                    observations          = {'reaTZon_y':(280.,310.),
                                             'weaSta_reaWeaTDryBul_y':(265.,303.),
                                             'weaSta_reaWeaHDirNor_y':(0.,862.)
                                            },
                    random_start_time     = True,
                    start_time            = 1*24*3600,
                    max_episode_length    = 7 * 24*3600 ,
                    excluding_periods     = [(max_start, 365*24*3600)],
                    warmup_period         = 24*3600,
                    predictive_period     = 0,
                    regressive_period     = 4*1800,
                    step_period           = 1800)

  gym.logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


In [12]:
env = AddPriceWrapper(env, price_value=1.0)

In [13]:
from boptestGymEnv import NormalizedObservationWrapper
from boptestGymEnv import DiscretizedActionWrapper
env = NormalizedObservationWrapper(env)
env = DiscretizedActionWrapper(env,n_bins_act=10)

In [14]:
import os
log_path = os.path.join( "local_files", "Logs")

In [15]:
import torch

In [16]:
policy_kwargs = dict(
    net_arch=[64, 8],  
    activation_fn=torch.nn.ReLU
)

model = DQN('MlpPolicy',
            env,
            verbose=1,
            gamma=0.99,
            learning_rate=0.01,
            batch_size=64,
            buffer_size=20000,
            learning_starts=24,
            train_freq=1,
            target_update_interval=1000,
            tau=1.0,
            gradient_steps=1,
            exploration_fraction=0.1,          #ε-greedy?
            exploration_initial_eps=1.0,
            exploration_final_eps=0.05,
            policy_kwargs=policy_kwargs,
            tensorboard_log= log_path)

Using cuda device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.


In [17]:
episodes_no = 25 #can be 25 or 50 as the paper 
total_timesteps = 336 * episodes_no #336 = episode_length / step_period (604800 / 1800 = 336)
model.learn(total_timesteps= total_timesteps) 

Logging to local_files\Logs\DQN_8
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 336      |
|    ep_rew_mean      | -1.1e+04 |
|    exploration_rate | 0.05     |
| time/               |          |
|    episodes         | 4        |
|    fps              | 12       |
|    time_elapsed     | 110      |
|    total_timesteps  | 1344     |
| train/              |          |
|    learning_rate    | 0.01     |
|    loss             | 8.56     |
|    n_updates        | 1319     |
----------------------------------
-----------------------------------
| rollout/            |           |
|    ep_len_mean      | 336       |
|    ep_rew_mean      | -1.78e+04 |
|    exploration_rate | 0.05      |
| time/               |           |
|    episodes         | 8         |
|    fps              | 12        |
|    time_elapsed     | 215       |
|    total_timesteps  | 2688      |
| train/              |           |
|    learning_rate    | 0.01      |
|    loss

<stable_baselines3.dqn.dqn.DQN at 0x1bac2682f60>

In [18]:
model.save("dqn_baseline")

In [19]:
env.stop()

In [61]:
%tensorboard --logdir ./local_files/Logs --port 6008

kill: 16956: No such process


In [21]:
buffer_path = os.path.join( "local_files", "Buffer")
model.save_replay_buffer(buffer_path)

In [51]:
import pickle
import pandas as pd

with open("local_files/Buffer.pkl", "rb") as f:
    buf = pickle.load(f)


obs = buf.observations.reshape(len(buf.observations), -1)
next_obs = buf.next_observations.reshape(len(buf.next_observations), -1)
actions = buf.actions.reshape(len(buf.actions), -1)
rewards = buf.rewards.reshape(len(buf.rewards), -1)
dones = buf.dones.reshape(len(buf.dones), -1)

# Build dataframe
df = pd.DataFrame(
    np.hstack([obs, actions, rewards, dones, next_obs]),
    columns=[
        *[f"obs_{i}" for i in range(obs.shape[1])],
        "action",
        "reward",
        "done",
        *[f"next_obs_{i}" for i in range(next_obs.shape[1])]
    ]
)

# Save CSV
csv_path = "local_files/buffer_export.csv"
df.to_csv(csv_path, index=False)