### Training of a simple policy using the custom reward function

In [1]:
import numpy as np 
import pandas as pd

import math
import sys
import os

In [2]:
import gym
gym.__version__

'0.21.0'

In [3]:
from citylearn.citylearn import CityLearnEnv

In [4]:
from citylearn.wrappers import NormalizedObservationWrapper, StableBaselines3Wrapper

In [5]:
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat

In [6]:
from custom_reward import CustomReward

#### Create the environment

The wrapper class and the factory function below are taken from the local evaluation script provided by the challenge.

In [7]:
class WrapperEnv:
    """
    Container for a subset of CityLearn environment data.

    It lets that data be handed out without providing the full environment,
    so that nothing beyond the attributes and functions defined here is
    accessible.
    """
    def __init__(self, env_data):
        """
        Copy the exposed entries of ``env_data`` onto the instance.

        Args:
            env_data: mapping that must contain every key listed in
                ``exposed_fields``; a missing key raises ``KeyError``.
        """
        exposed_fields = (
            'observation_names',
            'action_names',
            'observation_space',
            'action_space',
            'time_steps',
            'seconds_per_time_step',
            'random_seed',
            'buildings_metadata',
            'episode_tracker',
        )
        # Each key becomes an instance attribute of the same name.
        for field in exposed_fields:
            setattr(self, field, env_data[field])

    def get_metadata(self):
        """Return the stored buildings metadata under the ``'buildings'`` key."""
        metadata = {'buildings': self.buildings_metadata}
        return metadata

def create_citylearn_env(schema_path, reward_function, central_agent):
    """
    Build a CityLearn environment together with its restricted wrapper.

    Args:
        schema_path: forwarded to ``CityLearnEnv`` as ``schema``.
        reward_function: forwarded to ``CityLearnEnv`` as ``reward_function``.
        central_agent: forwarded to ``CityLearnEnv`` as ``central_agent``.

    Returns:
        A tuple ``(env, wrapper_env)``: the full ``CityLearnEnv`` instance and
        a ``WrapperEnv`` built from a snapshot of its data.
    """
    full_env = CityLearnEnv(
        schema=schema_path,
        reward_function=reward_function,
        central_agent=central_agent,
    )

    snapshot = {
        'observation_names': full_env.observation_names,
        'action_names': full_env.action_names,
        'observation_space': full_env.observation_space,
        'action_space': full_env.action_space,
        'time_steps': full_env.time_steps,
        # These three are not read from the environment; the wrapper always
        # receives None for them.
        'random_seed': None,
        'episode_tracker': None,
        'seconds_per_time_step': None,
        'buildings_metadata': full_env.get_metadata()['buildings'],
    }

    return full_env, WrapperEnv(snapshot)

Create the environment

In [8]:
# Location of the CityLearn schema, relative to the notebook's working directory.
schema_path = os.path.join("./data/", "schema.json")

# Build the environment with the custom reward and a single central agent.
env, wrapper_env = create_citylearn_env(
    schema_path=schema_path,
    reward_function=CustomReward,
    central_agent=True,
)

In [9]:
# env.get_metadata()
# env.reward_function.env_metadata

Prepare for SB3

In [10]:
# Apply the observation-normalising wrapper first, then the SB3 adapter on top.
# NOTE: re-running this cell wraps the already-wrapped `env` again.
env = StableBaselines3Wrapper(NormalizedObservationWrapper(env))

Create the SAC model (here: load a previously saved checkpoint and attach the environment to it)

In [11]:
# Resume from a saved checkpoint and attach the wrapped environment to it.
# To start from an untrained policy instead, replace the two lines below with:
#     model = SAC("MlpPolicy", env, tensorboard_log="./tensorboard_logs/")
model = SAC.load(path="models/custom_reward_SAC6.zip")
model.set_env(env=env)

  return torch._C._cuda_getDeviceCount() > 0




Create custom callback to track reward values

In [12]:
class CustomCallback(BaseCallback):
    """
    Custom callback for plotting additional reward values in tensorboard.

    At the end of every rollout, the first element of each attribute named in
    ``REWARD_COMPONENTS`` is read from the ``reward_function`` of the first
    training environment, negated, and recorded with the logger under the
    attribute's own name.
    """

    # Attribute names looked up on the reward function object. Each name is
    # also used verbatim as the logger key.
    REWARD_COMPONENTS = (
        "comfort", "emissions", "grid", "resilience",
        "u", "g", "r", "d", "l", "a", "m", "s",
    )

    def __init__(self, verbose = 0):
        """
        Args:
            verbose: verbosity level forwarded to ``BaseCallback``.
        """
        super().__init__(verbose)

    def _on_rollout_end(self) -> None:
        """Record the negated first entry of every reward component."""
        # Look the reward function up once per rollout instead of once per
        # recorded value; index 0 selects the first training environment.
        reward_function = self.training_env.get_attr("reward_function")[0]

        for name in self.REWARD_COMPONENTS:
            self.logger.record(name, -getattr(reward_function, name)[0])

    def _on_step(self) -> bool:
        """
        Do no per-step work.

        Returns:
            Always ``True``. Stable-Baselines3 documents that a ``False``
            return from ``_on_step`` aborts training early.
        """
        return True


Train

In [13]:
# Train for 30 times the environment's "simulation_time_steps" value, dumping
# logs at every log interval and recording the extra reward values through
# CustomCallback.
simulation_time_steps = env.get_metadata()["simulation_time_steps"]
model.learn(
    total_timesteps=simulation_time_steps * 30,
    log_interval=1,
    callback=CustomCallback(),
)
# Saving is left disabled; to persist the trained policy, run:
#     model.save("models/custom_reward_SAC")

  logger.warn(


---------------------------------------------------
comfort 0.0
emissions 0.19550224846690256
grid 0.4904967428438785
resilience 0.0
u 0.0
g 1.9550224846690254
r 0.0
d 0.06516741374190983
l 1.0
a 5.474789157509804
m 0.0
s 0.0
---------------------------------------------------
comfort 0.0
emissions 0.38423308956286495
grid 0.7722571365086119
resilience 0.0
u 0.0
g 3.842330895628649
r 2.727336734468226
d 0.06516741374190983
l 2.029468514394887
a 5.474789157509804
m 0.0
s 0.0
---------------------------------------------------
comfort 0.0
emissions 0.6278069031945772
grid 1.1266457410227924
resilience 0.0
u 0.0
g 6.278069031945773
r 6.278397029684458
d 0.06516741374190983
l 2.981717059471728
a 5.696661710739136
m 0.0
s 0.0
---------------------------------------------------
comfort 0.0
emissions 0.7099372974252057
grid 2.7792146307143737
resilience 0.0
u 0.0
g 7.0993729742520575
r 27.043211119814927
d 0.06516741374190983
l 4.251154831895683
a 5.696661710739136
m 0.0
s 0.0
---------------

KeyboardInterrupt: 

Evaluate 

In [None]:
# Step the environment with deterministic policy actions until it reports done.
observations = env.reset()

while not env.done:
    actions, _ = model.predict(observations, deterministic=True)
    observations, _, _, _ = env.step(actions)

# Reshape the KPI table to one row per cost function and one column per name,
# then drop the rows that contain no value at all.
kpis = (
    env.evaluate()
    .pivot(index='cost_function', columns='name', values='value')
    .dropna(how='all')
)
display(kpis)