### Training of a simple policy using the custom reward function

In [30]:
import numpy as np 
import pandas as pd

import math
import sys
import os

In [31]:
import gym
gym.__version__

'0.21.0'

In [32]:
from citylearn.citylearn import CityLearnEnv

In [33]:
from citylearn.wrappers import NormalizedObservationWrapper, StableBaselines3Wrapper

In [34]:
from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback

In [35]:
from custom_reward import CustomReward

#### Create the environment

A function and a wrapper class as given in the local evaluation script provided by the challenge

In [36]:
class WrapperEnv:
    """
    Lightweight holder for a fixed subset of CityLearn environment data.

    The constructor copies a known list of entries from ``env_data`` onto
    attributes of the same name, so consumers can be given this object
    instead of the full environment.
    """

    def __init__(self, env_data):
        """Copy the expected entries of ``env_data`` onto this instance.

        Args:
            env_data: Mapping that provides every key listed below. A missing
                key raises ``KeyError``; additional keys are ignored.
        """
        for field in (
            'observation_names',
            'action_names',
            'observation_space',
            'action_space',
            'time_steps',
            'seconds_per_time_step',
            'random_seed',
            'buildings_metadata',
            'episode_tracker',
        ):
            setattr(self, field, env_data[field])

    def get_metadata(self):
        """Return a dict holding the buildings metadata under ``'buildings'``."""
        buildings = self.buildings_metadata
        return {'buildings': buildings}

def create_citylearn_env(schema_path, reward_function, central_agent):
    """Build a CityLearn environment together with a reduced data wrapper.

    Args:
        schema_path: Value forwarded to ``CityLearnEnv`` as ``schema``.
        reward_function: Value forwarded to ``CityLearnEnv`` as ``reward_function``.
        central_agent: Value forwarded to ``CityLearnEnv`` as ``central_agent``.

    Returns:
        A ``(env, wrapper_env)`` tuple: the full environment and a
        ``WrapperEnv`` populated from it.
    """
    env = CityLearnEnv(
        schema=schema_path,
        reward_function=reward_function,
        central_agent=central_agent,
    )

    # Only these fields are exposed through the wrapper; the seed, episode
    # tracker and step duration are deliberately passed as None.
    env_data = {
        'observation_names': env.observation_names,
        'action_names': env.action_names,
        'observation_space': env.observation_space,
        'action_space': env.action_space,
        'time_steps': env.time_steps,
        'random_seed': None,
        'episode_tracker': None,
        'seconds_per_time_step': None,
        'buildings_metadata': env.get_metadata()['buildings'],
    }

    return env, WrapperEnv(env_data)

Create the environment

In [37]:
# Path of the schema file handed to the environment factory.
schema_path = os.path.join("./data/", "schema.json")

# Build the full environment (with the custom reward and a central agent)
# and its reduced data wrapper.
env, wrapper_env = create_citylearn_env(
    schema_path=schema_path,
    reward_function=CustomReward,
    central_agent=True,
)

In [38]:
# env.get_metadata()
# env.reward_function.env_metadata

Prepare the environment for Stable-Baselines3 (SB3)

In [39]:
# Apply observation normalization first, then the Stable-Baselines3 adapter on top.
# NOTE: re-running this cell wraps the already-wrapped `env` again.
env = StableBaselines3Wrapper(NormalizedObservationWrapper(env))

Load a previously saved SAC model (the commented-out line shows how to create a new one instead)

In [40]:
# Alternative: start from a freshly initialized model instead of a saved one.
# model = SAC("MlpPolicy", env, tensorboard_log="./tensorboard_logs/")

# Restore the saved SAC model from disk, then attach the wrapped environment to it.
saved_model_path = "models/custom_reward_SAC6.zip"
model = SAC.load(saved_model_path)
model.set_env(env)

Create custom callback to track reward values

In [41]:
class CustomCallback(BaseCallback):
    """
    Callback that accumulates reward-function components and records them.

    On every step the tracked attributes of ``env.reward_function`` are added
    to running totals stored on this callback under the same names; at the end
    of each rollout the totals are written to the callback logger (e.g. for
    display in TensorBoard) and set back to zero.

    NOTE(review): the totals span a single rollout. With off-policy
    algorithms a rollout may be very short (possibly one step) — confirm that
    this aggregation window is the intended one.
    """

    # Names of the reward-function attributes that are tracked; each one is
    # also the name of the running total on this callback and of the log key.
    _TRACKED = (
        "comfort", "emissions", "grid", "resilience",
        "u", "g", "r", "d", "l", "a", "m", "s",
    )

    def __init__(self, env, verbose = 0):
        """Store ``env`` (read via ``env.reward_function``) and zero the totals."""
        super().__init__(verbose)
        self.env = env
        self.reset()

    def reset(self):
        """Set every running total back to ``0.``."""
        for name in self._TRACKED:
            setattr(self, name, 0.)

    def _on_rollout_end(self) -> None:
        """Record each running total under its own name, then clear the totals."""
        for name in self._TRACKED:
            self.logger.record(name, getattr(self, name))
        self.reset()

    def _on_step(self) -> bool:
        """Add the reward function's current component values to the totals.

        Returns:
            Always ``True``.
        """
        reward_function = self.env.reward_function
        for name in self._TRACKED:
            # Read / in-place add / write back, mirroring ``self.<name> += value``.
            total = getattr(self, name)
            total += getattr(reward_function, name)
            setattr(self, name, total)

        return True


Train (currently disabled: the training cell below is commented out)

In [42]:
# model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 20, 
#             log_interval = 1)
# # model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 20, 
# #             log_interval = 1,
# #             callback = CustomCallback(env))
# model.save("models/custom_reward_SAC")

Evaluate 

In [43]:
# Roll the environment forward with deterministic predictions from the model
# until the environment reports that it is done.
observations = env.reset()

while not env.done:
    actions, _state = model.predict(observations, deterministic=True)
    observations, _reward, _done, _info = env.step(actions)

# Reshape the KPI records into a cost_function x name table and drop rows
# that contain no values at all.
kpis = (
    env.evaluate()
    .pivot(index='cost_function', columns='name', values='value')
    .dropna(how='all')
)
display(kpis)

name,Building_1,Building_2,Building_3,District
cost_function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
annual_normalized_unserved_energy_total,0.044407,0.027406,0.037853,0.036555
annual_peak_average,,,,0.950379
carbon_emissions_total,0.830986,1.171531,0.883571,0.962029
cost_total,0.813885,1.16412,0.855781,0.944595
daily_one_minus_load_factor_average,,,,0.741901
daily_peak_average,,,,0.735694
discomfort_delta_average,-0.074267,-0.16001,0.275628,0.013784
discomfort_delta_maximum,5.654882,5.819633,4.061457,5.178658
discomfort_delta_minimum,-4.684221,-11.568905,-2.64386,-6.298995
discomfort_proportion,0.014025,0.117537,0.004992,0.045518
