### Training of a simple policy using the custom reward function

In [1]:
import numpy as np 
import pandas as pd

import math
import sys
import os

In [2]:
import gym
gym.__version__

'0.21.0'

In [3]:
from citylearn.citylearn import CityLearnEnv

In [4]:
from citylearn.wrappers import NormalizedObservationWrapper, StableBaselines3Wrapper

In [5]:
from stable_baselines3 import SAC

2023-12-27 22:08:00.228805: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from data.schemas.warm_up.custom_reward_relaxed import CustomReward

#### Create the environment

A function and a wrapper class as given in the local evaluation script provided by the challenge

In [7]:
class WrapperEnv:
    """
    Env to wrap provide Citylearn Env data without providing full env
    Preventing attribute access outside of the available functions
    """
    def __init__(self, env_data):
        self.observation_names = env_data['observation_names']
        self.action_names = env_data['action_names']
        self.observation_space = env_data['observation_space']
        self.action_space = env_data['action_space']
        self.time_steps = env_data['time_steps']
        self.seconds_per_time_step = env_data['seconds_per_time_step']
        self.random_seed = env_data['random_seed']
        self.buildings_metadata = env_data['buildings_metadata']
        self.episode_tracker = env_data['episode_tracker']
    
    def get_metadata(self):
        return {'buildings': self.buildings_metadata}

def create_citylearn_env(schema_path, reward_function, central_agent):
    env = CityLearnEnv(schema=schema_path, reward_function=reward_function, central_agent=central_agent)

    env_data = dict(
        observation_names = env.observation_names,
        action_names = env.action_names,
        observation_space = env.observation_space,
        action_space = env.action_space,
        time_steps = env.time_steps,
        random_seed = None,
        episode_tracker = None,
        seconds_per_time_step = None,
        buildings_metadata = env.get_metadata()['buildings']
    )

    wrapper_env = WrapperEnv(env_data)
    return env, wrapper_env

create environment

In [8]:
schema_path = os.path.join("./data/", "schemas/warm_up/schema.json")

env, wrapper_env = create_citylearn_env(schema_path, CustomReward, True)

In [9]:
# env.get_metadata()
# env.reward_function.env_metadata

Prepare for SB3

In [10]:
env = NormalizedObservationWrapper(env)
env = StableBaselines3Wrapper(env)

Create SAC model

In [11]:
model = SAC("MlpPolicy", env, tensorboard_log="./tensorboard_logs/")



Train

In [12]:
model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 20, 
            log_interval = 1)

<stable_baselines3.sac.sac.SAC at 0x7fe70c805e10>

Evaluate (20 epoch training)

In [13]:
observations = env.reset()

while not env.done:
    actions, _ = model.predict(observations, deterministic=True)
    observations, _, _, _ = env.step(actions)

kpis = env.evaluate()
kpis = kpis.pivot(index='cost_function', columns='name', values='value')
kpis = kpis.dropna(how='all')
display(kpis)

name,Building_1,Building_2,Building_3,District
cost_function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
annual_normalized_unserved_energy_total,0.056063,0.01463,0.04171,0.037468
annual_peak_average,,,,0.868995
carbon_emissions_total,0.635524,0.926807,0.858671,0.807001
cost_total,0.610627,0.932555,0.813487,0.785556
daily_one_minus_load_factor_average,,,,1.111224
daily_peak_average,,,,0.948848
discomfort_delta_average,2.692957,0.354607,-0.039945,1.00254
discomfort_delta_maximum,10.239315,9.504707,5.686996,8.477006
discomfort_delta_minimum,-8.649389,-8.004427,-3.987715,-6.88051
discomfort_proportion,0.723703,0.54291,0.299501,0.522038


Train for 500 epochs

In [14]:
model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 500, 
            log_interval = 1)

Evaluate 500 epochs

In [None]:
observations = env.reset()

while not env.done:
    actions, _ = model.predict(observations, deterministic=True)
    observations, _, _, _ = env.step(actions)

kpis = env.evaluate()
kpis = kpis.pivot(index='cost_function', columns='name', values='value')
kpis = kpis.dropna(how='all')
display(kpis)

name,Building_1,Building_2,Building_3,District
cost_function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
annual_normalized_unserved_energy_total,0.036226,0.054753,0.027661,0.039547
annual_peak_average,,,,0.763585
carbon_emissions_total,0.36432,0.397953,0.488159,0.416811
cost_total,0.350832,0.378128,0.475645,0.401535
daily_one_minus_load_factor_average,,,,1.205745
daily_peak_average,,,,0.590696
discomfort_delta_average,9.248469,6.948974,7.874515,8.023986
discomfort_delta_maximum,16.685581,14.765364,13.928785,15.126577
discomfort_delta_minimum,-0.123569,-0.581318,-0.37221,-0.359032
discomfort_proportion,0.981767,0.977612,0.978369,0.97925
