### Training of a simple policy using the custom reward function

In [1]:
import numpy as np 
import pandas as pd

import math
import sys
import os

In [2]:
import gym
gym.__version__

'0.21.0'

In [3]:
from citylearn.citylearn import CityLearnEnv

In [4]:
from citylearn.wrappers import NormalizedObservationWrapper, StableBaselines3Wrapper

In [5]:
from stable_baselines3 import SAC

2024-01-02 00:47:01.416649: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [6]:
from data.schemas.warm_up.custom_reward import CustomReward

#### Create the environment

A function and a wrapper class as given in the local evaluation script provided by the challenge

In [7]:
class WrapperEnv:
    """
    Env to wrap provide Citylearn Env data without providing full env
    Preventing attribute access outside of the available functions
    """
    def __init__(self, env_data):
        self.observation_names = env_data['observation_names']
        self.action_names = env_data['action_names']
        self.observation_space = env_data['observation_space']
        self.action_space = env_data['action_space']
        self.time_steps = env_data['time_steps']
        self.seconds_per_time_step = env_data['seconds_per_time_step']
        self.random_seed = env_data['random_seed']
        self.buildings_metadata = env_data['buildings_metadata']
        self.episode_tracker = env_data['episode_tracker']
    
    def get_metadata(self):
        return {'buildings': self.buildings_metadata}

def create_citylearn_env(schema_path, reward_function, central_agent):
    env = CityLearnEnv(schema=schema_path, reward_function=reward_function, central_agent=central_agent)

    env_data = dict(
        observation_names = env.observation_names,
        action_names = env.action_names,
        observation_space = env.observation_space,
        action_space = env.action_space,
        time_steps = env.time_steps,
        random_seed = None,
        episode_tracker = None,
        seconds_per_time_step = None,
        buildings_metadata = env.get_metadata()['buildings']
    )

    wrapper_env = WrapperEnv(env_data)
    return env, wrapper_env

create environment

In [8]:
schema_path = os.path.join("./data/", "schemas/warm_up/schema.json")

env, wrapper_env = create_citylearn_env(schema_path, CustomReward, True)

In [9]:
# env.get_metadata()
# env.reward_function.env_metadata

Prepare for SB3

In [10]:
env = NormalizedObservationWrapper(env)
env = StableBaselines3Wrapper(env)

Create SAC model

In [11]:
model = SAC("MlpPolicy", env, tensorboard_log="./tensorboard_logs/")
model.load("custom_reward_SAC1.zip")
model.set_env(env)



Train

In [12]:
# model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 1, 
            # log_interval = 1)

Evaluate (20 epoch training)

In [13]:
# observations = env.reset()

# while not env.done:
#     actions, _ = model.predict(observations, deterministic=True)
#     observations, _, _, _ = env.step(actions)

# kpis = env.evaluate()
# kpis = kpis.pivot(index='cost_function', columns='name', values='value')
# kpis = kpis.dropna(how='all')
# display(kpis)

Train

In [14]:
model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 2000, 
            log_interval = 1)
model.save("custom_reward_SAC")

Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor
Load factor


Evaluate 

In [15]:
observations = env.reset()

while not env.done:
    actions, _ = model.predict(observations, deterministic=True)
    observations, _, _, _ = env.step(actions)

kpis = env.evaluate()
kpis = kpis.pivot(index='cost_function', columns='name', values='value')
kpis = kpis.dropna(how='all')
display(kpis)

name,Building_1,Building_2,Building_3,District
cost_function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
annual_normalized_unserved_energy_total,0.035915,0.027091,0.045253,0.036086
annual_peak_average,,,,0.925765
carbon_emissions_total,0.829838,1.223197,0.925802,0.992945
cost_total,0.811263,1.245534,0.893074,0.98329
daily_one_minus_load_factor_average,,,,0.882342
daily_peak_average,,,,0.861269
discomfort_delta_average,0.164135,-0.654444,-0.015084,-0.168465
discomfort_delta_maximum,5.646681,5.831661,3.918825,5.132389
discomfort_delta_minimum,-8.509087,-11.713064,-3.511442,-7.911198
discomfort_proportion,0.079944,0.26306,0.033278,0.125427


In [16]:
model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 500, 
            log_interval = 1)
model.save("custom_reward_SAC")

In [None]:
model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 500, 
            log_interval = 1)
model.save("custom_reward_SAC3")

In [None]:
model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 500, 
            log_interval = 1)
model.save("custom_reward_SAC4")

In [None]:
model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 500, 
            log_interval = 1)
model.save("custom_reward_SAC5")

In [None]:
model.learn(total_timesteps = env.get_metadata()["simulation_time_steps"] * 500, 
            log_interval = 1)
model.save("custom_reward_SAC6")