# Comparison of SB3 Soft Actor-Critic CTCE trained agents with different reward functions
### Trained for 3,960,000 steps (720 steps per episode × 5,500 episodes)

In [75]:
import numpy as np 
import pandas as pd

import math
import sys
import os

import gym
gym.__version__

from citylearn.citylearn import CityLearnEnv
from citylearn.wrappers import NormalizedObservationWrapper, StableBaselines3Wrapper

from stable_baselines3 import SAC
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat

from training_simple_policy.custom_reward import CustomReward

import seaborn as sns
import matplotlib.pyplot as plt

### Initialize environment

In [26]:
def makeEnv():
    """Build the CityLearn environment wrapped for use with Stable-Baselines3.

    The environment is created from ``schema_edited.json`` in the
    ``training_simple_policy/data`` folder, uses ``CustomReward`` as reward
    function and a central agent.

    Returns:
        The ``CityLearnEnv`` wrapped first in ``NormalizedObservationWrapper``
        and then in ``StableBaselines3Wrapper``.
    """
    schema_file = os.path.join("./training_simple_policy/data/", "schema_edited.json")

    base_env = CityLearnEnv(
        schema=schema_file,
        reward_function=CustomReward,
        central_agent=True,
    )

    # Observation normalisation is applied before the SB3 adapter.
    return StableBaselines3Wrapper(NormalizedObservationWrapper(base_env))

# Single environment instance; it is attached to every loaded model and
# reused for all evaluation roll-outs below.
env = makeEnv()

### Load models

In [27]:
# Agents trained with the given reward functions: load every checkpoint
# first, then attach the shared evaluation environment to each model.
model_energy_reward = SAC.load(
    "habrok_training_results/training_energy_reward3/models/energy_reward_SAC.zip"
)
model_comfort_reward = SAC.load(
    "habrok_training_results/training_comfort_reward3/models/comfort_reward_SAC.zip"
)
model_solar_reward = SAC.load(
    "habrok_training_results/training_solar_reward3/models/solar_reward_SAC.zip"
)
model_solarcomfort_reward = SAC.load(
    "habrok_training_results/training_solarcomfort_reward3/models/solarcomfort_reward_SAC.zip"
)

model_energy_reward.set_env(env)
model_comfort_reward.set_env(env)
model_solar_reward.set_env(env)
model_solarcomfort_reward.set_env(env)

# Agent trained with the custom reward function (currently disabled).
# NOTE(review): the path below points at the comfort-reward results
# directory -- confirm the intended location before re-enabling.
# model_custom_reward = SAC.load("habrok_training_results/training_comfort_reward3/models/custom_reward_SAC.zip")
# model_custom_reward.set_env(env)



### Evaluate

In [64]:
def evaluate(model, env, name):
    """Run ``model`` on ``env`` for one episode and return its KPI table.

    Args:
        model: Object exposing ``predict(observations, deterministic=True)``
            that returns an ``(actions, state)`` pair.
        env: Environment exposing ``reset()``, ``step(actions)``, a ``done``
            attribute and ``evaluate()``; the latter must return a frame with
            ``cost_function``, ``name`` and ``value`` columns.
        name: Label written into the ``"Reward function"`` column.

    Returns:
        A frame indexed by ``cost_function`` with one column per ``name``
        value, rows that are entirely NaN removed, and ``"Reward function"``
        inserted as the first column.
    """
    obs = env.reset()

    # Act deterministically until the environment reports the episode is over.
    while True:
        if env.done:
            break
        chosen_actions, _state = model.predict(obs, deterministic=True)
        obs, _reward, _done, _info = env.step(chosen_actions)

    # Long-format KPIs -> one row per cost function, one column per name.
    table = (
        env.evaluate()
        .pivot(index='cost_function', columns='name', values='value')
        .dropna(how='all')
    )
    table.insert(0, "Reward function", name)

    return table

In [65]:
# One evaluation roll-out per trained agent, all on the same shared environment.
kpis_energy = evaluate(model=model_energy_reward, env=env, name="Energy reward")
kpis_comfort = evaluate(model=model_comfort_reward, env=env, name="Comfort reward")
kpis_solar = evaluate(model=model_solar_reward, env=env, name="Solar reward")
kpis_solarcomfort = evaluate(
    model=model_solarcomfort_reward,
    env=env,
    name="Comfort+solar reward",
)
# Custom-reward agent is disabled together with its model above.
# kpis_custom = evaluate(model_custom_reward, env, "Custom reward")

In [73]:
# Stack the per-agent KPI tables into one frame. ``pd.concat`` replaces the
# chained ``DataFrame.append`` calls, which were deprecated in pandas 1.4 and
# removed in pandas 2.0 (where they raise AttributeError).
kpis = pd.concat([kpis_energy, kpis_comfort, kpis_solar, kpis_solarcomfort])
# To include the custom-reward agent, add ``kpis_custom`` to the list above.

# Raise the display limit so the full table is shown without truncation.
pd.set_option('display.max_rows', 1000)

# "cost_function" is the index level name, "Reward function" a regular column;
# sorting on both groups the agents under each KPI.
kpis = kpis.sort_values(["cost_function", "Reward function"])
display(kpis)

name,Reward function,Building_1,Building_2,Building_3,District
cost_function,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
annual_normalized_unserved_energy_total,Comfort reward,0.031561,0.015283,0.03001,0.025618
annual_normalized_unserved_energy_total,Comfort+solar reward,0.032479,0.033413,0.02909,0.031661
annual_normalized_unserved_energy_total,Energy reward,0.082078,0.092174,0.047404,0.073886
annual_normalized_unserved_energy_total,Solar reward,0.067868,0.06962,0.051745,0.063078
annual_peak_average,Comfort reward,,,,1.007538
annual_peak_average,Comfort+solar reward,,,,1.004061
annual_peak_average,Energy reward,,,,0.662279
annual_peak_average,Solar reward,,,,0.834203
carbon_emissions_total,Comfort reward,0.865418,1.191482,0.811218,0.956039
carbon_emissions_total,Comfort+solar reward,0.698398,0.890339,0.730786,0.773174
