In [None]:
from citylearn import  CityLearn
import matplotlib.pyplot as plt
from pathlib import Path
import numpy as np
# Seed NumPy's global random number generator so NumPy-based randomness repeats across runs.
# NOTE(review): only NumPy is seeded here; if agent.py or citylearn draw from another RNG
# (e.g. Python's `random` or a deep-learning framework) that RNG needs its own seed — confirm.
np.random.seed(3)

In [None]:
# Load environment
# The building attributes and the solar profile are read from data/; the
# state/action-space file name is passed as a bare relative path.
data_folder = Path("data/")
building_attributes = data_folder.joinpath('building_attributes.json')
solar_profile = data_folder.joinpath('solar_generation_1kW.csv')
building_state_actions = 'buildings_state_action_space.json'

# Identifiers of the nine buildings handed to the environment.
building_ids = [
    "Building_1",
    "Building_2",
    "Building_3",
    "Building_4",
    "Building_5",
    "Building_6",
    "Building_7",
    "Building_8",
    "Building_9",
]

# Build the simulation environment with the quadratic cost function.
env = CityLearn(
    building_attributes,
    solar_profile,
    building_ids,
    buildings_states_actions=building_state_actions,
    cost_function=['quadratic'],
)

# Observation/action spaces, needed later to instantiate the control agents.
observations_spaces, actions_spaces = env.get_state_action_spaces()

In [None]:
# RL CONTROLLER
# Trains the TD3 agents on the CityLearn environment for a fixed number of episodes,
# recording the environment cost and a cumulative reward per episode, and prints the
# elapsed wall-clock time in minutes at the end.
from reward_function import reward_function
from agent import TD3_Agents
import time

# Instantiating the control agent(s)
agents = TD3_Agents(observations_spaces,actions_spaces)

# Global step counter: incremented once per environment step and never reset between episodes.
k = 0
# Per-episode results, both keyed by the episode index `e`.
cost, cum_reward = {}, {}
'''
Every episode runs the RL controller for over 2500 hours in the simulation (controls the storage of cooling energy over about 3 Summer months)
Multiple episodes will run the RL controller over the same period over and over again, improving the policy and maximizing the score (less negative scores are better)
CHALLENGE:
Running the RL algorithm to coordinate as many buildings as possible (at least 3 or 4) and find a good control policy in the minimum number of episodes as possible (ideally within a single episode)
'''
start = time.time()
episodes = 7
for e in range(episodes): #A stopping criterion can be added, which is based on whether the cost has reached some specific threshold or is no longer improving
    cum_reward[e] = 0
    # `rewards` is re-created every episode, so after the loop it only holds the last episode's rewards.
    rewards = []
    state = env.reset()
    done = False
    while not done:
        # Progress message every 500 steps; the total shown hard-codes 2500 steps per episode.
        if k%500==0:
            print('hour: '+str(k)+' of '+str(2500*episodes))
            
        action = agents.select_action(state)
        next_state, reward, done, _ = env.step(action)
        # The raw environment reward is replaced by the output of reward_function before it is stored.
        reward = reward_function(reward) #See comments in reward_function.py
        # Hands the transition to the agents.
        # NOTE(review): any learning update presumably happens inside add_to_buffer — confirm in agent.py.
        agents.add_to_buffer(state, action, reward, next_state, done)
        state = next_state
        # Only element 0 of the reward is accumulated.
        # NOTE(review): presumably the first building's reward — confirm this is intended.
        cum_reward[e] += reward[0]
        rewards.append(reward)
        k+=1
    # Cost reported by the environment once the episode has finished.
    cost[e] = env.cost()
end = time.time()
# Total training time in minutes.
print((end-start)/60.0)

In [None]:
# Displays the value returned by env.cost() in the environment's current state
# (the same call whose result is stored per episode in `cost` during training).
env.cost()

In [None]:
#Displays the cost recorded at the end of every episode (dict keyed by episode index). The lower the better. Ideally it will reach the same cost or lower than the RBC: 156.88
cost

In [None]:
#Plots the critic and actor losses stored at index 0 of the agents' loss lists
# NOTE(review): several cells in this notebook plot these same index-0 losses;
# confirm whether other indices were meant to be shown.
fig, (plot1, plot2, plot3) = plt.subplots(1,3)
fig.set_size_inches(12,4)

# One (axis, loss series, line colour) triple per panel, left to right.
loss_panels = (
    (plot1, agents.critic1_loss_list[0], 'b'),
    (plot2, agents.critic2_loss_list[0], 'g'),
    (plot3, agents.actor_loss_list[0], 'y'),
)
for panel, losses, colour in loss_panels:
    panel.plot(losses, colour)
    panel.set_xlabel('hours*iterations')

plot1.set_ylabel('Critic 1 loss')
plot2.set_ylabel('Critic 2 loss')
plot3.set_ylabel('Actor loss')

In [None]:
#Plots the critic and actor losses stored at index 0 of the agents' loss lists
# NOTE(review): several cells in this notebook plot these same index-0 losses;
# confirm whether other indices were meant to be shown.
fig, (plot1, plot2, plot3) = plt.subplots(1,3)
fig.set_size_inches(12,4)

# One (axis, loss series, line colour) triple per panel, left to right.
loss_panels = (
    (plot1, agents.critic1_loss_list[0], 'b'),
    (plot2, agents.critic2_loss_list[0], 'g'),
    (plot3, agents.actor_loss_list[0], 'y'),
)
for panel, losses, colour in loss_panels:
    panel.plot(losses, colour)
    panel.set_xlabel('hours*iterations')

plot1.set_ylabel('Critic 1 loss')
plot2.set_ylabel('Critic 2 loss')
plot3.set_ylabel('Actor loss')

In [None]:
#Plots the critic and actor losses stored at index 0 of the agents' loss lists
# NOTE(review): several cells in this notebook plot these same index-0 losses;
# confirm whether other indices were meant to be shown.
fig, (plot1, plot2, plot3) = plt.subplots(1,3)
fig.set_size_inches(12,4)

# One (axis, loss series, line colour) triple per panel, left to right.
loss_panels = (
    (plot1, agents.critic1_loss_list[0], 'b'),
    (plot2, agents.critic2_loss_list[0], 'g'),
    (plot3, agents.actor_loss_list[0], 'y'),
)
for panel, losses, colour in loss_panels:
    panel.plot(losses, colour)
    panel.set_xlabel('hours*iterations')

plot1.set_ylabel('Critic 1 loss')
plot2.set_ylabel('Critic 2 loss')
plot3.set_ylabel('Actor loss')

In [None]:
#Plots the critic and actor losses stored at index 0 of the agents' loss lists
# NOTE(review): several cells in this notebook plot these same index-0 losses;
# confirm whether other indices were meant to be shown.
fig, (plot1, plot2, plot3) = plt.subplots(1,3)
fig.set_size_inches(12,4)

# One (axis, loss series, line colour) triple per panel, left to right.
loss_panels = (
    (plot1, agents.critic1_loss_list[0], 'b'),
    (plot2, agents.critic2_loss_list[0], 'g'),
    (plot3, agents.actor_loss_list[0], 'y'),
)
for panel, losses, colour in loss_panels:
    panel.plot(losses, colour)
    panel.set_xlabel('hours*iterations')

plot1.set_ylabel('Critic 1 loss')
plot2.set_ylabel('Critic 2 loss')
plot3.set_ylabel('Actor loss')

In [None]:
#Plots for the last 100 hours of the simulation
# (entries from index 2400 onward of each series for env.buildings[0];
#  that is 100 entries only if each series holds 2500 steps — TODO confirm)
first_building = env.buildings[0]
heat_pump = first_building.cooling_device
tail = slice(2400, None)

plt.plot(heat_pump.electrical_consumption_cooling[tail])
# The demand column is first cut to rows 3500:6000 and then to the same tail window.
demand_window = first_building.sim_results['cooling_demand'][3500:6000].values
plt.plot(demand_window[tail])
plt.plot(heat_pump.cooling_supply[tail])
plt.legend(['Electrical consumption cooling','Building cooling demand','Heat pump cooling supply'])

In [None]:
#Plots for the last 100 hours of the simulation
# (entries from index 2400 onward of the storage series for env.buildings[0];
#  that is 100 entries only if each series holds 2500 steps — TODO confirm)
storage = env.buildings[0].cooling_storage
tail = slice(2400, None)

# Drawn in legend order: state of charge first, then the energy balance.
for storage_series in (storage.soc_list, storage.energy_balance_list):
    plt.plot(storage_series[tail])
plt.legend(['State of Charge','Storage device energy balance'])

In [None]:
#Plots for the last 100 hours of the simulation
# NOTE(review): the action series is read from env.action_track[8] while the COP
# comes from env.buildings[0] — confirm both are meant to describe the same building.
last_hours = slice(-100, None)
rl_actions = env.action_track[8]
cop_history = env.buildings[0].cooling_device.cop_cooling_list

plt.plot(rl_actions[last_hours])
plt.plot(cop_history[last_hours])
plt.legend(['RL Action','Heat Pump COP'])

When the RL algorithm reaches convergence, this plot should show the heat pump loading the energy storage device when the COP is high and releasing the energy when the COP is low.

In [None]:
#Plots the last 100 entries of env.action_track[8] (actions include exploration noise and environment constraints)
# NOTE(review): to show all the actions across episodes, the [-100:] slice would have to be dropped — confirm which is intended.
plt.plot(env.action_track[8][-100:])

In [None]:
# Plots agents.a_track1, an attribute of the TD3_Agents instance.
# NOTE(review): its contents are defined in agent.py, which is not visible here — presumably a record of actions; confirm.
plt.plot(agents.a_track1)

In [None]:
# Plots agents.a_track2, an attribute of the TD3_Agents instance.
# NOTE(review): its contents are defined in agent.py, which is not visible here — presumably a record of actions; confirm.
plt.plot(agents.a_track2)