**Data Science and AI for Energy Systems** 

Karlsruhe Institute of Technology

Institute of Automation and Applied Informatics

Summer Term 2024

---

# Exercise XII: Reinforcement Learning

**Remark:** Use the updated docker image with version 1.0.1 (or arm-1.0.1) from this exercise onwards

**Imports**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import keras.backend as K
import tensorflow as tf
from collections import deque
import random

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Dense
from keras.initializers import glorot_uniform
from keras.optimizers import Adam

# import gym
# from gym import spaces
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Problem XII.3 (programming) - Implementing Deep Reinforcement Learning Methods in a Building Energy Management Problem

In this programming task, we will work on a building equipped with photovoltaic (PV) panels and batteries (storage). The dataset used here is named `building_battery_data.csv`. It was recorded from a house within the [SonyCSL](https://www.sonycsl.co.jp/tokyo/daisuke/14820/) project. The figure below shows the configuration of the building.
<img src="house.png" alt="Local Image" width="800">
The energy flow of this building can be simplified as
<img src="flow.png" alt="Local Image" width="1200">

***
**(a) Load the dataset**

In [None]:
# Read the recorded building measurements into a DataFrame
dataset_file = 'building_energy_data.csv'
data = pd.read_csv(dataset_file)
# Preview the first rows of the dataset (rendered as the cell output)
data.head()

In [None]:
# Keep only complete days of 24 samples. The later cells index the data with
# `day * 24 + step`, so a trailing partial day would make them run past the
# end of the array. (Original note: 345 days of data.)
samples_per_day = 24
T = (len(data) // samples_per_day) * samples_per_day
data = data[:T]

fig, ax = plt.subplots(1, 1, figsize = (12, 10))
ax2 = ax.twinx() # second y-axis for the non-power quantities (A / %)

charge_discharge_power = data['charge_discharge_power'] # W
rsoc                   = data['rsoc'] # % 
pvc_charge_power       = data['pvc_charge_power'] # W
battery_current        = data['battery_current'] # A(DC)
p2                     = data['p2'] # W
ups_output_power       = data['ups_output_power']

battery_voltage = 52. # for overall battery voltage

# First 24 samples = one day of data (2019-01-01).
# Power signals go on the left axis, RSOC and current on the right axis.
first_day = slice(0, samples_per_day)
charge_discharge_power_plot = ax.plot(charge_discharge_power[first_day], 'm*-', label = "char_dischar")
rosc_plot                   = ax2.plot(rsoc[first_day], 'go-', label = "rsoc")
pvc_charge_power_plot       = ax.plot(pvc_charge_power[first_day], 'r*-', label = "pvc_char")
battery_current_plot        = ax2.plot(battery_current[first_day], 'k.-', label = "battery_current")
p2_plot                     = ax.plot(p2[first_day], 'b*-', label = "p2")
ups_output_power_plot       = ax.plot(ups_output_power[first_day], 'y*-', label = "ups_output_power")


# Show all labels in one box: the lines live on two different axes, so their
# handles are collected manually and passed to a single legend
plots = charge_discharge_power_plot + rosc_plot + pvc_charge_power_plot + battery_current_plot + ups_output_power_plot + p2_plot
labels = [plot.get_label() for plot in plots]
ax.legend(plots, labels, loc = 0, fontsize=14)

ax.set_xlabel("hours of a day", fontsize=14)
ax.set_xticks(range(0, 25, 1))
ax.set_ylabel("Power (W)", fontsize=14)
ax2.set_ylabel("A (DC) / %", fontsize=14)

plt.grid()
plt.show()

**State concatenate**

In [None]:
# Exogenous part of the state: one 2-D column array per measured signal,
# stacked side by side so that each row of `x` is one time step
state_columns = ['pvc_charge_power', 'ups_output_power', 'p2']
# ups_output_power + p1 -  p2
pv, load, p2 = (data[[column]].values for column in state_columns)

x = np.concatenate((pv, load, p2), axis = 1)

**Deep Q-Network Model**

In [None]:
class DQNNet():

    """
    Fully connected Q-network: maps a state vector to one Q-value per action.

    __init__ - store the hyperparameters and build the compiled Keras model
    create_model - build and compile the network (2 hidden ReLU layers, linear output)
    """

    def __init__(self, state_size, action_size, learning_rate):

        """
        state_size - shape tuple of one state, passed to the Keras Input layer
        action_size - number of discrete actions, i.e. width of the output layer
        learning_rate - learning rate of the Adam optimizer
        """

        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.model = self.create_model()

    def create_model(self):

        """
        Build the network state -> Dense(64) -> Dense(256) -> Dense(action_size),
        compile it with MSE loss and Adam, print its summary and return it.
        """

        # e.g. state_size = (4, ) for [pv, load, p2, rsoc]
        state_input = Input(shape = self.state_size)

        # Fixed initializer seeds make the initial weights reproducible
        hidden = Dense(64, activation = "relu",
                       kernel_initializer = glorot_uniform(seed = 42))(state_input)
        hidden = Dense(256, activation = "relu",
                       kernel_initializer = glorot_uniform(seed = 42))(hidden)

        # Linear output: unbounded Q-value estimates, one per action
        q_values = Dense(self.action_size, activation = "linear",
                         kernel_initializer = glorot_uniform(seed = 42))(hidden)

        model = Model(inputs = [state_input], outputs = [q_values])
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases
        model.compile(loss = "mse", optimizer = Adam(learning_rate = self.learning_rate))
        model.summary()

        return model

**Memory Model**

In [None]:
# A tree based array containing priority of each experience for fast sampling
# This SumTree code is a modified version and the original code is from:
# https://github.com/jaromiru/AI-blog/blob/master/SumTree.py

class SumTree():

    """
    Binary sum tree stored in a flat array: leaves hold the priority of each
    experience, every inner node holds the sum of its two children, so the
    root holds the total priority.

    __init__ - create data array storing experience and a tree based array storing priority
    add - store new experience in data array and update tree with new priority
    update - update tree and propagate the change through the tree
    get_leaf - find the final nodes with a given priority value
    """

    # Default write position; every instance gets its own counter in __init__
    data_pointer = 0

    def __init__(self, capacity):

        r"""
        capacity - Number of final nodes containing experience, for all priority values
        data - array containing experience (with pointers to Python objects), for all transitions
        tree - a tree shape array containing priority of each experience

        tree index:
            0       -> storing priority sum
           / \
          1   2
         / \ / \
        3  4 5  6   -> storing priority for transitions

        Array type for storing:
        [0, 1, 2, 3, 4, 5, 6]

        Raises ValueError if capacity is smaller than 1.
        """

        if capacity < 1:
            raise ValueError("capacity must be a positive integer")

        self.capacity = capacity
        # capacity leaves + (capacity - 1) inner nodes
        self.tree = np.zeros(2 * capacity - 1)
        self.data = np.zeros(capacity, dtype = object)
        # Index of the next data slot to be written
        self.data_pointer = 0

    def add(self, priority, data):

        """
        Store `data` in the next slot with the given `priority`; once the
        capacity is reached the oldest slot is overwritten (ring buffer).
        """

        # Leaves occupy the last `capacity` entries of the tree array
        tree_index = self.data_pointer + self.capacity - 1

        self.data[self.data_pointer] = data # Update data frame
        self.update(tree_index, priority) # Update priority

        # Overwrite if exceed memory capacity
        self.data_pointer += 1
        if self.data_pointer >= self.capacity:
            self.data_pointer = 0

    def update(self, tree_index, priority):

        """
        Set the priority stored at `tree_index` (an index into the tree array)
        and add the difference to all of its ancestors up to the root.
        """

        # Change = new priority score - former priority score
        change = priority - self.tree[tree_index]
        self.tree[tree_index] = priority

        # Propagate the change through tree
        while tree_index != 0:
            tree_index = (tree_index - 1) // 2 # parent of the current node
            self.tree[tree_index] += change

    def get_leaf(self, v):

        """
        Walk down from the root to the leaf whose cumulative priority range
        contains `v`.

        Returns (tree index of the leaf, its priority, the stored experience).
        """

        parent_index = 0

        while True:
            left_child_index = 2 * parent_index + 1
            right_child_index = left_child_index + 1

            if left_child_index >= len(self.tree): # reach the bottom, end search
                leaf_index = parent_index
                break

            if v <= self.tree[left_child_index]:
                parent_index = left_child_index
            else:
                # Skip the left subtree: make v relative to the right subtree
                v -= self.tree[left_child_index]
                parent_index = right_child_index

        data_index = leaf_index - self.capacity + 1

        # tree leaf index, priority, experience
        return leaf_index, self.tree[leaf_index], self.data[data_index]

In [None]:
class Memory():  # experience tuples are stored in a SumTree

    """
    Prioritized experience replay buffer backed by a SumTree.

    __init__ - create SumTree memory
    store - assign priority to new experience and store with SumTree.add & SumTree.update
    sample - uniformly sample from the range between 0 and total priority and 
           retrieve the leaf index, priority and experience with SumTree.get_leaf
    batch_update - update the priority of experience after training with SumTree.update

    PER_e - Hyperparameter that avoids experiences having 0 probability of being taken
    PER_a - Hyperparameter that allows tradeoff between taking only experience with 
          high priority and sampling randomly (0 - pure uniform randomness, 1 -
          select experiences with the highest priority), converts the importance of TD error to priority
    PER_b - Importance-Sampling (IS) exponent, increased from its initial value towards 1
          on every call of sample; controls how much the IS weights affect learning
    PER_b_increment_per_sampling - amount added to PER_b on every call of sample
    absolute_error_upper - upper clip for the absolute error before it is turned into a priority
    """

    PER_e = 0.01
    PER_a = 0.6
    PER_b = 0.4
    PER_b_increment_per_sampling = 0.001
    absolute_error_upper = 1.  # Clipped abs error

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def store(self, experience):

        """
        Store a new experience with the current maximum leaf priority, so it
        can be sampled before its error has been evaluated.
        """

        # Find the max priority among the leaves
        max_priority = np.max(self.tree.tree[-self.tree.capacity:])

        # If the max priority = 0, this experience will never have a chance to be selected
        # So a minimum priority is assigned
        if max_priority == 0:
            max_priority = self.absolute_error_upper

        self.tree.add(max_priority, experience)  # set the max priority for new priority

    def sample(self, n):

        """
        First, to sample a minibatch of n size, the range [0, priority_total] is
        divided into n ranges. A value is uniformly sampled from each range. Search 
        in the sumtree, the experience where priority score correspond to sample 
        values are retrieved from. Calculate IS weights for each minibatch element.

        Returns (tree indices of shape (n, ), list of [experience], IS weights of
        shape (n, 1) normalized by the largest possible weight).
        Raises ValueError if nothing has been stored yet.
        """

        total_priority = self.tree.tree[0]
        if total_priority <= 0:
            raise ValueError("cannot sample from an empty memory")

        b_idx = np.empty((n, ), dtype=np.int32)
        b_memory = []
        b_ISWeights = np.empty((n, 1))

        priority_segment = total_priority / n

        self.PER_b = np.min([1., self.PER_b + self.PER_b_increment_per_sampling]) # max = 1

        # Smallest sampling probability among the leaves that have been
        # written. Never-written leaves have priority 0; including them would
        # give prob_min = 0, max_weight = inf and all IS weights equal to 0
        # as long as the memory is not completely filled.
        leaf_priorities = self.tree.tree[-self.tree.capacity:]
        prob_min = np.min(leaf_priorities[leaf_priorities > 0]) / total_priority
        max_weight = (prob_min * n) ** (-self.PER_b)

        for i in range(n):
            a = priority_segment * i
            b = priority_segment * (i + 1)
            value = np.random.uniform(a, b)
            index, priority, data = self.tree.get_leaf(value)
            prob = priority / total_priority
            b_ISWeights[i, 0] = (prob * n) ** (-self.PER_b) / max_weight
            b_idx[i] = index
            b_memory.append([data])

        return b_idx, b_memory, b_ISWeights

    def batch_update(self, tree_idx, abs_errors):

        """
        Turn the errors of the sampled experiences into new priorities
        (|error| + PER_e, clipped, raised to PER_a) and write them to the tree.
        The array passed in as abs_errors is left unmodified.
        """

        # New array instead of `+=`, so the caller's data is not changed;
        # PER_e avoids 0 probability
        shifted_errors = np.abs(np.asarray(abs_errors, dtype = float)) + self.PER_e
        clipped_errors = np.minimum(shifted_errors, self.absolute_error_upper)
        ps = np.power(clipped_errors, self.PER_a)

        for ti, p in zip(tree_idx, ps):
            self.tree.update(ti, p)

**Battery Model**

***
**(b) Set up the environment**

The RSOC and the battery current have the following relationship:
\begin{equation} 
RSOC_{t+1} = \left\{\begin{matrix}
 RSOC_{t} + (K_d \times a_t - Decay) \times timestep &  \text{if } a_t<0 \text{ (discharge)}\\
 RSOC_{t} + (K_c \times a_t - Decay) \times timestep &  \text{if } a_t>0 \text{ (charge)}\\
 RSOC_{t} - Decay \times timestep &  \text{if } a_t=0 \text{ (idle)},\\
\end{matrix}\right.
\end{equation}
where $K_d$ and $K_c$ are the discharge and charge coefficients and Decay is the natural decay of the battery. In addition, from the above sketch of the energy flow, we can set the reward function:

* reward = -p2\_sim
    * battery\_charge\_power = battery\_voltage $\times$ action
    * p2\_sim = battery\_charge\_power + load - pvc\_charge\_power

In [None]:
# Define the Battery environment
class BatteryEnv():

    """
    Simple battery model: a discrete action selects a battery set-point and
    step computes the next RSOC, the reward and the simulated p2.
    """

    def __init__(self, action_size, battery_voltage = 52.):
        """
        action_size - number of discrete actions; the set-points are spread
                      evenly over [-35, 35]
        battery_voltage - overall battery voltage used to convert the selected
                      set-point into power (power = voltage x set-point)

        coeff_d - discharge coefficient
        coeff_c - charge coefficient
        decay - natural decay of the RSOC per unit of timestep
        rsoc_min, rsoc_max - bounds the RSOC is clipped to

        wear cost is not considered at this moment
        action value a < 0, battery discharge
        action value a = 0, battery in idle
        action value a > 0, battery charge
        """
        self.action_set      = np.linspace(-35, 35, num=action_size, endpoint=True)
        self.battery_voltage = battery_voltage
        self.initial_rsoc    = 30.
        self.coeff_c         = 0.02
        self.coeff_d         = 0.02
        self.decay           = 0.0018
        self.rsoc_min        = 20.
        self.rsoc_max        = 100.

    def step(self, state, action, timestep):
        """
        state - sequence [pv, load, p2, rsoc]
        action - index into action_set
        timestep - duration the action is applied for

        Returns (next_rsoc as array of shape (1, ), reward, p2_sim).
        """
        current_pv   = state[0]
        current_p2   = state[2]
        current_rsoc = state[3]

        action_value = self.action_set[action]

        # rsoc: discharging uses coeff_d, charging uses coeff_c; for idle
        # (action_value == 0) only the decay term remains
        coeff = self.coeff_d if action_value < 0 else self.coeff_c
        next_rsoc = current_rsoc + (coeff * action_value - self.decay) * timestep
        # Keep the RSOC inside its bounds in every case
        next_rsoc = np.array([np.clip(next_rsoc, self.rsoc_min, self.rsoc_max)])

        battery_charge_power = self.battery_voltage * action_value # battery_output
        # NOTE(review): the exercise text states
        # p2_sim = battery_charge_power + load - pvc_charge_power and
        # reward = -p2_sim; the formulas below use p2 instead of the load and
        # clip the reward at 0. Kept unchanged - confirm which one is intended.
        cost = - current_pv + battery_charge_power + current_p2 # house_useage
        p2_sim = current_pv - battery_charge_power

        # reward function, bill price: fixed
        reward = np.minimum(- cost, 0.)

        return next_rsoc, reward, p2_sim

**Hyperparameters** 

In [None]:
# --- Q-network / training settings ---
state_size = (4,)        # one state = [pv_power, consumption, p2, rsoc]
action_size = 11         # number of discrete battery actions
learning_rate = 1e-2     # passed to DQNNet
lr = 1e-3                # NOTE(review): not referenced in the visible cells - confirm it is still needed
batch_size = 64          # experiences per training mini-batch
gamma = 0.95             # discount factor of future rewards

# --- epsilon-greedy exploration schedule ---
explore_start = 1.0      # exploration probability at the first step
explore_stop = 0.01      # lower bound of the exploration probability
decay_rate = 1e-3        # exponential decay rate of the exploration probability

# --- replay memory ---
pretrain_length = 10_000 # number of experiences generated during memory initialization
memory_size = 10_000     # maximum number of experiences the memory can keep

**Memory Initialization**

In [None]:
battery = BatteryEnv(action_size = action_size)

memory = Memory(memory_size)

np.random.seed(42)

# Memory initialization: fill the replay memory with experiences generated by
# a purely random policy before the network is trained
RSOC              = np.array([battery.initial_rsoc])
day               = 0
quarter_hour      = 0   # index of the step inside the current day (24 steps per day)
done              = False
timestep = 15.

for _ in range(pretrain_length):

    state = np.concatenate((x[day * 24 + quarter_hour, :], RSOC), axis=-1)
    action = np.random.randint(0, action_size)

    # Compute the reward and new state based on the selected action
    next_rsoc, reward, p2_sim = battery.step(state, action, timestep)

    # The last step of a day terminates the episode. The flag is evaluated for
    # every step, so that only these transitions are stored as terminal
    # (a flag that is set once and never reset marks all later ones as terminal).
    done = quarter_hour == 24 - 1
    if done:
        day += 1
        quarter_hour = 0
        # Stop when no complete day is left
        if day >= len(x) // 24:
            break
    else:
        quarter_hour += 1
    next_state = np.concatenate((x[day * 24 + quarter_hour, :], next_rsoc), axis = -1)

    # Store the experience in memory
    RSOC = next_rsoc
    experience = state, action, reward, next_state, done
    memory.store(experience)

***
**(c) Implement DRL**

**Deep Q-Network (DQN) Training**

In [None]:
DQN = DQNNet(state_size = state_size, 
             action_size = action_size, 
             learning_rate = learning_rate)

decay_step = 0 # number of steps taken so far, drives the decay of the ϵ-greedy policy
RSOC = np.array([battery.initial_rsoc])
day = 0
quarter_hour = 0 # index of the step inside the current day (24 steps per day)
done = False
timestep = 15.
quarter_hour_rewards = []
day_mean_rewards = []
loss = np.nan # no training step has been executed yet

n_days = len(x) // 24 # only complete days are simulated

while day < n_days:

    state = np.concatenate((x[day * 24 + quarter_hour, :], RSOC), axis = -1)

    # ϵ-greedy policy: explore with a probability that decays exponentially
    # from explore_start to explore_stop
    exp_exp_tradeoff = np.random.rand()
    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)
    if explore_probability > exp_exp_tradeoff:
        action = np.random.randint(0, action_size)
    else:
        # verbose = 0 suppresses the progress bar Keras prints for each predict call
        action = np.argmax(DQN.model.predict(np.expand_dims(state, axis = 0), verbose = 0))

    # Compute the reward and new state based on the selected action
    next_RSOC, reward, p2_sim = battery.step(state, action, timestep)

    quarter_hour_rewards.append(reward)

    # The last step of a day terminates the episode. The flag is evaluated for
    # every step, so that only these transitions are stored as terminal.
    done = quarter_hour == 24 - 1
    if done:
        day += 1
        quarter_hour = 0
        # Log the finished day before a possible break, so the last day is
        # part of day_mean_rewards as well
        mean_reward = np.mean(quarter_hour_rewards)
        day_mean_rewards.append(mean_reward)
        quarter_hour_rewards = []
        print("Day: {}".format(day),
              "Mean reward: {:.2f}".format(mean_reward),
              "Training loss: {:.2f}".format(loss),
              "Explore P: {:.2f} \n".format(explore_probability))
        if day >= n_days:
            break
    else:
        quarter_hour += 1
    next_state = np.concatenate((x[day * 24 + quarter_hour, :], next_RSOC), axis = -1)

    # Store the experience in memory
    RSOC = next_RSOC
    experience = state, action, reward, next_state, done
    memory.store(experience)
    decay_step += 1

    # DQN training
    tree_idx, batch, ISWeights_mb = memory.sample(batch_size) # Obtain random mini-batch from memory

    # Every batch entry is [experience] with
    # experience = (state, action, reward, next_state, done)
    states_mb = np.array([each[0][0] for each in batch])
    actions_mb = np.array([each[0][1] for each in batch])
    rewards_mb = np.array([each[0][2] for each in batch])
    next_states_mb = np.array([each[0][3] for each in batch])
    dones_mb = np.array([each[0][4] for each in batch])

    # Q-values of the current network for the sampled states and next states
    # (the same network is used for both, there is no separate target network)
    targets_mb = DQN.model.predict(states_mb, verbose = 0).astype("float64")
    q_next_state = DQN.model.predict(next_states_mb, verbose = 0)

    # TD target: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise
    target_batch = np.where(dones_mb,
                            rewards_mb,
                            rewards_mb + gamma * np.max(q_next_state, axis = 1)).astype("float64")

    # Replace only the Q-value of the action that was taken, so the other
    # outputs contribute no error to the loss
    batch_rows = np.arange(len(batch))
    targets_mb[batch_rows, actions_mb] = target_batch

    loss = DQN.model.train_on_batch(states_mb, targets_mb, sample_weight = ISWeights_mb.ravel())

    # Update priority with the absolute error of the taken actions after the training step
    predicts_mb = DQN.model.predict(states_mb, verbose = 0)
    absolute_errors = np.abs(predicts_mb[batch_rows, actions_mb] - target_batch)

    tree_idx = np.array([int(each) for each in tree_idx])
    memory.batch_update(tree_idx, absolute_errors)


***
**(d) Plot the average reward**

In [None]:
# Mean reward of every day collected during the DQN training
font_size = 14
fig, ax = plt.subplots(nrows = 1, ncols = 1, figsize = (12, 4))

ax.plot(day_mean_rewards, "b-", label = "reward")

# Axis titles and tick labels share one font size
ax.set_xlabel("days of Year 2019", fontsize = font_size)
ax.set_ylabel("Average reward", fontsize = font_size)
for set_tick_labels in (plt.xticks, plt.yticks):
    set_tick_labels(fontsize = font_size)
ax.legend(loc = 'lower right', fontsize = font_size)