## SmartGrid with DQN


### SmartGrid with DQN 학습 목표

**주제** : SmartGrid Environment에 DQN 적용해 보기.

In [None]:
import grid2op
from grid2op.PlotGrid import PlotMatplot 
from tqdm import tqdm, notebook 
from grid_agent import GridAgent

import numpy as np
import collections
import random

In [None]:
# Build the grid2op environment registered under the name "rte_case5_example".
# NOTE(review): test=True presumably selects the small copy of the dataset that
# ships with the package instead of downloading it -- confirm in the grid2op docs.
env = grid2op.make("rte_case5_example", test=True)
# Plotting helper constructed from the environment's observation space.
plot_helper = PlotMatplot(env.observation_space)

SmartGrid 환경은 gym의 구조와 유사하지만, state(observation)와 action에 차이가 있습니다.
그 부분을 먼저 살펴 보겠습니다.

### Converter

SmartGrid 환경은 독자적인 state와 action의 형식을 사용합니다. 위와 같이 일반적인 Gym과 같은 형식으로 test하는 것은 가능하지만, 이전에 구현한 DQN을 통해 학습하는 것은 불가능하다는 사실을 알 수 있습니다. 

사전에 구현한 DQN을 이용해 Smart Grid 환경에서 학습을 진행하기 위해서는, state와 action을 network model에 맞춰 변환시켜 줘야 하며, Smart Grid 환경은 이를 위한 Converter 함수를 제공합니다.

사전에 구현된 Converter 함수를 이용해 observation과 action을 변환해 보겠습니다.

In [None]:
# Use an agent that adds converters tailored to the grid environment.
agent = GridAgent(env=env)

### Observation space

**agent.convert_obs(obs)**

convert_obs 함수는 grid 환경의 observation을 input으로 받아 학습에 적합한 gym 스타일의 observation (ndarray)로 변환시켜 줍니다.

In [None]:
### Observation space
# Reset the environment to obtain an initial grid-native observation and print it.
grid_obs = env.reset()
print(grid_obs)

# Convert it through the agent and print the shape of the converted observation.
gym_obs = agent.convert_obs(grid_obs)
print(gym_obs.shape)

### Action space

**agent.id_to_act(act(int))**

id_to_act 함수는 int형의 action을 input으로 받아, grid 환경에 사용 가능한 action으로 변환시켜 줍니다.

In [None]:
### Action space
# Size of the discrete action set exposed by the agent.
print("number of action = ", agent.num_actions)

# Draw one action id uniformly from [0, agent.num_actions).
gym_act = np.random.randint(agent.num_actions)
print("int-like action = ", gym_act)

# Map the integer id to the action object the grid environment accepts.
grid_act = agent.id_to_act(gym_act)
print("grid_ action \n", grid_act)

action을 모두 출력해 보겠습니다.

In [None]:
# List every discrete action id next to the grid action it is converted to.
for action_id in range(agent.num_actions):
    grid_action = agent.id_to_act(action_id)
    print(action_id, grid_action)

In [None]:
import tensorflow as tf
import tensorflow.keras.layers as kl
import tensorflow.keras.optimizers as ko
from collections import deque
from tqdm import tqdm, notebook  # 학습 과정을 더 깔끔하게 보여주는 library 입니다.

In [None]:
# Hyperparameters shared by the cells below.
buffer_size=5000  # capacity passed to ReplayBuffer
learning_rate=0.01  # NOTE(review): not referenced by the visible code; the optimizer is built with a hard-coded rate
epsilon=1.0  # starting exploration rate (assigned again right before the training loop)
epsilon_decay=0.99  # epsilon is multiplied by this factor on every training step
min_epsilon=0.01  # lower bound applied to epsilon while it decays
gamma=0.98  # presumably the discount factor for the Q-target -- not used in the visible code
batch_size=16  # presumably the minibatch size drawn from the replay buffer -- not used in the visible code
target_update_iter=400  # presumably the number of steps between target-network syncs -- not used in the visible code
train_nums=5000  # number of iterations of the training loop
start_learning = 40  # presumably the number of warm-up steps before gradient updates -- not used in the visible code
max_iter=200   # caps the number of steps within a single episode

# Neural Network Model
class Model(tf.keras.Model):
    """Q-network: two ReLU hidden layers followed by one output per action.

    Args:
        num_actions: width of the output layer (one Q-value per action).
        units: sizes of the two hidden layers; only the first two entries
            are read.
    """

    # A tuple default avoids sharing one mutable list object between calls.
    def __init__(self, num_actions, units=(32, 32)):
        super().__init__()
        self.fc1 = kl.Dense(units[0], activation='relu', kernel_initializer='he_uniform')
        self.fc2 = kl.Dense(units[1], activation='relu', kernel_initializer='he_uniform')
        # No activation on the output layer: the values are used as raw Q-values.
        self.logits = kl.Dense(num_actions, name='q_values')

    # forward propagation
    def call(self, inputs):
        """Run inputs through both hidden layers and the output layer."""
        x = self.fc1(inputs)
        x = self.fc2(x)
        x = self.logits(x)
        return x

    # return best action that maximize action-value (Q) from network
    # a* = argmax_a' Q(s, a')
    def action_value(self, obs):
        """Return the index of the highest Q-value for the first row of obs.

        obs is forwarded to ``predict`` unchanged; callers in this file pass a
        2-D array holding a single observation.
        """
        # verbose=0 keeps Keras from printing progress output on every call,
        # which matters because this is invoked once per environment step.
        q_values = self.predict(obs, verbose=0)
        best_action = np.argmax(q_values, axis=-1)
        return best_action[0]

class ReplayBuffer:
    """Fixed-capacity FIFO store of transitions for experience replay.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, buffer_size):
        # Maximum number of transitions kept; older ones are evicted first.
        self.buffer_size = buffer_size
        # Number of transitions stored since creation or the last clear().
        # It keeps growing after the deque is full, so it can exceed
        # len(self.buffer).
        self.count = 0
        self.buffer = deque(maxlen=buffer_size)

    # store transition of each step in replay buffer
    def store(self, s, a, r, next_s, d):
        """Append one transition (state, action, reward, next state, done)."""
        # Kept internally as (s, a, r, d, next_s); sample() restores the
        # caller-facing order.
        experience = (s, a, r, d, next_s)
        self.buffer.append(experience)
        self.count += 1

    # Sample random minibatch of transition
    def sample(self, batch_size):
        """Draw a random minibatch without replacement.

        Args:
            batch_size: requested number of transitions. If fewer are held,
                every held transition is returned.

        Returns:
            Tuple of arrays ``(s_batch, a_batch, r_batch, s2_batch, d_batch)``.

        Raises:
            ValueError: if the buffer holds no transitions.
        """
        # Size the draw by what the deque actually holds. self.count would
        # over-request once old transitions have been evicted.
        available = len(self.buffer)
        if available == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        batch = random.sample(self.buffer, min(batch_size, available))

        s_batch, a_batch, r_batch, d_batch, s2_batch = map(np.array, zip(*batch))
        return s_batch, a_batch, r_batch, s2_batch, d_batch

    def clear(self):
        """Drop every stored transition and reset the counter."""
        self.buffer.clear()
        self.count = 0

### Smart Grid Environment Test

2가지 action type을 test해보겠습니다.

1. **random** : random한 action을 수행합니다.

2. **do_nothing** : action 0 번, 전력망 환경에 어떤 영향도 끼치지 않는 do-nothing action을 수행합니다. 


In [None]:
action_types = ["random", "do_nothing"]
max_steps = env.chronics_handler.max_timestep()

# Baseline rollouts before training: one episode per hand-written policy, so
# the trained agent can later be compared against them.
for action_type in action_types:
    obs, done, ep_reward = env.reset(), False, 0
    actions = []
    msg = "Smart grid [ {} ] agent".format(action_type)
    for t in notebook.tqdm(range(max_steps), desc=msg):
        if action_type == "random":
            # pick an action id uniformly at random
            action = np.random.randint(agent.num_actions)
        elif action_type == "do_nothing":
            # action id 0 is the do-nothing action
            action = 0
        else:
            # Fail loudly rather than replaying a stale or undefined action.
            raise ValueError("unknown action type: {}".format(action_type))
        actions.append(action)
        converted_act = agent.id_to_act(action)
        obs, reward, done, _ = env.step(converted_act)
        ep_reward += reward
        if done:
            break
    print(actions[:10])
    # Report the episode outcome; the reward was previously accumulated but
    # never shown.
    print("[ {} ] steps : {}  episode reward : {:.2f}".format(action_type, len(actions), ep_reward))

In [None]:
# Training cell (exercise skeleton): the four banner sections inside the loop
# are intentionally left blank and must be filled in before this cell can run.
replay_buffer = ReplayBuffer(buffer_size)
network = Model(agent.num_actions)
target_network = Model(agent.num_actions)
# NOTE(review): neither model has been called on data at this point; if they
# hold no variables until their first call, this sync copies nothing -- TODO confirm.
target_network.set_weights(network.get_weights()) # initialize target network weight 
# NOTE(review): the rate is hard-coded here; the `learning_rate` constant defined earlier is not used.
opt = ko.Adam(learning_rate=.0015)
network.compile(optimizer=opt, loss='mse')

obs = env.reset()
epi_duration = 0  # steps taken in the current episode; only reset (never advanced) by the code below
epi = 0 # number of episode taken
epsilon=1.0  # start the run fully exploratory

for t in notebook.tqdm(range(1, train_nums+1), desc='train with DQN'):
    # epsilon update: multiplicative decay, clamped at min_epsilon
    if epsilon > min_epsilon:
        epsilon = max(epsilon * epsilon_decay, min_epsilon)

    #######################  step 1  ####################### 
    ####        Select action using epsilon-greedy       ### 
    ########################################################   
    # TODO(exercise): not implemented yet.
    
    #######################  step 2  ####################### 
    #### Take step and store transition to replay buffer ### 
    ########################################################
    # TODO(exercise): not implemented yet. The code after the banners reads
    # `next_obs` and `done` and compares `epi_duration` with max_iter, so this
    # step has to assign the first two and advance the third.

    
    
    #######################  step 3  ####################### 
    ####     Train network (perform gradient descent)    ### 
    ########################################################
    # TODO(exercise): not implemented yet.
    

    #######################  step 4  ####################### 
    ####             Update target network               ### 
    ########################################################
    # TODO(exercise): not implemented yet.

    obs = next_obs  # s <- s'
    # if episode ends (done) or the per-episode step cap is reached
    if (epi_duration >= max_iter) or done:
        epi += 1 # num of episode 
        print("[Episode {:>5}] epi duration: {:>6.2f}  --eps : {:>4.2f} --steps : {:>5}".format(epi, epi_duration, epsilon, t))
        obs, done, epi_duration = env.reset(), False, 0  # Environment reset


In [None]:
max_steps = env.chronics_handler.max_timestep()

# Test after training: run five episodes, always taking the action the
# network scores highest.
for i in range(5):
    obs, done, ep_reward = env.reset(), False, 0
    actions = []
    msg = "Smart grid [ DQN ] agent, episode {}".format(i + 1)
    for t in notebook.tqdm(range(max_steps), desc=msg):
        converted_obs = agent.convert_obs(obs)
        # action_value reads the first row only, so wrap the observation in a
        # batch of one.
        action = network.action_value(np.atleast_2d(converted_obs))
        actions.append(action)
        converted_act = agent.id_to_act(action)
        obs, reward, done, _ = env.step(converted_act)
        ep_reward += reward
        if done:
            break
    # Report the outcome in the same form as the pre-training baseline cell;
    # these values were previously computed and discarded.
    print(actions[:10])
    print("[ DQN ] steps : {}  episode reward : {:.2f}".format(len(actions), ep_reward))