# Deep Q-Learning


## Installation

In [1]:
!pip install git+https://github.com/Total-RD/pymgrid/

Collecting git+https://github.com/Total-RD/pymgrid/
  Cloning https://github.com/Total-RD/pymgrid/ to /tmp/pip-req-build-86s3hdda
  Running command git clone -q https://github.com/Total-RD/pymgrid/ /tmp/pip-req-build-86s3hdda


In [1]:
!pip install tensorflow==2.4.1
#If not work try installing tensorflow 2 version

Collecting tensorflow==2.4.1
  Downloading tensorflow-2.4.1-cp38-cp38-macosx_10_11_x86_64.whl (173.9 MB)
[K     |████████████████████████████████| 173.9 MB 583 bytes/s a 0:00:01    |████████▉                       | 48.0 MB 13.5 MB/s eta 0:00:10     |███████████████████             | 102.7 MB 24.1 MB/s eta 0:00:03     |████████████████████████████▍   | 154.5 MB 20.3 MB/s eta 0:00:01
Collecting flatbuffers~=1.12.0
  Downloading flatbuffers-1.12-py2.py3-none-any.whl (15 kB)
Collecting grpcio~=1.32.0
  Downloading grpcio-1.32.0-cp38-cp38-macosx_10_9_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 35.2 MB/s eta 0:00:01
[?25hCollecting numpy~=1.19.2
  Downloading numpy-1.19.5-cp38-cp38-macosx_10_9_x86_64.whl (15.6 MB)
[K     |████████████████████████████████| 15.6 MB 14.6 MB/s eta 0:00:01
Collecting tensorflow-estimator<2.5.0,>=2.4.0
  Downloading tensorflow_estimator-2.4.0-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 36.0 MB/s eta

## Datas loading

In [2]:
import pickle

"""
The buildings mentionned below are specific to the hackathon and are not available in this repo.
You can replace them with any MicroGrid object generated from pymgrid
"""


with open('building_1.pkl', 'rb') as f:
    building_1 = pickle.load(f)

with open('building_2.pkl', 'rb') as f:
    building_2 = pickle.load(f)
with open('building_3.pkl', 'rb') as f:
    building_3 = pickle.load(f)

buildings = [building_1, building_2, building_3]

In [3]:
from pymgrid.Environments.pymgrid_cspla import MicroGridEnv


In [4]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from collections import deque
import time
import random


## Training models

In [5]:

def agent(state_shape, action_shape):
    """ The agent maps X-states to Y-actions
    """
    learning_rate = 0.001
    init = tf.keras.initializers.HeUniform()
    model = keras.Sequential()
    model.add(keras.layers.Dense(24, input_shape=state_shape, activation='relu', kernel_initializer=init))
    model.add(keras.layers.Dense(12, activation='relu', kernel_initializer=init))
    model.add(keras.layers.Dense(action_shape, activation='linear', kernel_initializer=init))
    model.compile(loss=tf.keras.losses.Huber(), optimizer=tf.keras.optimizers.Adam(lr=learning_rate), metrics=['accuracy'])
    return model

def get_qs(model, state, step):
    return model.predict(state.reshape([1, state.shape[0]]))[0]

def train(building_environment_contin, replay_memory, model, target_model, done):
# An episode a full game
    train_episodes = 1

    learning_rate = 0.7 # Learning rate
    discount_factor = 0.618

    MIN_REPLAY_SIZE = 1000
    if len(replay_memory) < MIN_REPLAY_SIZE:
        return

    batch_size = 64 * 2
    mini_batch = random.sample(replay_memory, batch_size)
    current_states = np.array([encode_observation(transition[0], building_environment_contin.observation_space.shape) for transition in mini_batch])
    current_qs_list = model.predict(current_states)
    new_current_states = np.array([encode_observation(transition[3], building_environment_contin.observation_space.shape) for transition in mini_batch])
    future_qs_list = target_model.predict(new_current_states)

    X = []
    Y = []
    
    for index, (observation, action, reward, new_observation, done) in enumerate(mini_batch):
        if not done:
            max_future_q = reward + discount_factor * np.max(future_qs_list[index])
        else:
            max_future_q = reward

        current_qs = current_qs_list[index]
        current_qs[action] = (1 - learning_rate) * current_qs[action] + learning_rate * max_future_q

        X.append(encode_observation(observation, building_environment_contin.observation_space.shape))
        Y.append(current_qs)
    model.fit(np.array(X), np.array(Y), batch_size=batch_size, verbose=0, shuffle=True)

def encode_observation(observation, n_dims):
    return observation

def get_model(building_environment_contin):
    print("Action Space: {}".format(building_environment_contin.action_space))
    print("State space: {}".format(building_environment_contin.observation_space))


    epsilon = 1 # Epsilon-greedy algorithm in initialized at 1 meaning every step is random at the start
    max_epsilon = 1 # You can't explore more than 100% of the time
    min_epsilon = 0.01 # At a minimum, we'll always explore 1% of the time
    decay = 0.01

    # Initialize the Target and Main models
    model = agent(building_environment_contin.observation_space.shape, building_environment_contin.action_space.n)
    target_model = agent(building_environment_contin.observation_space.shape, building_environment_contin.action_space.n)
    target_model.set_weights(model.get_weights())

    replay_memory = deque(maxlen=50_000)


    # X = states, y = actions
    X = []
    y = []
    train_episodes=1
    steps_to_update_target_model = 0
    train_start = time.process_time()
    for episode in range(train_episodes):
        total_training_rewards = 0
        observation = building_environment_contin.reset()
        done = False
        while not done:
            steps_to_update_target_model += 1
            #if True:
                #building_environment_contin.render()

            random_number = np.random.rand()
            # Explore using Exploration Strategy
            if random_number <= epsilon:
                # Explore
                action = building_environment_contin.action_space.sample()
            else:
                # Exploit best known action
                encoded = encode_observation(observation, building_environment_contin.observation_space.shape[0])
                encoded_reshaped = encoded.reshape([1, encoded.shape[0]])
                predicted = model.predict(encoded_reshaped).flatten()
                action = np.argmax(predicted)
            new_observation, reward, done, info = building_environment_contin.step(action)
            replay_memory.append([observation, action, reward, new_observation, done])

            #  Update the Main Network 
            if steps_to_update_target_model % 4 == 0 or done:
                train(building_environment_contin, replay_memory, model, target_model, done)

            observation = new_observation
            total_training_rewards += reward

            if done:
                total_training_rewards=total_training_rewards*-1
                print('Total training rewards: {} after n steps = {} with final reward = {}'.format(total_training_rewards, episode, reward))
                total_training_rewards += 1

                if steps_to_update_target_model >= 100:
                    print('Copying main network weights to the target network weights')
                    target_model.set_weights(model.get_weights())
                    steps_to_update_target_model = 0
                break

        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay * episode)
        train_end = time.process_time()
        train_frugality = train_end - train_start
        return target_model,total_training_rewards,train_frugality
    #env.close()

building_train_environments_cont = [MicroGridEnv({'microgrid':building,'testing':False}) for building in buildings]
model1,total_training_reward,train_frugality=get_model(building_train_environments_cont[0])


Action Space: Discrete(5)
State space: Box(-0.1, inf, (10,), float64)
Total training rewards: 12292.812762250354 after n steps = 0 with final reward = -0.0
Copying main network weights to the target network weights


In [6]:
#model2,total_training_reward1,train_frugality1=fine_training_model(model1,building_train_environments_cont[1])
model2,total_training_reward1,train_frugality1=get_model(building_train_environments_cont[1])

Action Space: Discrete(5)
State space: Box(-0.1, inf, (10,), float64)
Total training rewards: 42235.340052658845 after n steps = 0 with final reward = -11.585851296915358
Copying main network weights to the target network weights


In [7]:
#model3,total_training_reward2,train_frugality2=fine_training_model(model1,building_train_environments_cont[1])
model3,total_training_reward2,train_frugality2=get_model(building_train_environments_cont[2])

Action Space: Discrete(6)
State space: Box(-0.1, inf, (10,), float64)
Total training rewards: 31863.52232201471 after n steps = 0 with final reward = -0.0
Copying main network weights to the target network weights


In [8]:
train_frugality=train_frugality+train_frugality1+train_frugality2

## Getting and exporting test results

In [9]:
test_start = time.process_time()
total_cost = [0,0,0]
model_list=[model1,model2,model3]
for i,building_env in enumerate(building_train_environments_cont):
    obs = building_env.reset(testing=True)
    done = False
    while not done:
        encoded = encode_observation(obs, building_env.observation_space.shape[0])
        encoded_reshaped = encoded.reshape([1, encoded.shape[0]])
        predicted = model_list[i].predict(encoded_reshaped).flatten()
        action = np.argmax(predicted)
        obs, reward, done, info = building_env.step(action)
        total_cost[i]+=reward

test_end = time.process_time()
test_frugality = test_end - test_start

In [10]:
final_results = {
    "building_1_performance" : total_cost[0]*-1,
    "building_2_performance" : total_cost[1]*-1,
    "building_3_performance" : total_cost[2]*-1,
    "frugality" : train_frugality + test_frugality,
}
print(final_results)

{'building_1_performance': 4348.789999711021, 'building_2_performance': 13643.535942446842, 'building_3_performance': 16801.189979035178, 'frugality': 599.1154839999999}
