In [40]:
import pickle
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import softmax
from tqdm.notebook import tqdm

import tensorflow as tf
from tensorflow.keras.layers import Input, SimpleRNN, Dense, Flatten, LSTM, GRU, BatchNormalization, LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
from tensorflow.keras.utils import to_categorical  
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.backend import get_value, clip
from tensorflow.python.eager.context import eager_mode, graph_mode

Collecting git+https://github.com/Total-RD/pymgrid/
  Cloning https://github.com/Total-RD/pymgrid/ to /tmp/pip-req-build-p0zz1rgu
  Running command git clone -q https://github.com/Total-RD/pymgrid/ /tmp/pip-req-build-p0zz1rgu


In [41]:
"""
The buildings mentionned below are specific to the hackathon and are not available in this repo.
You can replace them with any MicroGrid object generated from pymgrid
"""
with open('building_1.pkl', 'rb') as f:
    building_1 = pickle.load(f)

with open('building_2.pkl', 'rb') as f:
    building_2 = pickle.load(f)
    
with open('building_3.pkl', 'rb') as f:
    building_3 = pickle.load(f)

buildings = [building_1, building_2, building_3]

In [42]:
### Import the Gym environment with continuous states & discrete actions
from pymgrid.Environments.pymgrid_cspla import MicroGridEnv

## Actor Critic

In [43]:
class Agent():
    """Actor-critic agent wrapping a two-headed Keras model.

    The constructor reads the observation shape and the number of discrete
    actions from ``env`` and builds a model with one softmax head (action
    probabilities) and one scalar head (critic value).
    """

    def __init__(self, env):
        """Store the environment, hyper-parameters and build the model.

        Args:
            env: environment exposing ``observation_space.shape`` and
                ``action_space.n``.
        """
        self.env = env  # environment the agent interacts with
        self.state_shape = env.observation_space.shape  # the state space
        self.action_shape = env.action_space.n  # number of discrete actions
        self.gamma = 0.99  # decay rate of past observations
        self.alpha = 1e-4  # learning rate in the policy gradient
        self.lr = 0.01  # learning rate in deep learning
        # action_shape must be set before this call: build_agent sizes the
        # actor head from it when n_actions is not given.
        self.model = self.build_agent(self.state_shape)

        # Initialised empty; nothing in this class fills them.
        self.state_memory = []
        self.action_memory = []
        self.reward_memory = []

    def build_agent(self, s_size, n_actions=None):
        """Build the actor-critic network.

        A first 256-unit Dense layer feeds two separate 256-unit Dense
        branches: one ends in a softmax over the actions, the other in a
        single-unit Dense layer used as the critic value.

        Args:
            s_size: input shape passed to ``tf.keras.layers.Input``.
            n_actions: width of the softmax head. Defaults to
                ``self.action_shape`` so the policy output always matches the
                environment's action space (it used to be fixed at 5).

        Returns:
            A ``tf.keras.Model`` with outputs ``[action_probs, critic_value]``.
        """
        if n_actions is None:
            n_actions = self.action_shape

        inputs = tf.keras.layers.Input(s_size)
        shared = tf.keras.layers.Dense(256, activation=LeakyReLU(alpha=0.2))(inputs)

        # Both branches start from the same first hidden layer.
        actor_hidden = tf.keras.layers.Dense(256, activation=LeakyReLU(alpha=0.2))(shared)
        critic_hidden = tf.keras.layers.Dense(256, activation=LeakyReLU(alpha=0.2))(shared)

        action = tf.keras.layers.Dense(n_actions, activation="softmax")(actor_hidden)
        critic = tf.keras.layers.Dense(1)(critic_hidden)

        model = tf.keras.Model(inputs=inputs, outputs=[action, critic])
        return model

In [33]:
# Preview of a multiplicative learning-rate decay: at step i the rate is
# scaled by 0.9**i, so the decay accelerates as i grows. This reproduces the
# values recorded in this cell's output (0.001, 0.0009, 0.000729, ...).
lr = 0.001
for i in range(10):
    lr = lr * 0.9 ** i
    print(lr)

0.001
0.0009000000000000001
0.0007290000000000002
0.0005314410000000002
0.00034867844010000017
0.00020589113209464911
0.00010941898913151243
5.23347633027361e-05
2.25283995449392e-05
8.727963568087724e-06


In [None]:
# Train the actor-critic agent on the first building, one gradient update per
# episode (Monte-Carlo returns computed after the episode has finished).
building_environment = MicroGridEnv(env_config={'microgrid':buildings[0],"testing":False})
env = building_environment

# The agent has to exist before the optimizer is created, because the
# optimizer reads its learning rate from agent.lr.
agent = Agent(env)
model = agent.model

# NOTE: this schedule is defined but not passed to the optimizer below, which
# uses the constant agent.lr instead.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-3,
    decay_steps=10,
    decay_rate=0.05)

optimizer = tf.keras.optimizers.Adam(agent.lr)
huber_loss = tf.keras.losses.Huber()
eps = np.finfo(np.float32).eps.item()  # avoids division by zero when normalising returns
episodes = 50

# Per-episode buffers; they are cleared after every update.
action_probs_history = []
critic_value_history = []
rewards_history = []
# Kept for compatibility with other cells; not updated in this cell.
running_reward = 0
episode_count = 0
last_episode_reward = 0

for episode in tqdm(range(episodes)):
    done = False
    state = env.reset()
    episode_reward = 0

    # Everything that contributes to the loss must be recorded by the tape:
    # the forward passes during the rollout and the loss computation itself.
    with tf.GradientTape() as tape:

        while not done:  # run until the environment signals the end of the episode
            state = tf.convert_to_tensor(state)
            state = tf.expand_dims(state, 0)  # add the batch dimension

            action_probs, critic_value = model(state)
            critic_value_history.append(critic_value[0, 0])

            # Sample action from action probability distribution
            action = np.random.choice(agent.action_shape, p=np.squeeze(action_probs))
            action_probs_history.append(tf.math.log(action_probs[0, action]))

            state, reward, done, _ = env.step(action)
            rewards_history.append(reward)
            episode_reward += reward

        # Discounted return for every step, accumulated from the last step
        # backwards and then put back in chronological order.
        returns = []
        discounted_sum = 0
        for r in reversed(rewards_history):
            discounted_sum = r + agent.gamma * discounted_sum
            returns.append(discounted_sum)
        returns.reverse()

        # Standardise the returns (zero mean, unit standard deviation).
        returns = np.array(returns)
        returns = (returns - np.mean(returns)) / (np.std(returns) + eps)
        returns = returns.tolist()

        actor_losses = []
        critic_losses = []
        for log_prob, value, ret in zip(action_probs_history, critic_value_history, returns):
            # Difference between the observed return and the critic's estimate.
            diff = ret - value
            actor_losses.append(-log_prob * diff)

            critic_losses.append(
                huber_loss(tf.expand_dims(value, 0), tf.expand_dims(ret, 0)))

        loss_value = sum(actor_losses) + sum(critic_losses)

    # Gradient computation and the optimizer step happen outside the recording
    # context so the tape does not trace them.
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Clear the loss and reward history
    action_probs_history.clear()
    critic_value_history.clear()
    rewards_history.clear()
    print(loss_value)
    last_episode_reward = episode_reward

    # The value printed here is this episode's total reward, not a running average.
    template = "running reward: {:.2f} at episode {}"
    print(template.format(episode_reward, episode))

  0%|          | 0/50 [00:00<?, ?it/s]

tf.Tensor(8927.068, shape=(), dtype=float32)
running reward: -11486.06 at episode 0
tf.Tensor(94948.92, shape=(), dtype=float32)
running reward: -5064.46 at episode 1
tf.Tensor(96726.08, shape=(), dtype=float32)
running reward: -5063.70 at episode 2
tf.Tensor(32811.383, shape=(), dtype=float32)
running reward: -5063.70 at episode 3
tf.Tensor(20988.9, shape=(), dtype=float32)
running reward: -5063.70 at episode 4
tf.Tensor(30234.375, shape=(), dtype=float32)
running reward: -5063.70 at episode 5
tf.Tensor(20935.498, shape=(), dtype=float32)
running reward: -5063.70 at episode 6
tf.Tensor(9343.236, shape=(), dtype=float32)
running reward: -5063.70 at episode 7
tf.Tensor(2851.281, shape=(), dtype=float32)
running reward: -5063.70 at episode 8
tf.Tensor(7484.4014, shape=(), dtype=float32)
running reward: -5063.70 at episode 9
tf.Tensor(8752.851, shape=(), dtype=float32)
running reward: -5063.70 at episode 10
tf.Tensor(6090.694, shape=(), dtype=float32)
running reward: -5063.70 at episode 1