In [None]:
import gym
import sinergym
from datetime import datetime
import numpy as np
from scipy.stats import entropy
from stable_baselines import *

import tensorflow as tf
import tensorflow.contrib.distributions as dist
import tensorflow.contrib.layers as layers

## Discriminator Network Setup 

In [None]:
class Discriminator:
    """Binary classifier separating expert from policy (observation, action) rows.

    The network is a two-hidden-layer tanh MLP that emits one logit per row.
    During training, policy rows are pushed towards label 1 and expert rows
    towards label 0. The imitation reward is ``-log(sigmoid(logit))``, so rows
    the discriminator scores as expert-like (very negative logit) receive a
    large reward.
    """

    def __init__(self, sess, ob_shape, ac_shape, hidden_size, lr, name):
        """Build the discriminator in the current default TensorFlow graph.

        Args:
            sess: TensorFlow session used by ``get_reward`` and ``update``.
            ob_shape: shape tuple of a single observation.
            ac_shape: shape tuple of a single action.
            hidden_size: number of units in each of the two hidden layers.
            lr: learning rate handed to the Adam optimizer.
            name: variable scope under which the network is created.
        """
        self.sess = sess
        self.ob_shape = ob_shape
        self.ac_shape = ac_shape
        self.hidden_size = hidden_size
        self.lr = lr
        self.name = name

        # Width of one flattened (observation, action) row. It is derived from
        # the two shapes rather than hard-coded, so the class also works for
        # environments whose observation + action size is not 21.
        self.ob_ac_dim = int(np.prod(ob_shape)) + int(np.prod(ac_shape))
        self.ob_ac = tf.placeholder(dtype=tf.float32, shape=[None, self.ob_ac_dim])

        with tf.variable_scope(name):
            self._build_network()

    def _build_network(self):
        """Create the MLP, the reward tensor, the loss and the training op."""
        with tf.variable_scope('discriminator'):
            d_h1 = layers.fully_connected(self.ob_ac, self.hidden_size, activation_fn=tf.tanh)
            d_h2 = layers.fully_connected(d_h1, self.hidden_size, activation_fn=tf.tanh)
            d_out = layers.fully_connected(d_h2, 1, activation_fn=None)

        # -log(sigmoid(x)) == softplus(-x). The softplus form stays finite for
        # very negative logits, where sigmoid underflows to 0 and log gives inf.
        self.reward = tf.squeeze(tf.nn.softplus(-d_out))

        # The training batch must hold the expert rows in its first half and
        # the policy rows in its second half (see `update`).
        expert_out, policy_out = tf.split(d_out, num_or_size_splits=2, axis=0)

        self.loss = (tf.losses.sigmoid_cross_entropy(tf.ones_like(policy_out), policy_out)
                     + tf.losses.sigmoid_cross_entropy(tf.zeros_like(expert_out), expert_out))

        with tf.name_scope('train_op'):
            # Collect the variables once so gradients and variables are paired
            # from the same list.
            variables = self.params()
            grads = tf.gradients(self.loss, variables)
            self.grads = list(zip(grads, variables))
            self.train_op = tf.train.AdamOptimizer(self.lr).apply_gradients(self.grads)

    def params(self):
        """Return a copy of the list of global variables filtered by ``self.name``."""
        return tf.global_variables(self.name).copy()

    def get_reward(self, expert_ob_ac):
        """Evaluate the imitation reward for a batch of (observation, action) rows.

        Args:
            expert_ob_ac: array of shape ``(batch, ob_ac_dim)``.

        Returns:
            The squeezed reward tensor value (a scalar for a one-row batch).
        """
        feed_dict = {self.ob_ac: expert_ob_ac}
        return self.sess.run(self.reward, feed_dict=feed_dict)

    def update(self, all_ob_ac):
        """Run one Adam step on the discriminator loss.

        Args:
            all_ob_ac: array of shape ``(2 * k, ob_ac_dim)`` with the ``k``
                expert rows first, followed by the ``k`` policy rows.
        """
        feed_dict = {self.ob_ac: all_ob_ac}

        self.sess.run(self.train_op, feed_dict=feed_dict)

## Customize Environments

In [None]:
from stable_baselines.common.policies import MlpPolicy, CnnPolicy, FeedForwardPolicy
from stable_baselines.bench import Monitor
import os


class CustomEnv(gym.Env):
    """Gym wrapper that blends the environment reward with a discriminator reward.

    Each step returns ``lamb * rl_reward + (1 - lamb) * il_reward``, where
    ``rl_reward`` comes from the wrapped environment and ``il_reward`` from
    ``discriminator.get_reward`` evaluated on the new observation and the action.
    """

    def __init__(self, rl_env, discriminator, lamb):
        """Wrap ``rl_env``.

        Args:
            rl_env: the underlying gym environment; its spaces are re-exported.
            discriminator: object exposing ``get_reward(ob_ac_batch)``.
            lamb: weight of the environment reward; ``1 - lamb`` weights the
                discriminator reward.
        """
        super(CustomEnv, self).__init__()
        self.action_space = rl_env.action_space
        self.observation_space = rl_env.observation_space
        self.rl_env = rl_env
        self.discriminator = discriminator
        self.lamb = lamb

    def step(self, action):
        """Step the wrapped environment and return the blended reward."""
        observation, rl_reward, done, info = self.rl_env.step(action)
        # Build a single (1, ob_dim + ac_dim) row. The width is inferred with
        # -1 instead of the former hard-coded 21 so other spaces work too.
        ob_ac = np.concatenate([np.ravel(observation), np.ravel(action)], 0).reshape(1, -1)
        il_reward = self.discriminator.get_reward(ob_ac)
        reward = self.lamb*rl_reward + (1-self.lamb)*il_reward
        return observation, reward, done, info

    def reset(self):
        """Reset the wrapped environment and return its first observation."""
        return self.rl_env.reset()

## RL + IL

In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import TRPO, PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
import logging
logger = logging.getLogger()
# Raise the root logger threshold so only CRITICAL records are emitted.
logger.setLevel(logging.CRITICAL)

env = gym.make('Eplus-5Zone-mixed-continuous-v1')
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
# Discriminator with hidden size 10, learning rate 0.01 and variable scope 'D'.
discriminator = Discriminator(sess, env.observation_space.shape, env.action_space.shape, 10, 0.01, 'D')
sess.run(tf.global_variables_initializer())
# NOTE(review): the default graph is reset only after the discriminator has
# been built and initialised; keep this statement order.
tf.reset_default_graph()

# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Noise objects in the DDPG style; the PPO2 model created below does not
# receive them.
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.1) * np.ones(n_actions))


# Create and wrap the environment (initial reward mix: lamb = 0.3)
custom_env = CustomEnv(env, discriminator, 0.3)
custom_env = Monitor(custom_env, log_dir, allow_early_resets=True)
custom_env = DummyVecEnv(([lambda: custom_env]))

# Load the recorded expert (observation, action) pairs, one row per timestep.
data = np.load('expert_traj/ddpg_expert_5zonemix.npz')
expert_obs = data['obs']
expert_actions = data['actions']
expert_ob_ac = np.concatenate([expert_obs, expert_actions], axis = 1)

# Agent trained on the blended RL + IL reward.
model1 = PPO2('MlpPolicy', custom_env, verbose=0)

ts = 0
batch_ts = len(expert_ob_ac)
total_ts = batch_ts * 50


rew_sum = []
while ts < total_ts:
    # Train the agent for one batch on the current reward mix.
    model1.set_env(custom_env)
    model1.learn(batch_ts)
    model1.save("agent_simu")
    ts += batch_ts

    ### Option 2: RL after IL
    # The first five batches use lamb = 0.3; afterwards lamb = 0.7.
    eps = int(ts/len(expert_ob_ac))
    lamb = 0.3 if eps <= 5 else 0.7

    # Roll out the current policy for as many steps as there are expert rows,
    # so the discriminator batch splits evenly into expert and policy halves.
    # Steps go through the raw `env`, so `rewards` holds environment rewards.
    obs = custom_env.reset()[0]
    observes, actions, rewards = [],[],[]

    for i in range(len(expert_ob_ac)):
        observes.append(obs)
        action, _states = model1.predict(obs)
        obs, reward, done, info = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            obs = custom_env.reset()[0]
            rew_sum.append(np.sum(rewards))
            rewards = []

    policy_ob_ac = np.concatenate([observes, actions],1)
    # Expert rows first, policy rows second; built once for all five updates.
    all_ob_ac = np.concatenate([expert_ob_ac, policy_ob_ac], axis=0)
    for k in range(5):
        discriminator.update(all_ob_ac)

    # Re-wrap the environment with the reward mix for the next batch.
    custom_env = CustomEnv(env, discriminator, lamb)
    custom_env = DummyVecEnv(([lambda: custom_env]))
    # The former `model1.load("agent_simu")` call was dropped: `load` is a
    # classmethod that returns a new model, and its result was discarded, so
    # it only built a throw-away copy on every iteration.

    # Guard against no episode having finished yet.
    last_return = rew_sum[-1] if rew_sum else 'n/a'
    print('Timesteps: ' + str(ts) + ' Rewards Sum: ' + str(last_return))
print(rew_sum)

## IL only

In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
import logging
logger = logging.getLogger()
# Raise the root logger threshold so only CRITICAL records are emitted.
logger.setLevel(logging.CRITICAL)

env = gym.make('Eplus-5Zone-mixed-continuous-v1')
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
# Discriminator with hidden size 10, learning rate 0.01 and variable scope 'D'.
discriminator = Discriminator(sess, env.observation_space.shape, env.action_space.shape, 10, 0.01, 'D')
sess.run(tf.global_variables_initializer())
# NOTE(review): the default graph is reset only after the discriminator has
# been built and initialised; keep this statement order.
tf.reset_default_graph()

# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment (environment reward weight lamb = 0.2)
custom_env = CustomEnv(env, discriminator, 0.2)
custom_env = Monitor(custom_env, log_dir, allow_early_resets=True)
custom_env = DummyVecEnv(([lambda: custom_env]))

# Load the recorded expert (observation, action) pairs, one row per timestep.
data = np.load('expert_traj/ddpg_expert_5zonemix.npz')
expert_obs = data['obs']
expert_actions = data['actions']
expert_ob_ac = np.concatenate([expert_obs, expert_actions], axis = 1)

model2 = PPO2(MlpPolicy, custom_env, verbose=0)

ts = 0
batch_ts = len(expert_ob_ac)
total_ts = batch_ts * 50
lamb = 0.2

rew_sum = []
### IL only training
while ts < total_ts:
    # Train the agent for one batch on the current reward mix.
    model2.set_env(custom_env)
    model2.learn(batch_ts)
    model2.save("agent_il_only")
    ts += batch_ts

    # Roll out the current policy for as many steps as there are expert rows,
    # so the discriminator batch splits evenly into expert and policy halves.
    # Steps go through the raw `env`, so `rewards` holds environment rewards.
    obs = custom_env.reset()[0]
    observes, actions, rewards = [],[],[]

    for i in range(len(expert_ob_ac)):
        observes.append(obs)
        action, _states = model2.predict(obs)
        obs, reward, done, info = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            obs = custom_env.reset()[0]
            rew_sum.append(np.sum(rewards))
            rewards = []

    policy_ob_ac = np.concatenate([observes, actions],1)
    # Expert rows first, policy rows second; built once for all five updates.
    all_ob_ac = np.concatenate([expert_ob_ac, policy_ob_ac], axis=0)
    for k in range(5):
        discriminator.update(all_ob_ac)

    # Re-wrap the environment around the updated discriminator.
    custom_env = CustomEnv(env, discriminator, lamb)
    custom_env = DummyVecEnv(([lambda: custom_env]))
    # The former `model2.load("agent_il_only")` call was dropped: `load` is a
    # classmethod that returns a new model, and its result was discarded, so
    # it only built a throw-away copy on every iteration.

    # Guard against no episode having finished yet.
    last_return = rew_sum[-1] if rew_sum else 'n/a'
    print('Timesteps: ' + str(ts) + ' Rewards Sum: ' + str(last_return))
print(rew_sum)

## RL only

In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
import logging
logger = logging.getLogger()
# Raise the root logger threshold so only CRITICAL records are emitted.
logger.setLevel(logging.CRITICAL)

env = gym.make('Eplus-5Zone-mixed-continuous-v1')
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
# Discriminator with hidden size 10, learning rate 0.01 and variable scope 'D'.
discriminator = Discriminator(sess, env.observation_space.shape, env.action_space.shape, 10, 0.01, 'D')
sess.run(tf.global_variables_initializer())
# NOTE(review): the default graph is reset only after the discriminator has
# been built and initialised; keep this statement order.
tf.reset_default_graph()

# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment (lamb = 1: only the environment reward counts)
custom_env = CustomEnv(env, discriminator, 1)
custom_env = Monitor(custom_env, log_dir, allow_early_resets=True)
custom_env = DummyVecEnv(([lambda: custom_env]))

# Load the recorded expert (observation, action) pairs, one row per timestep.
data = np.load('expert_traj/ddpg_expert_5zonemix.npz')
expert_obs = data['obs']
expert_actions = data['actions']
expert_ob_ac = np.concatenate([expert_obs, expert_actions], axis = 1)

model3 = PPO2(MlpPolicy, custom_env, verbose=0)

ts = 0
batch_ts = len(expert_ob_ac)
total_ts = batch_ts * 50
lamb = 1

rew_sum = []
### RL only training
while ts < total_ts:
    # Train the agent for one batch.
    model3.set_env(custom_env)
    model3.learn(batch_ts)
    model3.save("agent_rl_only")
    ts += batch_ts

    # Roll out the current policy for as many steps as there are expert rows,
    # so the discriminator batch splits evenly into expert and policy halves.
    # Steps go through the raw `env`, so `rewards` holds environment rewards.
    obs = custom_env.reset()[0]
    observes, actions, rewards = [],[],[]

    for i in range(len(expert_ob_ac)):
        observes.append(obs)
        action, _states = model3.predict(obs)
        obs, reward, done, info = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            obs = custom_env.reset()[0]
            rew_sum.append(np.sum(rewards))
            rewards = []

    policy_ob_ac = np.concatenate([observes, actions],1)
    # Expert rows first, policy rows second; built once for all five updates.
    all_ob_ac = np.concatenate([expert_ob_ac, policy_ob_ac], axis=0)
    for k in range(5):
        discriminator.update(all_ob_ac)

    # Re-wrap the environment around the updated discriminator.
    custom_env = CustomEnv(env, discriminator, lamb)
    custom_env = DummyVecEnv(([lambda: custom_env]))
    # The former `model3.load("agent_rl_only")` call was dropped: `load` is a
    # classmethod that returns a new model, and its result was discarded, so
    # it only built a throw-away copy on every iteration.

    # Guard against no episode having finished yet.
    last_return = rew_sum[-1] if rew_sum else 'n/a'
    print('Timesteps: ' + str(ts) + ' Rewards Sum: ' + str(last_return))
print(rew_sum)

##  Baselines

### TRPO 

In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import TRPO
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
import logging
logger = logging.getLogger()
# Raise the root logger threshold so only CRITICAL records are emitted.
logger.setLevel(logging.CRITICAL)

env = gym.make('Eplus-5Zone-mixed-continuous-v1')
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
# Discriminator with hidden size 10, learning rate 0.01 and variable scope 'D'.
discriminator = Discriminator(sess, env.observation_space.shape, env.action_space.shape, 10, 0.01, 'D')
sess.run(tf.global_variables_initializer())
# NOTE(review): the default graph is reset only after the discriminator has
# been built and initialised; keep this statement order.
tf.reset_default_graph()

# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment (lamb = 1: only the environment reward counts)
custom_env = CustomEnv(env, discriminator, 1)
custom_env = Monitor(custom_env, log_dir, allow_early_resets=True)
custom_env = DummyVecEnv(([lambda: custom_env]))

# Load the recorded expert (observation, action) pairs, one row per timestep.
data = np.load('expert_traj/ddpg_expert_5zonemix.npz')
expert_obs = data['obs']
expert_actions = data['actions']
expert_ob_ac = np.concatenate([expert_obs, expert_actions], axis = 1)

model4 = TRPO(MlpPolicy, custom_env, verbose=0)

ts = 0
batch_ts = len(expert_ob_ac)
total_ts = batch_ts * 50
lamb = 1

rew_sum = []
### RL only training
while ts < total_ts:
    # Train the agent for one batch.
    model4.set_env(custom_env)
    model4.learn(batch_ts)
    model4.save("agent_rl_trpo_only")
    ts += batch_ts

    # Roll out the current policy for as many steps as there are expert rows,
    # so the discriminator batch splits evenly into expert and policy halves.
    # Steps go through the raw `env`, so `rewards` holds environment rewards.
    obs = custom_env.reset()[0]
    observes, actions, rewards = [],[],[]

    for i in range(len(expert_ob_ac)):
        observes.append(obs)
        action, _states = model4.predict(obs)
        obs, reward, done, info = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            obs = custom_env.reset()[0]
            rew_sum.append(np.sum(rewards))
            rewards = []

    policy_ob_ac = np.concatenate([observes, actions],1)
    # Expert rows first, policy rows second; built once for all five updates.
    all_ob_ac = np.concatenate([expert_ob_ac, policy_ob_ac], axis=0)
    for k in range(5):
        discriminator.update(all_ob_ac)

    # Re-wrap the environment around the updated discriminator.
    custom_env = CustomEnv(env, discriminator, lamb)
    custom_env = DummyVecEnv(([lambda: custom_env]))
    # The former `model4.load("agent_rl_trpo_only")` call was dropped: `load`
    # is a classmethod that returns a new model, and its result was discarded,
    # so it only built a throw-away copy on every iteration.

    # Guard against no episode having finished yet.
    last_return = rew_sum[-1] if rew_sum else 'n/a'
    print('Timesteps: ' + str(ts) + ' Rewards Sum: ' + str(last_return))
print(rew_sum)

### A2C 

In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import A2C
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
import logging
logger = logging.getLogger()
# Raise the root logger threshold so only CRITICAL records are emitted.
logger.setLevel(logging.CRITICAL)

env = gym.make('Eplus-5Zone-mixed-continuous-v1')
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
# Discriminator with hidden size 10, learning rate 0.01 and variable scope 'D'.
discriminator = Discriminator(sess, env.observation_space.shape, env.action_space.shape, 10, 0.01, 'D')
sess.run(tf.global_variables_initializer())
# NOTE(review): the default graph is reset only after the discriminator has
# been built and initialised; keep this statement order.
tf.reset_default_graph()

# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment (lamb = 1: only the environment reward counts)
custom_env = CustomEnv(env, discriminator, 1)
custom_env = Monitor(custom_env, log_dir, allow_early_resets=True)
custom_env = DummyVecEnv(([lambda: custom_env]))

# Load the recorded expert (observation, action) pairs, one row per timestep.
data = np.load('expert_traj/ddpg_expert_5zonemix.npz')
expert_obs = data['obs']
expert_actions = data['actions']
expert_ob_ac = np.concatenate([expert_obs, expert_actions], axis = 1)

model5 = A2C(MlpPolicy, custom_env, verbose=0)

ts = 0
batch_ts = len(expert_ob_ac)
total_ts = batch_ts * 50
lamb = 1

rew_sum = []
### RL only training
while ts < total_ts:
    # Train the agent for one batch.
    model5.set_env(custom_env)
    model5.learn(batch_ts)
    model5.save("agent_rl_a2c_only")
    ts += batch_ts

    # Roll out the current policy for as many steps as there are expert rows,
    # so the discriminator batch splits evenly into expert and policy halves.
    # Steps go through the raw `env`, so `rewards` holds environment rewards.
    obs = custom_env.reset()[0]
    observes, actions, rewards = [],[],[]

    for i in range(len(expert_ob_ac)):
        observes.append(obs)
        action, _states = model5.predict(obs)
        obs, reward, done, info = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            obs = custom_env.reset()[0]
            rew_sum.append(np.sum(rewards))
            rewards = []

    policy_ob_ac = np.concatenate([observes, actions],1)
    # Expert rows first, policy rows second; built once for all five updates.
    all_ob_ac = np.concatenate([expert_ob_ac, policy_ob_ac], axis=0)
    for k in range(5):
        discriminator.update(all_ob_ac)

    # Re-wrap the environment around the updated discriminator.
    custom_env = CustomEnv(env, discriminator, lamb)
    custom_env = DummyVecEnv(([lambda: custom_env]))
    # The former `model5.load("agent_rl_a2c_only")` call was dropped: `load`
    # is a classmethod that returns a new model, and its result was discarded,
    # so it only built a throw-away copy on every iteration.

    # Guard against no episode having finished yet.
    last_return = rew_sum[-1] if rew_sum else 'n/a'
    print('Timesteps: ' + str(ts) + ' Rewards Sum: ' + str(last_return))
print(rew_sum)

### DDPG 

In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import DDPG
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise
import logging
logger = logging.getLogger()
# Raise the root logger threshold so only CRITICAL records are emitted.
logger.setLevel(logging.CRITICAL)

env = gym.make('Eplus-5Zone-mixed-continuous-v1')
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
# Discriminator with hidden size 10, learning rate 0.01 and variable scope 'D'.
discriminator = Discriminator(sess, env.observation_space.shape, env.action_space.shape, 10, 0.01, 'D')
sess.run(tf.global_variables_initializer())
# NOTE(review): the default graph is reset only after the discriminator has
# been built and initialised; keep this statement order.
tf.reset_default_graph()

# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create and wrap the environment (lamb = 1: only the environment reward counts)
custom_env = CustomEnv(env, discriminator, 1)
custom_env = Monitor(custom_env, log_dir, allow_early_resets=True)
custom_env = DummyVecEnv(([lambda: custom_env]))

# Load the recorded expert (observation, action) pairs, one row per timestep.
data = np.load('expert_traj/ddpg_expert_5zonemix.npz')
expert_obs = data['obs']
expert_actions = data['actions']
expert_ob_ac = np.concatenate([expert_obs, expert_actions], axis = 1)

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.1) * np.ones(n_actions))

# DDPG baseline agent
model6 = DDPG('MlpPolicy', custom_env, param_noise=param_noise, action_noise=action_noise, verbose=0)

ts = 0
batch_ts = len(expert_ob_ac)
total_ts = batch_ts * 50
lamb = 1

rew_sum = []
### RL only training
while ts < total_ts:
    # Train the agent for one batch.
    model6.set_env(custom_env)
    model6.learn(batch_ts)
    model6.save("agent_rl_ddpg_only")
    ts += batch_ts

    # Roll out the current policy for as many steps as there are expert rows,
    # so the discriminator batch splits evenly into expert and policy halves.
    # Steps go through the raw `env`, so `rewards` holds environment rewards.
    obs = custom_env.reset()[0]
    observes, actions, rewards = [],[],[]

    for i in range(len(expert_ob_ac)):
        observes.append(obs)
        # Evaluate the DDPG agent trained in this cell. The original called
        # `model2.predict`, i.e. the PPO2 model from another cell.
        action, _states = model6.predict(obs)
        obs, reward, done, info = env.step(action)
        actions.append(action)
        rewards.append(reward)
        if done:
            obs = custom_env.reset()[0]
            rew_sum.append(np.sum(rewards))
            rewards = []

    policy_ob_ac = np.concatenate([observes, actions],1)
    # Expert rows first, policy rows second; built once for all five updates.
    all_ob_ac = np.concatenate([expert_ob_ac, policy_ob_ac], axis=0)
    for k in range(5):
        discriminator.update(all_ob_ac)

    # Re-wrap the environment around the updated discriminator.
    custom_env = CustomEnv(env, discriminator, lamb)
    custom_env = DummyVecEnv(([lambda: custom_env]))
    # The former `model6.load("agent_rl_ddpg_only")` call was dropped: `load`
    # is a classmethod that returns a new model, and its result was discarded,
    # so it only built a throw-away copy on every iteration.

    # Guard against no episode having finished yet.
    last_return = rew_sum[-1] if rew_sum else 'n/a'
    print('Timesteps: ' + str(ts) + ' Rewards Sum: ' + str(last_return))
print(rew_sum)

## Rule-based 

In [None]:
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import TRPO
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
from gym import spaces

# Create log dir
# Create log dir
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Create the environment and load the expert data. The expert data is only
# used here to fix the number of steps to simulate.
env = gym.make('Eplus-5Zone-mixed-continuous-v1')
data = np.load('expert_traj/ddpg_expert_5zonemix.npz')
expert_obs = data['obs']
expert_actions = data['actions']
expert_ob_ac = np.concatenate([expert_obs, expert_actions], axis = 1)

config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)
config.gpu_options.allow_growth = True
sess = tf.InteractiveSession(config=config)
tf.keras.backend.set_session(sess)
sess.run(tf.global_variables_initializer())
tf.reset_default_graph()


# `env` is the raw gym environment (not a DummyVecEnv), so reset() returns the
# observation itself. The original `env.reset()[0]` indexed into it and kept
# only its first element.
obs = env.reset()
done = False
observes, actions, rewards = [],[],[]
for i in range(len(expert_ob_ac)):
    # Start a new episode before stepping again once the previous one ended;
    # the original kept stepping a finished episode.
    if done:
        obs = env.reset()
    observes.append(obs)
    # Fixed action applied at every step.
    # NOTE(review): presumably heating/cooling setpoints - confirm.
    action = np.asarray([20, 26])
    obs, reward, done, info = env.step(action)
    actions.append(action)
    rewards.append(reward)
print(' Rewards Sum: ' + str(np.sum(rewards)))

## Generate expert trajectories

In [None]:
import gym
import sinergym
import numpy as np
from stable_baselines import SAC, PPO2, TRPO, TD3, DDPG
from stable_baselines.gail import generate_expert_traj
from stable_baselines.ddpg.noise import NormalActionNoise, OrnsteinUhlenbeckActionNoise

import os

env = gym.make('Eplus-5Zone-mixed-continuous-v1')

# the noise objects for DDPG
n_actions = env.action_space.shape[-1]
param_noise = None
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.1) * np.ones(n_actions))

# Generate expert trajectories (train expert)
model = DDPG('MlpPolicy', env, param_noise=param_noise, action_noise=action_noise, verbose=1)

# Save under expert_traj/ so the file ends up at
# 'expert_traj/ddpg_expert_5zonemix.npz', which is the path the training cells
# load. The original wrote it to the working directory instead.
expert_dir = 'expert_traj'
os.makedirs(expert_dir, exist_ok=True)
expert_path = os.path.join(expert_dir, 'ddpg_expert_5zonemix')

# Train for n_timesteps timesteps and record n_episodes trajectories
# all the data will be saved in '*.npz' file
generate_expert_traj(model, expert_path, n_timesteps=1000000, n_episodes=2)

## Calculate statistics 

In [None]:
from stable_baselines.common.vec_env import DummyVecEnv, VecNormalize
import matplotlib.pyplot as plt

# Create log dir
# Directory handed to the Monitor wrapper for its log output.
log_dir = "tmp/"
os.makedirs(log_dir, exist_ok=True)

# Build the evaluation environment: raw env -> Monitor -> single-env DummyVecEnv.
env = gym.make('Eplus-5Zone-mixed-continuous-v1')
env = Monitor(env, log_dir, allow_early_resets=True)
env = DummyVecEnv(([lambda: env]))

total_timesteps = 24*31*3
violation = 0
total_power = 0
temp_list = []

# NOTE(review): `model5` is the A2C agent defined in another cell; that cell
# must have been run before this one.
obs = env.reset()
i = 0
while i < total_timesteps:
    action, _states = model5.predict(obs)
    obs, reward, done, info = env.step(action)
    # Observation component 8 of the single wrapped env.
    # NOTE(review): presumably the zone air temperature - confirm.
    temp = obs[0][8]
    temp_list.append(temp)
    total_power += info[0]['total_power']
    # Count the step as a violation when the value leaves [19.999, 23.5001].
    if temp < 19.999 or temp > 23.5001:
        violation += 1
    i += 1

# Fraction of steps outside the band, then accumulated power divided by 31*24.
print(violation/total_timesteps)
print(total_power/(31*24))

# Plot the recorded series with one tick every 24*3 steps, labelled 1..31.
fig = plt.figure(figsize=(10,3))
ax = fig.add_subplot(111)
int_list = list(range(1,32))
string_label_list = [str(day) for day in int_list]
tick_positions = list(range(0,24*31*3,24*3))
ax.set_xticks(tick_positions)
ax.set_xticklabels(string_label_list)
ax.plot(np.arange(total_timesteps), temp_list)
print(temp_list)