In [1]:
import sys
sys.path.insert(1, './gym_powertac')

from powertac_wm import PowerTAC_WM
import gym
import datetime
import json
import numpy as np
import pandas as pd

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

from ReplayBuffer import ReplayBuffer
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from OU import OU

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
model_storage_path = "ddpg_v1.0" 

In [3]:
BUFFER_SIZE = 100000
BATCH_SIZE = 64
EPOCHS = 10000
GAMMA = 0.99
TAU = 0.001     #Target Network HyperParameters
LRA = 0.0001    #Learning rate for Actor
LRC = 0.001     #Lerning rate for Critic

'''
Action : [
           limitprice1 belongs to R
           limitprice2 belongs to R
         ] (TWO output nuerons)
'''
action_dim = 2

'''
State : [
          Proximity (24)
          Required_Quantity (1)
          Mean Market Price (1)
        ] (total 26 input nuerons)
'''
state_dim = 26

np.random.seed(1337)
EXPLORE = 100000.0

step = 0
epsilon = 1

ou = OU()       #Ornstein-Uhlenbeck Process

In [4]:
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True
)

config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.6

session = tf.Session(config=config)

In [5]:
actor = ActorNetwork(session, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
critic = CriticNetwork(session, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
replay_buffer = pd.read_csv('replay_buffer_4_normal.csv', header=None)


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 26)]              0         
_________________________________________________________________
dense (Dense)                (None, 400)               10800     
_________________________________________________________________
dense_1 (Dense)              (None, 300)               120300    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 602       
_________________________________________________________________
concatenate (Concatenate)    (None, 2)                 0         
Total params: 131,702
Trainable params: 131,702
Non-trainable params: 0
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output S

In [6]:
replay_buffer.head

<bound method NDFrame.head of          0    1    2    3    4    5    6    7    8    9   ...   46   47   48  \
0       1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
1       0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
2       0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
3       0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
4       0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
...     ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   
190805  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  1.0  0.0   
190806  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  1.0   
190807  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
190808  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   
190809  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

         

In [7]:
def train_ddpg_network():

    for epoch in range(EPOCHS):

        print("Epoch ", (epoch+1))
        print("-"*12)
        loss = 0

        #Do the batch update
        batch = replay_buffer.sample(n=BATCH_SIZE)
        states = np.asarray(batch[batch.columns[0:26]])
        actions = np.asarray(batch[batch.columns[26:28]])
        rewards = np.asarray(batch[batch.columns[28:29]])
        new_states = np.asarray(batch[batch.columns[29:55]])
        terminals = np.asarray(batch[batch.columns[55:56]])

        y_t = np.zeros([BATCH_SIZE,1])

        print("States", states.shape)
        print("Actions", actions.shape)
        print("Rewards", rewards.shape)
        print("New_States", new_states.shape)

        with session.as_default():
            with session.graph.as_default():

                target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])
                print("Target_Q_Values", target_q_values.shape)

                for k in range(BATCH_SIZE):
                    if terminals[k] == 1:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA*target_q_values[k]

                # print("Bellman Rewards", y_t)
                loss += critic.model.train_on_batch([states,actions], y_t)
                print("Loss", loss)
                a_for_grad = actor.model.predict(states)      # This may not be required, a_for_grad should be replaced by actions ##### Check PENDING #####
                # print("a_for_grad", a_for_grad)
                grads = critic.gradients(states, a_for_grad)       # a_for_grad is replaced by actions ##### Check PENDING #####   shape ERROR 
                # print("grads", grads)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

    print("Training Completed !!!")

In [8]:
train_ddpg_network()

Epoch  1
------------
States (32, 26)
Actions (32, 2)
Rewards (32, 1)
New_States (32, 26)
Target_Q_Values (32, 1)




Loss 0.13265274465084076
Epoch  2
------------
States (32, 26)
Actions (32, 2)
Rewards (32, 1)
New_States (32, 26)
Target_Q_Values (32, 1)
Loss 0.6273709535598755
Epoch  3
------------
States (32, 26)
Actions (32, 2)
Rewards (32, 1)
New_States (32, 26)
Target_Q_Values (32, 1)
Loss 0.18580296635627747
Epoch  4
------------
States (32, 26)
Actions (32, 2)
Rewards (32, 1)
New_States (32, 26)
Target_Q_Values (32, 1)
Loss 0.04930056631565094
Epoch  5
------------
States (32, 26)
Actions (32, 2)
Rewards (32, 1)
New_States (32, 26)
Target_Q_Values (32, 1)
Loss 0.23186320066452026
Training Completed !!!


In [None]:
def choose_Action(states):

        actions = list()

        for state in states:

            try:

                # self.epsilon -= 1.0 / self.EXPLORE
                # a_t = np.zeros([self.action_dim])
                # noise_t = np.zeros([self.action_dim])

                with session.as_default():
                    with session.graph.as_default():

                        a_t_original = actor.model.predict(state.reshape(1, state.shape[0]))[0].tolist()
                        # noise_t[0] = max(self.epsilon, 0) * self.ou.function(a_t_original[0],  0.0 , 0.60, 0.30)  # decide theta, sigma and mu for limitprice

                        # a_t[0] = a_t_original[0] + noise_t[0]
                        # a_t[1] = a_t_original[1] + noise_t[1]

                        # print(a_t_original)
                        actions.append(list(a_t_original))

            except Exception as e:
                print(e)

        return actions

In [None]:
batch = replay_buffer.sample(n=BATCH_SIZE)
states = np.asarray(batch[batch.columns[0:25]])

lps = choose_Action(states)

In [None]:
for lp in lps:
    print(lp)

In [None]:
def save_models():

        with session.as_default():
                with session.graph.as_default():        

                    timestamp = int(datetime.datetime.now().timestamp())

                    actor.model.save_weights(model_storage_path + "/actormodel.h5", overwrite=True)
                    with open(model_storage_path + "/actormodel.json", "w") as outfile:
                        json.dump(actor.model.to_json(), outfile)

                    critic.model.save_weights(model_storage_path + "/criticmodel.h5", overwrite=True)
                    with open(model_storage_path + "/criticmodel.json", "w") as outfile:
                        json.dump(critic.model.to_json(), outfile)

                    actor.target_model.save_weights(model_storage_path + "/actortargetmodel.h5", overwrite=True)
                    with open(model_storage_path + "/actormodeltarget.json", "w") as outfile:
                        json.dump(actor.target_model.to_json(), outfile)

                    critic.target_model.save_weights(model_storage_path + "/critictargetmodel.h5", overwrite=True)
                    with open(model_storage_path + "/criticmodeltarget.json", "w") as outfile:
                        json.dump(critic.target_model.to_json(), outfile)

                    print("Models Saved Successfully !!!")


In [None]:
save_models()