In [None]:
import datetime
import json
import os
import sys

import numpy as np
import pandas as pd

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

from ReplayBuffer import ReplayBuffer
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from OU import OU

In [None]:
# Relative directory into which save_models() writes the weight (.h5) and architecture (.json) files.
model_storage_path = "ddpg_v1.2" 

In [None]:
# --- Training hyperparameters ---
BUFFER_SIZE = 100000
BATCH_SIZE = 64      # rows sampled from the replay buffer for every update
EPOCHS = 10000       # number of batch updates run by train_ddpg_network()
GAMMA = 0.99         # discount factor applied to the target critic's Q-value
TAU = 0.001          # Target network hyperparameter (passed to both networks)
LRA = 0.0001         # Learning rate for Actor
LRC = 0.001          # Learning rate for Critic

# Action: [
#           limitprice1 scalar in [0, 1]
#           limitprice2 scalar in [0, 1]
#         ] (two output neurons)
action_dim = 2

# State: [
#          Proximity (1)
#          Balancing_Price (1)
#          Required_Quantity (1)
#        ]
state_dim = 3

# Seed both sources of randomness. NumPy's global generator is also what
# pandas' DataFrame.sample() falls back to when no random_state is given;
# the TensorFlow graph-level seed was previously left unset, so variable
# initialisation differed between otherwise identical runs.
np.random.seed(1337)
tf.set_random_seed(1337)

# Exploration state. In the cells shown here these are only referenced by the
# commented-out noise code inside choose_Action().
EXPLORE = 100000.0

step = 0
epsilon = 1

ou = OU()       # Ornstein-Uhlenbeck process

In [None]:
# Build the session configuration in a single expression: one visible GPU,
# single-threaded intra-op execution, soft device placement, and GPU memory
# that is allocated on demand but capped at 60% of the device.
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True,
    gpu_options=tf.GPUOptions(
        allow_growth=True,
        per_process_gpu_memory_fraction=0.6,
    ),
)

# Session shared by the actor/critic networks and every later cell.
session = tf.Session(config=config)

In [None]:
# Actor and critic wrappers bound to the shared session. Later cells use the
# `.model` and `.target_model` attributes of both objects.
actor = ActorNetwork(session, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
critic = CriticNetwork(session, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

# Directory holding the recorded transitions. The default is the original
# workstation path; set the DDPG_DATA_DIR environment variable to run the
# notebook elsewhere without editing this cell.
data_storage_path = os.environ.get(
    "DDPG_DATA_DIR",
    "/mnt/d/PowerTAC/PowerTAC2021/experiments_scripts/powertac_simulator_py/ddpg_based_wholesale_strategy_powertac/",
)

# The CSV has no header row. os.path.join tolerates a directory given with or
# without a trailing separator.
replay_buffer = pd.read_csv(os.path.join(data_storage_path, 'replay_buffer.csv'), header=None)

# Training slices each row positionally as
# state | action | reward | next state | terminal flag, so fail loudly here if
# the file is too narrow instead of producing empty slices later.
if replay_buffer.shape[1] < 2 * state_dim + action_dim + 2:
    raise ValueError(
        "replay_buffer.csv has {} columns but at least {} are required "
        "(state, action, reward, next state, terminal)".format(
            replay_buffer.shape[1], 2 * state_dim + action_dim + 2
        )
    )

In [None]:
def train_ddpg_network(epochs=None):
    """Run offline DDPG updates on mini-batches drawn from ``replay_buffer``.

    Every epoch samples ``BATCH_SIZE`` rows, builds Bellman targets from the
    target networks, fits the critic on them, updates the actor from the
    critic's gradients and finally calls ``target_train()`` on both networks
    (presumably the TAU-weighted target update -- confirm in
    ActorNetwork/CriticNetwork).

    Rows of ``replay_buffer`` are read positionally as
    ``state | action | reward | next state | terminal flag``, with the widths
    taken from ``state_dim`` and ``action_dim``.

    Args:
        epochs: Number of batch updates to run. When omitted, the module-level
            ``EPOCHS`` is used, which is the original behaviour.

    Returns:
        None. The epoch number and critic loss are printed for every epoch.
    """
    n_epochs = EPOCHS if epochs is None else epochs

    # Column boundaries derived from the dimensions instead of literal offsets,
    # so the layout cannot drift away from state_dim / action_dim.
    state_end = state_dim
    action_end = state_end + action_dim
    reward_end = action_end + 1
    next_state_end = reward_end + state_dim
    terminal_end = next_state_end + 1

    # The session/graph contexts do not change between epochs; enter them once.
    with session.as_default(), session.graph.as_default():

        for epoch in range(n_epochs):

            print("Epoch ", (epoch+1))
            print("-"*12)
            loss = 0

            # Draw one mini-batch and split it into its components.
            batch = replay_buffer.sample(n=BATCH_SIZE)
            states = np.asarray(batch.iloc[:, :state_end])
            actions = np.asarray(batch.iloc[:, state_end:action_end])
            rewards = np.asarray(batch.iloc[:, action_end:reward_end])
            new_states = np.asarray(batch.iloc[:, reward_end:next_state_end])
            terminals = np.asarray(batch.iloc[:, next_state_end:terminal_end])

            # Q'(s', mu'(s')) from the target critic and target actor.
            target_q_values = critic.target_model.predict(
                [new_states, actor.target_model.predict(new_states)]
            )
            next_q = np.asarray(target_q_values).reshape(BATCH_SIZE, -1)

            # Bellman targets for the whole batch at once: terminal rows keep
            # the bare reward, all others add the discounted target Q-value.
            # Writing into the preallocated (BATCH_SIZE, 1) array keeps the
            # shape check the per-row loop had: a critic output wider than one
            # column still raises instead of silently broadcasting.
            y_t = np.zeros([BATCH_SIZE, 1])
            y_t[:] = np.where(terminals == 1, rewards, rewards + GAMMA * next_q)

            loss += critic.model.train_on_batch([states, actions], y_t)
            print("Loss", loss)

            # Actor step: the critic's gradients are evaluated at the actor's
            # *current* output for these states, as in standard DDPG, not at
            # the logged actions from the buffer.
            a_for_grad = actor.model.predict(states)
            grads = critic.gradients(states, a_for_grad)
            actor.train(states, grads)

            actor.target_train()
            critic.target_train()

    print("Training Completed !!!")

In [None]:
# Run the full offline training loop (EPOCHS batch updates); prints the loss for every epoch.
train_ddpg_network()

In [None]:
def choose_Action(states):
    """Return the actor's action for every given state, without exploration noise.

    All states are pushed through ``actor.model`` in a single ``predict`` call
    rather than one call per state.

    Args:
        states: Array-like of states, one row per state. A single 1-D state is
            also accepted and treated as a batch of one. The row width must be
            whatever ``actor.model`` expects (presumably ``state_dim``).

    Returns:
        list[list[float]]: One action per input row, in input order. Empty
        input yields an empty list.

    Raises:
        Any exception raised by ``actor.model.predict``. Earlier versions
        printed the error and skipped the state, which silently returned fewer
        actions than states and broke the state/action pairing.
    """
    state_batch = np.atleast_2d(np.asarray(states))

    # Nothing to predict; also avoids handing an empty batch to the model.
    if state_batch.size == 0:
        return []

    with session.as_default(), session.graph.as_default():
        return actor.model.predict(state_batch).tolist()

In [None]:
# Spot-check the trained actor: take ten random transitions, keep only their
# first three columns (the state) and print the action chosen for each.
batch = replay_buffer.sample(n=10)
states = batch.iloc[:, 0:3].to_numpy()

lps = choose_Action(states)

for lp in lps:
    print(lp)

In [None]:
def save_models():
    """Write weights and architecture of all four networks under ``model_storage_path``.

    For the actor, the critic and both target models this saves the weights
    via ``save_weights`` and the architecture via ``to_json``. The output
    directory is created first if it does not exist yet.

    The architecture files contain ``json.dump(model.to_json())``, i.e. the
    JSON text produced by ``to_json()`` encoded once more as a JSON string.
    This is kept on purpose so loaders that call ``json.load`` on these files
    keep working.

    Returns:
        None. Prints a confirmation once every file has been written.
    """
    # (model, weights file, architecture file). File names are unchanged,
    # including the inconsistent "...targetmodel.h5" / "...modeltarget.json"
    # word order, because existing loaders depend on them.
    artifacts = (
        (actor.model, "actormodel.h5", "actormodel.json"),
        (critic.model, "criticmodel.h5", "criticmodel.json"),
        (actor.target_model, "actortargetmodel.h5", "actormodeltarget.json"),
        (critic.target_model, "critictargetmodel.h5", "criticmodeltarget.json"),
    )

    # Writing into a missing directory would otherwise fail on the first file.
    if model_storage_path:
        os.makedirs(model_storage_path, exist_ok=True)

    with session.as_default(), session.graph.as_default():

        for model, weights_file, architecture_file in artifacts:
            model.save_weights(model_storage_path + "/" + weights_file, overwrite=True)
            with open(model_storage_path + "/" + architecture_file, "w") as outfile:
                json.dump(model.to_json(), outfile)

        print("Models Saved Successfully !!!")


In [None]:
# Persist the four networks (actor, critic and their target copies) under model_storage_path.
save_models()