In [1]:
import sys
import time
sys.path.insert(1, './gym_powertac')

from powertac_wm import PowerTAC_WM
import gym
import datetime
import json
import numpy as np
import pandas as pd

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

from ReplayBuffer import ReplayBuffer
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from OU import OU

from sklearn import preprocessing
from matplotlib import pyplot as plt
%matplotlib notebook

Instructions for updating:
non-resource variables are not supported in the long term


In [2]:
# Directory (relative to the notebook's working directory) into which
# save_models() writes the weight (.h5) and architecture (.json) files.
model_storage_path = "ddpg_v0.0" 

In [3]:
# --- Training hyperparameters -------------------------------------------
BATCH_SIZE = 64     # Transitions sampled per update
EPOCHS = 1000       # Number of minibatch updates performed by the training loop
GAMMA = 0.99        # Discount applied to the target Q-value
TAU = 0.001         # Target-network hyperparameter
LRA = 0.0001        # Learning rate for the actor
LRC = 0.001         # Learning rate for the critic

# --- Network dimensions -------------------------------------------------
# Action: [limitprice], a real number -> one output neuron.
action_dim = 1

# State: [Proximity (24), Required_Quantity (1)] -> 25 input neurons in total.
state_dim = 25

# --- Reproducibility and exploration settings ---------------------------
np.random.seed(1337)
EXPLORE = 100000.0

step = 0
epsilon = 1

ou = OU()           # Ornstein-Uhlenbeck process

In [4]:
# Session configuration: one visible GPU, a single intra-op thread, soft
# device placement, and GPU memory that grows on demand with a per-process
# memory fraction of 0.6.
config = tf.ConfigProto(
    device_count={'GPU': 1},
    intra_op_parallelism_threads=1,
    allow_soft_placement=True,
    gpu_options=tf.GPUOptions(
        allow_growth=True,
        per_process_gpu_memory_fraction=0.6,
    ),
)

# Single session shared by the actor, the critic and every helper below.
session = tf.Session(config=config)

In [5]:
# Build the actor and critic on the shared TF session. Both receive the same
# state/action dimensions, batch size and TAU; each gets its own learning rate.
actor = ActorNetwork(session, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
critic = CriticNetwork(session, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
# Load the pre-collected transitions. header=None makes pandas label the
# columns 0..N-1, which the positional column slicing further down relies on.
replay_buffer = pd.read_csv('replay_buffer_big.csv', header=None)


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 25)]              0         
_________________________________________________________________
dense (Dense)                (None, 400)               10400     
_________________________________________________________________
dense_1 (Dense)              (None, 300)               120300    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 301       
_________________________________________________________________
concatenate (Concatenate)    (None, 1)                 0         
Total params: 131,001
Trainable params: 131,001
Non-trainable params: 0
_________________________________________________________________
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output S

In [6]:
# Sanity check on the loaded buffer: (rows, columns). The training loop slices
# every row as 25 state + 1 action + 1 reward + 25 next-state + 1 terminal
# values, i.e. 53 columns.
replay_buffer.shape

(166987, 53)

In [7]:
# scaler_robust = preprocessing.RobustScaler(quantile_range=(15.0, 85.0))
# robust_replay_buffer = scaler_robust.fit_transform(replay_buffer)

# scaler_minmax = preprocessing.MinMaxScaler()
# minmax_replay_buffer = scaler_minmax.fit_transform(robust_replay_buffer)

In [8]:
# Standardize (zero mean, unit variance per column) the unbounded columns of
# the replay buffer. By the row layout used in the training loop these are:
#   24 -> last state feature (presumably Required_Quantity -- verify),
#   25 -> action, 26 -> reward,
#   51 -> the same last feature inside the next-state slice (27 + 24).
# NOTE(review): columns 24 and 51 hold the same feature for s and s' but are
# scaled with separately fitted statistics; confirm this is intended.
#
# Work on a copy: a plain `standard_replay_buffer = replay_buffer` only aliases
# the frame, so the assignment below would silently overwrite the raw
# `replay_buffer` as well and the unscaled data would be lost to later cells.
standard_replay_buffer = replay_buffer.copy()
scaler_standard = preprocessing.StandardScaler()
scaled_column_labels = standard_replay_buffer.columns[[24, 25, 26, 51]]
standard_replay_buffer[scaled_column_labels] = scaler_standard.fit_transform(
    standard_replay_buffer[scaled_column_labels]
)
# Downstream code indexes the buffer positionally, so hand it over as ndarray.
standard_replay_buffer = np.asarray(standard_replay_buffer)

In [9]:
def train_ddpg_network(epochs=None, log_every=1):
    """Train the DDPG actor and critic offline from the standardized replay buffer.

    Every iteration (called an "epoch" in this notebook) performs exactly one
    minibatch update:

    1. sample ``BATCH_SIZE`` transitions, with replacement, from
       ``standard_replay_buffer``;
    2. build the one-step Bellman targets from the target networks;
    3. fit the critic on those targets with ``train_on_batch``;
    4. update the actor with the critic's gradients evaluated at the actor's
       *current* actions for the sampled states;
    5. call ``target_train()`` on both networks.

    Relies on the notebook-level globals ``session``, ``actor``, ``critic``,
    ``standard_replay_buffer``, ``BATCH_SIZE``, ``GAMMA`` and ``EPOCHS``.

    Args:
        epochs: Number of minibatch updates to run. ``None`` (default) uses
            the global ``EPOCHS``.
        log_every: Print the iteration header and critic loss every
            ``log_every`` iterations. The default ``1`` prints every
            iteration (the original behaviour); ``0`` or ``None`` silences
            the per-iteration output.

    Returns:
        tuple: ``(epoch_list, loss_list)`` -- the zero-based iteration indices
        and the critic loss of each iteration as Python floats.
    """
    if epochs is None:
        epochs = EPOCHS

    epoch_list = list()
    loss_list = list()

    start = time.time()

    for epoch in range(epochs):

        verbose = bool(log_every) and (epoch + 1) % log_every == 0
        if verbose:
            print("Epoch ", (epoch+1))
            print("-"*12)

        # Draw a random minibatch (sampling with replacement) and split each
        # row into its (state, action, reward, next state, terminal) parts.
        sample_idx = np.random.randint(standard_replay_buffer.shape[0], size=BATCH_SIZE)
        batch = standard_replay_buffer[sample_idx, :]
        states = batch[:, 0:25]
        actions = batch[:, 25:26]
        rewards = batch[:, 26:27]
        new_states = batch[:, 27:52]
        terminals = batch[:, 52:53]

        with session.as_default():
            with session.graph.as_default():

                # Q'(s', mu'(s')) from the target networks, forced to a column
                # so it lines up element-wise with rewards / terminals.
                target_actions = actor.target_model.predict(new_states)
                target_q_values = np.asarray(
                    critic.target_model.predict([new_states, target_actions])
                ).reshape(-1, 1)

                # Bellman targets for the whole batch at once: terminal
                # transitions keep the bare reward, all others bootstrap.
                y_t = np.where(
                    terminals == 1,
                    rewards,
                    rewards + GAMMA * target_q_values,
                )

                # float(...) keeps the logged value a plain Python float.
                loss = float(critic.model.train_on_batch([states, actions], y_t))
                if verbose:
                    print("Loss", loss)
                epoch_list.append(epoch)
                loss_list.append(loss)

                # The policy gradient is evaluated at the actor's current
                # output for the sampled states, not at the stored actions
                # from the buffer: the update follows dQ/da at a = mu(s).
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

    end = time.time()
    print("Training Completed in {} seconds!!!".format(end-start))
    return epoch_list, loss_list

In [10]:
def choose_Action(states):
    """Return the actor's action for every state in ``states``.

    Each state is fed to ``actor.model`` on its own, as a batch of one. No
    exploration noise is added, so the result is the network's raw output.

    Relies on the notebook-level globals ``session`` and ``actor``.

    Args:
        states: Iterable of 1-D state vectors (for example a 2-D array with
            one state per row).

    Returns:
        list: One entry per input state, in input order. Each entry is the
        predicted action as a list of floats, or ``None`` when prediction for
        that state failed. The error is printed; the placeholder keeps the
        result aligned with ``states`` instead of silently dropping the entry.
    """
    actions = list()

    for state in states:

        try:
            with session.as_default():
                with session.graph.as_default():
                    # The model expects a batch axis: reshape to (1, n_features).
                    state_row = np.asarray(state).reshape(1, -1)
                    action = actor.model.predict(state_row)[0].tolist()

        except Exception as e:
            # Best effort: report the failure but keep positions aligned.
            print(e)
            actions.append(None)

        else:
            actions.append(list(action))

    return actions

In [11]:
def save_models():
    """Persist weights and architecture of all four DDPG networks.

    For the actor, the critic and both target networks this writes the
    weights to an ``.h5`` file and the ``to_json()`` architecture to a
    ``.json`` file inside ``model_storage_path``. The directory is created
    when it does not exist yet; existing files are overwritten.

    Relies on the notebook-level globals ``session``, ``actor``, ``critic``
    and ``model_storage_path``.
    """
    import os  # local import: `os` is not imported in the notebook's import cell

    with session.as_default():
        with session.graph.as_default():

            # save_weights() does not create missing parent directories.
            os.makedirs(model_storage_path, exist_ok=True)

            # (model, weights file, architecture file). The file names --
            # including the inconsistent "...targetmodel.h5" versus
            # "...modeltarget.json" ordering -- are kept exactly as they were
            # so that existing loading code keeps finding them.
            artifacts = (
                (actor.model, "actormodel.h5", "actormodel.json"),
                (critic.model, "criticmodel.h5", "criticmodel.json"),
                (actor.target_model, "actortargetmodel.h5", "actormodeltarget.json"),
                (critic.target_model, "critictargetmodel.h5", "criticmodeltarget.json"),
            )

            for model, weights_name, json_name in artifacts:
                model.save_weights(model_storage_path + "/" + weights_name, overwrite=True)
                with open(model_storage_path + "/" + json_name, "w") as outfile:
                    # to_json() already returns a JSON string, so the file holds
                    # that string JSON-encoded once more; readers must
                    # json.load() it first. Kept for compatibility.
                    json.dump(model.to_json(), outfile)

            print("Models Saved Successfully !!!")


In [12]:
# Run the offline training loop; it returns the iteration indices and the
# critic loss recorded for each iteration.
epoch_list, loss_list = train_ddpg_network()

Epoch  1
------------




Loss 0.3111554682254791
Epoch  2
------------
Loss 0.15867307782173157
Epoch  3
------------
Loss 0.4277224540710449
Epoch  4
------------
Loss 0.6207340359687805
Epoch  5
------------
Loss 2.070405960083008
Epoch  6
------------
Loss 0.2100202441215515
Epoch  7
------------
Loss 0.22793734073638916
Epoch  8
------------
Loss 3.386211395263672
Epoch  9
------------
Loss 0.3099098205566406
Epoch  10
------------
Loss 0.23517154157161713
Epoch  11
------------
Loss 0.30381494760513306
Epoch  12
------------
Loss 0.6238116025924683
Epoch  13
------------
Loss 0.12479469925165176
Epoch  14
------------
Loss 2.180464267730713
Epoch  15
------------
Loss 0.3014506995677948
Epoch  16
------------
Loss 0.6192029714584351
Epoch  17
------------
Loss 0.4792523682117462
Epoch  18
------------
Loss 0.4355328381061554
Epoch  19
------------
Loss 0.23651981353759766
Epoch  20
------------
Loss 0.38488292694091797
Epoch  21
------------
Loss 0.2386743426322937
Epoch  22
------------
Loss 0.3979825973

Epoch  174
------------
Loss 0.19863158464431763
Epoch  175
------------
Loss 0.13305261731147766
Epoch  176
------------
Loss 0.34714922308921814
Epoch  177
------------
Loss 0.2647610306739807
Epoch  178
------------
Loss 1.0812830924987793
Epoch  179
------------
Loss 0.2795872390270233
Epoch  180
------------
Loss 1.502698302268982
Epoch  181
------------
Loss 0.26199790835380554
Epoch  182
------------
Loss 1.1468106508255005
Epoch  183
------------
Loss 0.320489764213562
Epoch  184
------------
Loss 0.09949745237827301
Epoch  185
------------
Loss 0.446553498506546
Epoch  186
------------
Loss 0.5166959762573242
Epoch  187
------------
Loss 0.4070974588394165
Epoch  188
------------
Loss 0.11185409128665924
Epoch  189
------------
Loss 0.29403993487358093
Epoch  190
------------
Loss 0.4319054186344147
Epoch  191
------------
Loss 0.39477303624153137
Epoch  192
------------
Loss 0.7031095027923584
Epoch  193
------------
Loss 0.26564157009124756
Epoch  194
------------
Loss 0.264

Loss 0.21461370587348938
Epoch  350
------------
Loss 0.08079053461551666
Epoch  351
------------
Loss 0.19984889030456543
Epoch  352
------------
Loss 0.18941441178321838
Epoch  353
------------
Loss 0.1125742644071579
Epoch  354
------------
Loss 5.8687849044799805
Epoch  355
------------
Loss 0.3541736602783203
Epoch  356
------------
Loss 2.8715052604675293
Epoch  357
------------
Loss 0.14649704098701477
Epoch  358
------------
Loss 0.31270647048950195
Epoch  359
------------
Loss 0.8310509324073792
Epoch  360
------------
Loss 0.36560487747192383
Epoch  361
------------
Loss 0.4988681674003601
Epoch  362
------------
Loss 0.29078176617622375
Epoch  363
------------
Loss 0.12670180201530457
Epoch  364
------------
Loss 0.2657143175601959
Epoch  365
------------
Loss 0.6066574454307556
Epoch  366
------------
Loss 2.1050634384155273
Epoch  367
------------
Loss 0.05124688148498535
Epoch  368
------------
Loss 0.18209098279476166
Epoch  369
------------
Loss 0.4440533518791199
Epoch

Loss 1.3447611331939697
Epoch  519
------------
Loss 0.12362032383680344
Epoch  520
------------
Loss 0.1713598519563675
Epoch  521
------------
Loss 0.1313333362340927
Epoch  522
------------
Loss 0.19222547113895416
Epoch  523
------------
Loss 0.4047573208808899
Epoch  524
------------
Loss 0.49853515625
Epoch  525
------------
Loss 0.15362900495529175
Epoch  526
------------
Loss 0.13417068123817444
Epoch  527
------------
Loss 0.17510591447353363
Epoch  528
------------
Loss 0.1651243269443512
Epoch  529
------------
Loss 0.34908851981163025
Epoch  530
------------
Loss 2.812795400619507
Epoch  531
------------
Loss 0.1567145735025406
Epoch  532
------------
Loss 0.12899407744407654
Epoch  533
------------
Loss 0.09656067192554474
Epoch  534
------------
Loss 1.430200219154358
Epoch  535
------------
Loss 0.21564261615276337
Epoch  536
------------
Loss 0.08160051703453064
Epoch  537
------------
Loss 6.5515336990356445
Epoch  538
------------
Loss 0.32313936948776245
Epoch  539
-

Loss 0.34038880467414856
Epoch  689
------------
Loss 0.18470895290374756
Epoch  690
------------
Loss 0.1296493411064148
Epoch  691
------------
Loss 0.3070521652698517
Epoch  692
------------
Loss 0.27869608998298645
Epoch  693
------------
Loss 0.19353854656219482
Epoch  694
------------
Loss 0.5416972637176514
Epoch  695
------------
Loss 0.4953286051750183
Epoch  696
------------
Loss 0.07096809148788452
Epoch  697
------------
Loss 0.28264519572257996
Epoch  698
------------
Loss 0.27692288160324097
Epoch  699
------------
Loss 0.17841637134552002
Epoch  700
------------
Loss 0.16077634692192078
Epoch  701
------------
Loss 0.07245775312185287
Epoch  702
------------
Loss 0.07059796154499054
Epoch  703
------------
Loss 0.27330559492111206
Epoch  704
------------
Loss 0.24077308177947998
Epoch  705
------------
Loss 0.09891823679208755
Epoch  706
------------
Loss 4.188687324523926
Epoch  707
------------
Loss 0.014553818851709366
Epoch  708
------------
Loss 0.7615031003952026
E

Epoch  863
------------
Loss 1.5326100587844849
Epoch  864
------------
Loss 0.42358124256134033
Epoch  865
------------
Loss 0.2351755052804947
Epoch  866
------------
Loss 0.21224141120910645
Epoch  867
------------
Loss 0.18080854415893555
Epoch  868
------------
Loss 0.24248656630516052
Epoch  869
------------
Loss 0.16771450638771057
Epoch  870
------------
Loss 0.4545997083187103
Epoch  871
------------
Loss 0.09158407896757126
Epoch  872
------------
Loss 0.09368880093097687
Epoch  873
------------
Loss 0.24783079326152802
Epoch  874
------------
Loss 0.18188107013702393
Epoch  875
------------
Loss 0.44844406843185425
Epoch  876
------------
Loss 0.2608586549758911
Epoch  877
------------
Loss 1.808258056640625
Epoch  878
------------
Loss 0.2322089672088623
Epoch  879
------------
Loss 0.189533531665802
Epoch  880
------------
Loss 0.40743234753608704
Epoch  881
------------
Loss 1.4013537168502808
Epoch  882
------------
Loss 0.3161509931087494
Epoch  883
------------
Loss 0.

In [13]:
# Spot-check the trained actor: draw 10 random rows from the standardized
# buffer, keep only their state columns (0-24) and query the actor for each.
batch = standard_replay_buffer[np.random.randint(standard_replay_buffer.shape[0], size=10), :]
states = batch[:, 0:25]
lps = choose_Action(states)

In [14]:
# Print the predicted action (a one-element list) for each sampled state.
# NOTE(review): in the recorded run every value lies between ~0.978 and ~1.0;
# worth checking whether the actor's output is saturating -- confirm against
# the output activation defined in ActorNetwork.
for lp in lps:
    print(lp)

[0.982548713684082]
[0.9938387274742126]
[0.9893948435783386]
[0.9998452067375183]
[0.9778561592102051]
[0.9833648204803467]
[0.9986892342567444]
[0.9892503023147583]
[0.9813749194145203]
[0.9946772456169128]
