In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

In [2]:
class DuelingDQN:
    """A bank of ``number`` independent (optionally dueling) DQN learners.

    Written against the TensorFlow 1.x graph API (``tf.Session``,
    ``tf.placeholder``).  Each learner ``i`` (called a "BS" in this code) owns
    one evaluation network and one target network.

    Observation layout
        A full observation is a flat vector of length
        ``number * (n_features - 2) + 2``: ``number`` consecutive groups of
        ``n_features - 2`` learner-specific values, followed by 2 values that
        are shared by every learner.  Learner ``i`` is fed its own group plus
        the 2 shared values, i.e. exactly ``n_features`` inputs.

    Replay-memory layout
        Every row is ``[s, a, r, s_]``.  ``a`` is a single number holding the
        joint action of all learners, written in base ``n_actions`` with
        learner 0 as the most significant digit (e.g. ``[1, 1, 0]`` with two
        actions is stored as ``6``).
    """

    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.001,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=200,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
            dueling=True,
            sess=None,
            number=3
    ):
        """Build the graph, the replay memory and (optionally) a session.

        Args:
            n_actions: number of discrete actions available to each learner.
            n_features: input width of each per-learner network.
            learning_rate: RMSProp learning rate.
            reward_decay: discount factor applied to the next-state value.
            e_greedy: upper bound for ``epsilon`` (probability of acting greedily).
            replace_target_iter: target networks are synced every this many
                ``learn`` calls.
            memory_size: number of rows in the replay memory.
            batch_size: rows sampled per learner in each ``learn`` call.
            e_greedy_increment: per-``learn`` increase of ``epsilon``; when
                ``None`` epsilon starts (and stays) at ``e_greedy``.
            output_graph: when true, write the graph to ``logs/``.
            dueling: use the dueling (value + advantage) head.
            sess: an existing ``tf.Session`` to use; when ``None`` a new one is
                created and all global variables are initialised.
            number: how many learners (network pairs) to build.
        """
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        self.dueling = dueling      # decide to use dueling DQN or not

        self.learn_step_counter = 0
        self.memory_counter = 0
        self.BS_number = number
        # One row per transition: s, joint action, reward, s_.
        self.memory = np.zeros((self.memory_size, self._state_width() * 2 + 2))
        self._build_net()
        if sess is None:
            self.sess = tf.Session()
            self.sess.run(tf.global_variables_initializer())
        else:
            self.sess = sess
        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)

    # ------------------------------------------------------------------
    # layout helpers
    # ------------------------------------------------------------------
    def _state_width(self):
        """Length of one full observation vector (all learners + 2 shared values)."""
        return self.BS_number * (self.n_features - 2) + 2

    def _agent_columns(self, i, offset=0):
        """Columns holding learner ``i``'s inputs inside a state that starts at ``offset``."""
        own = self.n_features - 2
        start = offset + i * own
        shared = offset + self.BS_number * own
        return list(range(start, start + own)) + [shared, shared + 1]

    def _encode_action(self, a):
        """Pack a per-learner action sequence into one base-``n_actions`` number.

        Scalars are assumed to be already encoded and are returned unchanged.
        """
        if np.ndim(a) == 0:
            return a
        if len(a) != self.BS_number:
            raise ValueError(
                'expected %d per-learner actions, got %d' % (self.BS_number, len(a)))
        joint = 0
        for digit in a:
            joint = joint * self.n_actions + int(digit)
        return joint

    def _decode_action(self, joint, i):
        """Extract learner ``i``'s action from encoded joint action(s)."""
        weight = self.n_actions ** (self.BS_number - 1 - i)
        return (joint // weight) % self.n_actions

    # ------------------------------------------------------------------
    # graph construction
    # ------------------------------------------------------------------
    def _build_net(self):
        """Create placeholders, eval/target networks, losses, train ops and sync ops."""

        def build_layers(s, c_names, n_l1, w_initializer, b_initializer):
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(s, w1) + b1)

            if self.dueling:
                # Dueling DQN.  NOTE: self.V / self.A are overwritten on every
                # call, so afterwards they refer to the last network built.
                with tf.variable_scope('Value'):
                    w2 = tf.get_variable('w2', [n_l1, 1], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, 1], initializer=b_initializer, collections=c_names)
                    self.V = tf.matmul(l1, w2) + b2

                with tf.variable_scope('Advantage'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    self.A = tf.matmul(l1, w2) + b2

                with tf.variable_scope('Q'):
                    out = self.V + (self.A - tf.reduce_mean(self.A, axis=1, keep_dims=True))     # Q = V(s) + A(s,a)
            else:
                with tf.variable_scope('Q'):
                    w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                    b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                    out = tf.matmul(l1, w2) + b2
            print('build %s'%c_names)
            return out

        # Layer configuration shared by every network.  Defined once, outside
        # the loops, so the target-net loop does not depend on loop leftovers.
        n_l1 = 20
        w_initializer = tf.random_normal_initializer(0., 0.3)
        b_initializer = tf.constant_initializer(0.1)

        # ------------------ build evaluate_net ------------------
        self.s = []
        self.q_target = []
        self.q_eval = []
        self.loss = []
        self._train_op = []
        self.s_ = []
        self.q_next = []

        for i in range(self.BS_number):
            self.s.append(tf.placeholder(tf.float32, [None, self.n_features], name='s'+str(i)))  # input
            self.q_target.append(tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target'+str(i)))  # for calculating loss
        for i in range(self.BS_number):
            print(i)
            with tf.variable_scope('eval_net'+str(i)):
                c_names = ['eval_net_params'+str(i), tf.GraphKeys.GLOBAL_VARIABLES]
                self.q_eval.append(build_layers(self.s[i], c_names, n_l1, w_initializer, b_initializer))

            with tf.variable_scope('loss'+str(i)):
                self.loss.append(tf.reduce_mean(tf.squared_difference(self.q_target[i], self.q_eval[i])))
        for i in range(self.BS_number):
            with tf.variable_scope('train'+str(i)):
                self._train_op.append(tf.train.RMSPropOptimizer(self.lr).minimize(self.loss[i]))
        print('build evaluate net succeed')

        # ------------------ build target_net ------------------
        for i in range(self.BS_number):
            self.s_.append(tf.placeholder(tf.float32, [None, self.n_features], name='s_'+str(i)))  # input
        for i in range(self.BS_number):
            with tf.variable_scope('target_net'+str(i)):
                c_names = ['target_net_params'+str(i), tf.GraphKeys.GLOBAL_VARIABLES]
                print(w_initializer)

                self.q_next.append(build_layers(self.s_[i], c_names, n_l1, w_initializer, b_initializer))

        # ------------------ target <- eval sync ops ------------------
        # Built once here: creating tf.assign ops on every sync would add new
        # nodes to the graph each time and grow it without bound.
        self.replace_target_op = []
        for i in range(self.BS_number):
            t_params = tf.get_collection('target_net_params'+str(i))
            e_params = tf.get_collection('eval_net_params'+str(i))
            self.replace_target_op.extend(tf.assign(t, e) for t, e in zip(t_params, e_params))
        # Everything above builds the N (= BS_number) pairs of neural networks.

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def store_transition(self, s, a, r, s_):
        """Write one transition into the ring-buffer replay memory.

        Args:
            s: full observation (length ``number * (n_features - 2) + 2``).
            a: either the per-learner action list returned by
                ``choose_action`` or an already-encoded joint action number.
            r: reward.
            s_: full next observation, same length as ``s``.
        """
        transition = np.hstack((s, [self._encode_action(a), r], s_))
        index = self.memory_counter % self.memory_size   # overwrite oldest row when full
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        """Pick one action per learner, epsilon-greedily.

        With probability ``epsilon`` a learner takes the arg-max of its
        evaluation network, otherwise a uniformly random action.

        Returns:
            list of ``number`` integer actions, one per learner.
        """
        observation = np.asarray(observation)
        action = [0] * self.BS_number
        for i in range(self.BS_number):
            observation_temp = observation[self._agent_columns(i)][np.newaxis, :]
            if np.random.uniform() < self.epsilon:  # choosing action
                actions_value = self.sess.run(self.q_eval[i], feed_dict={self.s[i]: observation_temp})
                action[i] = int(np.argmax(actions_value))
            else:
                action[i] = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        """Copy every evaluation network's parameters into its target network."""
        self.sess.run(self.replace_target_op)

    def learn(self):
        """Run one training step for every learner on a freshly sampled batch.

        Also syncs the target networks every ``replace_target_iter`` calls and
        advances ``epsilon`` towards ``epsilon_max``.  The most recent loss is
        left in ``self.cost``.
        """
        if self.learn_step_counter % self.replace_target_iter == 0:
            self._replace_target_params()
            print('\ntarget_params_replaced\n')

        state_width = self._state_width()
        action_col = state_width          # joint action sits right after s
        reward_col = state_width + 1
        next_offset = state_width + 2     # first column of s_
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        for i in range(self.BS_number):
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
            batch_memory = self.memory[sample_index, :]

            observation = batch_memory[:, self._agent_columns(i)]
            observation_ = batch_memory[:, self._agent_columns(i, offset=next_offset)]

            q_next = self.sess.run(self.q_next[i], feed_dict={self.s_[i]: observation_})
            q_eval = self.sess.run(self.q_eval[i], feed_dict={self.s[i]: observation})

            # Only the entry of the action actually taken gets a new target;
            # all other entries equal q_eval and so contribute zero error.
            q_target = q_eval.copy()
            joint_action = batch_memory[:, action_col].astype(int)
            eval_act_index = self._decode_action(joint_action, i)
            reward = batch_memory[:, reward_col]

            q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

            _, self.cost = self.sess.run([self._train_op[i], self.loss[i]],
                                         feed_dict={self.s[i]: observation,
                                                    self.q_target[i]: q_target})

        if self.epsilon < self.epsilon_max:
            self.epsilon = self.epsilon + self.epsilon_increment
        else:
            self.epsilon = self.epsilon_max
        self.learn_step_counter += 1




In [3]:
# Build a learner bank with 2 actions per learner and 10 inputs per network;
# output_graph=True also writes the TensorFlow graph to logs/.
a = DuelingDQN(
    n_actions=2,
    n_features=10,
    output_graph=True,
)

0
build ['eval_net_params0', 'variables']
1
build ['eval_net_params1', 'variables']
2
build ['eval_net_params2', 'variables']
build evaluate net succeed
<tensorflow.python.ops.init_ops.RandomNormal object at 0x7f7d461c1fd0>
build ['target_net_params0', 'variables']
<tensorflow.python.ops.init_ops.RandomNormal object at 0x7f7d461c1fd0>
build ['target_net_params1', 'variables']
<tensorflow.python.ops.init_ops.RandomNormal object at 0x7f7d461c1fd0>
build ['target_net_params2', 'variables']


In [7]:
# Flatten one (s, a, r, s_) sample into a single 1-D row.
# NOTE: s, a, r and s_ come from the cell numbered In [4].
transition = np.concatenate([np.atleast_1d(part) for part in (s, a, r, s_)])

In [4]:
# Toy (state, action, reward, next state) sample used to try out np.hstack.
s, s_ = [1, 2, 3, 4], [2, 4, 5, 6]
a, r = 6, 12

In [8]:
# Display the flattened transition row assembled in the cell numbered In [7].
transition

array([ 1,  2,  3,  4,  6, 12,  2,  4,  5,  6])

In [37]:
# Small 4x6 scratch matrix (rebinds `a`).
a = np.array([
    [1, 2, 3, 4, 5, 6],
    [4, 5, 6, 7, 10, 11],
    [1, 4, 5, 7, 14, 12],
    [4, 5, 7, 8, 44, 89],
])
# NOTE: `y` is not defined in this cell; it is whatever an earlier run left in the kernel.
y

array([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29],
       [ 30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
         56,  57,  58,  59],
       [ 60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,
         73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,
         86,  87,  88,  89],
       [ 90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
        116, 117, 118, 119],
       [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132,
        133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145,
        146, 147, 148, 149],
       [150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162,
        163, 164, 165, 166, 167, 168, 169, 170, 171,

In [41]:
# Column-selection experiment: for each of the 3 groups, pick that group's
# (feature - 2) own columns plus the 2 columns that follow all the groups.
feature = 6
group_count = 3
own_width = feature - 2
row_width = (own_width * group_count + 2) * 2 + 2
y = np.arange(row_width * 10).reshape(10, row_width)
shared_start = group_count * own_width
for i in range(group_count):
    print('bs',i)
    index = list(np.arange(i * own_width, (i + 1) * own_width))
    index += [shared_start, shared_start + 1]
    print(y[:,index])

bs 0
[[  0   1   2   3  12  13]
 [ 30  31  32  33  42  43]
 [ 60  61  62  63  72  73]
 [ 90  91  92  93 102 103]
 [120 121 122 123 132 133]
 [150 151 152 153 162 163]
 [180 181 182 183 192 193]
 [210 211 212 213 222 223]
 [240 241 242 243 252 253]
 [270 271 272 273 282 283]]
bs 1
[[  4   5   6   7  12  13]
 [ 34  35  36  37  42  43]
 [ 64  65  66  67  72  73]
 [ 94  95  96  97 102 103]
 [124 125 126 127 132 133]
 [154 155 156 157 162 163]
 [184 185 186 187 192 193]
 [214 215 216 217 222 223]
 [244 245 246 247 252 253]
 [274 275 276 277 282 283]]
bs 2
[[  8   9  10  11  12  13]
 [ 38  39  40  41  42  43]
 [ 68  69  70  71  72  73]
 [ 98  99 100 101 102 103]
 [128 129 130 131 132 133]
 [158 159 160 161 162 163]
 [188 189 190 191 192 193]
 [218 219 220 221 222 223]
 [248 249 250 251 252 253]
 [278 279 280 281 282 283]]


In [49]:
# Indexing a 1-D array with a list of positions returns those elements.
a = np.array(range(1, 5))
first_two = [0, 1]
a[first_two]

array([1, 2])