# Simulation for Integrated Networking with DQN
本文对Integrated networking进行仿真。首先定义使用的一些基本参数，这些参数基本都取自原paper的simulation一节；由于是对单个用户进行仿真，整体的一些限制条件也许难以完全考虑到。

In [None]:
import matplotlib 
import matplotlib.pyplot as plt
import numpy as np
import time 
import sys
import tensorflow as tf
from tqdm import tqdm
# Seed NumPy and TensorFlow so the random draws of a run can be reproduced.
np.random.seed(1)
tf.set_random_seed(1)
# IPython magic: render matplotlib figures inside the notebook.
%matplotlib inline

In [None]:
# Sizes of the three resource groups. The state vector holds K + M + C entries
# (channels, then MEC servers, then caches) and run() flattens an action
# (k, m, c) into the single index c + m * C + k * M * C.
K = 5 # The number of base stations
M = 5 # The number of MEC servers
C = 5 # The number of caches

In [None]:
# Per-step probabilities that drive the random state evolution in move().
channel_state = 0.3 # Transition probability of channels (flip between the two efficiency levels)
cache_state = 0.4 # Transition probability of cache state (flip between 0 and 1)
# NOTE(review): move() hard-codes its thresholds instead of reading this matrix;
# keep the two in sync when changing either.
computation_state = [[0.6,0.3,0.1],[0.3,0.1,0.6],[0.1,0.6,0.3]] # Transition probability of computation capability: Good, Medium, Bad

In [None]:
# Pricing and physical parameters of the simulated network; units are given in
# the trailing comments. Identifier spellings are kept because other cells
# reference these names.
bandwith = 5 # MHz
access_charge = 10 # units/Mbps
spectrum_charge = 2 # units/MHz
mec_charge = 1 # units/Mbps
CPU_cycle = 100 # Mcycles
content_size = 1 # Mbits
spectrum_efficiency = [1,3] # bps/Hz; the two values a channel entry of the state takes in move()
cache_charge = 3 # units/Mbps
computation_capbility = [4,8,12] # GHz; the three values a MEC entry of the state takes in move()
energy_pay = 100 # units/J
energy_consumption = 1 # W/GHz, energy consumption for performing one CPU cycle
effect_factor = 0.5 # NOTE(review): scales action[0] in the access-revenue term of move(); meaning not documented here
cache_pay = 3 # units/Mbits

# state
state是每个基站的状态，[K(每个基站的snr), M(每个基站的计算能力(4,8,12)), C(每个基站[1,2,..I]均为0或1，表示对应文件相应的缓存)]

snr的计算分为L个层级，对应马尔可夫过程，我觉得好像就是spectrum efficiency的两种变化

# action
action包括三部分：

- 连哪个基站：(1,2,...,k){0,1}
- 卸载给哪个MEC：(1,2,...,m){0,1}
- 是否有缓存：(1,2,...,c){0,1}

In [None]:
# Initial observation, laid out as K channel entries (value 1), then M MEC
# entries (value 4), then C cache entries (value 0).
state = np.repeat([1, 4, 0], [K, M, C])
# Length of the observation vector, i.e. K + M + C.
n_feature = state.size

# 行动
每一步的action会改变当前的state，也会带来一个即时的reward,之后的action会有一个概率随机行事，否则将会以史为鉴。state同时也会根据所提供的马尔科夫概率进行转变

In [None]:
def move(action):
    """Score ``action`` against the current state, then advance the environment.

    The reward is computed from the module-level ``state`` as it is *before*
    the transition. Afterwards every entry of ``state`` is updated in place
    with one independent uniform draw per entry.

    Args:
        action: Indexable triple ``(k, m, c)`` holding the chosen base-station,
            MEC-server and cache indices (each relative to its own group).

    Returns:
        A tuple ``(origin, reward)`` where ``origin`` is a copy of the state
        before the transition and ``reward`` is the immediate reward.
    """
    origin = state.copy()

    channel = state[action[0]]
    capability = state[action[1] + K]
    cached = state[action[2] + K + M]

    # Reward brought by the action. The expression is parenthesised on purpose:
    # without it the assignment ends after the first line and the MEC and cache
    # terms are evaluated as a separate statement and thrown away.
    reward = (
        channel * access_charge * bandwith * (1 - effect_factor * action[0])
        - spectrum_charge * bandwith
        + content_size * capability * mec_charge / CPU_cycle
        + cache_charge * bandwith * channel * cached
        - cache_pay * content_size
    )

    # In this paper, the transition of states is described by Markov process,
    # which is so unreasonable in the real world.

    # Channels: flip between the two spectrum-efficiency levels (1 <-> 3).
    for i in range(K):
        draw = np.random.uniform()
        if draw < channel_state:
            if state[i] == 1:
                state[i] = 3
            elif state[i] == 3:
                state[i] = 1

    # MEC servers: exactly one transition per step. The branches must be
    # mutually exclusive (elif); otherwise the value just written is matched
    # again by a later branch using the same draw. The thresholds are the
    # cumulative probabilities listed in ``computation_state``.
    for i in range(K, K + M):
        draw = np.random.uniform()
        if state[i] == 4:
            if draw < 0.1:
                state[i] = 12
            elif draw < 0.7:
                state[i] = 8
        elif state[i] == 8:
            if draw < 0.3:
                state[i] = 12
            elif draw < 0.9:
                state[i] = 4
        elif state[i] == 12:
            if draw < 0.1:
                state[i] = 4
            elif draw < 0.4:
                state[i] = 8

    # Caches: flip between 0 and 1.
    for i in range(K + M, K + M + C):
        draw = np.random.uniform()
        if draw < cache_state:
            if state[i] == 0:
                state[i] = 1
            elif state[i] == 1:
                state[i] = 0

    return origin, reward

# Deep Q Network
将完整的DQN封装成一个类以方便日后调用和一些参数的调试。我现在仍然认为，基于DQN的深度增强学习就是用DNN对一个动态规划进行拟合，但类似于本文中这种多变的马尔科夫支持的环境的具体效果，并未可知。

In [None]:
class DeepQNetwork:
    """DQN agent with an evaluation net, a target net and a replay memory.

    The evaluation net is trained on mini-batches sampled from a fixed-size
    ring buffer of ``[s, a, r, s_]`` rows; the target net is overwritten with
    the evaluation net's weights every ``replace_target_iter`` learn steps.

    Actions are handled in two forms: the network works on a flat index, while
    ``choose_action`` returns a ``[k, m, c]`` triple decoded with the
    module-level sizes ``K``, ``M`` and ``C``.
    """

    # Number of transitions stored so far. Declared on the class so learn()
    # can read it before the first store_transition() call.
    memory_counter = 0

    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        """Build the TensorFlow graph, open a session and initialise variables.

        Args:
            n_actions: Width of the Q-value output layer.
            n_features: Length of one observation vector.
            learning_rate: Learning rate passed to RMSProp.
            reward_decay: Discount factor applied to the target net's max Q.
            e_greedy: Upper bound of ``epsilon``, the probability of acting
                greedily in ``choose_action``.
            replace_target_iter: Number of learn steps between target-net syncs.
            memory_size: Capacity (rows) of the replay memory.
            batch_size: Number of rows sampled per learn step.
            e_greedy_increment: Amount added to ``epsilon`` after each learn
                step; ``None`` keeps ``epsilon`` fixed at ``e_greedy``.
            output_graph: If true, write the graph to ``logs/`` for TensorBoard.
        """
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        # Start fully random when annealing is requested, otherwise start at the cap.
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
        self.memory_counter = 0

        # consist of [target_net, evaluate_net]
        self._build_net()

        t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_net')
        e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='eval_net')

        # One assign op per variable pair: copies eval-net weights into the target net.
        with tf.variable_scope('soft_replacement'):
            self.target_replace_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.cost_his = []  # loss value recorded at every learn step

    def _build_net(self):
        """Create placeholders, both networks, the TD target, the loss and the train op."""
        # ------------------ all inputs ------------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input State
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input Next State
        self.r = tf.placeholder(tf.float32, [None, ], name='r')  # input Reward
        self.a = tf.placeholder(tf.int32, [None, ], name='a')  # input Action

        w_initializer, b_initializer = tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

        # ------------------ build evaluate_net ------------------
        # One hidden ReLU layer of 20 units, linear output with one Q-value per action.
        with tf.variable_scope('eval_net'):
            e1 = tf.layers.dense(self.s, 20, tf.nn.relu, kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer, name='e1')
            self.q_eval = tf.layers.dense(e1, self.n_actions, kernel_initializer=w_initializer,
                                          bias_initializer=b_initializer, name='q')

        # ------------------ build target_net ------------------
        # Same architecture, fed with the next state.
        with tf.variable_scope('target_net'):
            t1 = tf.layers.dense(self.s_, 20, tf.nn.relu, kernel_initializer=w_initializer,
                                 bias_initializer=b_initializer, name='t1')
            self.q_next = tf.layers.dense(t1, self.n_actions, kernel_initializer=w_initializer,
                                          bias_initializer=b_initializer, name='t2')

        with tf.variable_scope('q_target'):
            q_target = self.r + self.gamma * tf.reduce_max(self.q_next, axis=1, name='Qmax_s_')    # shape=(None, )
            # The target is treated as a constant: no gradient flows into the target net.
            self.q_target = tf.stop_gradient(q_target)

        with tf.variable_scope('q_eval'):
            # Pair each row number with its action index to pick Q(s, a) per sample.
            a_indices = tf.stack([tf.range(tf.shape(self.a)[0], dtype=tf.int32), self.a], axis=1)
            self.q_eval_wrt_a = tf.gather_nd(params=self.q_eval, indices=a_indices)    # shape=(None, )

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval_wrt_a, name='TD_error'))

        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

    def store_transition(self, s, a, r, s_):
        """Write one ``[s, a, r, s_]`` row into the replay memory.

        The memory is a ring buffer: once full, the oldest row is overwritten.

        Args:
            s: Observation before the action.
            a: Flat action index.
            r: Immediate reward.
            s_: Observation after the action.
        """
        transition = np.hstack((s, [a, r], s_))
        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition
        self.memory_counter += 1

    def choose_action(self, observation):
        """Pick an action for ``observation``.

        With probability ``epsilon`` the action with the highest predicted
        Q-value is taken; otherwise each component is drawn uniformly.

        Args:
            observation: 1-D array holding one observation.

        Returns:
            ``np.array([k, m, c])`` with the chosen component indices.
        """
        # to have batch dimension when feed into tf placeholder
        observation = observation[np.newaxis, :]

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get q value for every actions
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action_number = int(np.argmax(actions_value))
            # Invert the flat encoding c + m * C + k * (M * C).
            k, remainder = divmod(action_number, C * M)
            m, c = divmod(remainder, C)
            action = np.array([k, m, c])
        else:
            action = np.array([np.random.randint(0, K), np.random.randint(0, M), np.random.randint(0, C)])
        return action

    def learn(self):
        """Run one training step on a mini-batch sampled from the replay memory.

        Raises:
            RuntimeError: If no transition has been stored yet.
        """
        if self.memory_counter == 0:
            raise RuntimeError('learn() called before any transition was stored')

        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.target_replace_op)
            print('\ntarget_params_replaced\n')

        # sample batch memory (with replacement) from the rows written so far
        filled = min(self.memory_counter, self.memory_size)
        sample_index = np.random.choice(filled, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        _, cost = self.sess.run(
            [self._train_op, self.loss],
            feed_dict={
                self.s: batch_memory[:, :self.n_features],
                self.a: batch_memory[:, self.n_features],
                self.r: batch_memory[:, self.n_features + 1],
                self.s_: batch_memory[:, -self.n_features:],
            })

        self.cost_his.append(cost)

        self._anneal_epsilon()
        self.learn_step_counter += 1

    def _anneal_epsilon(self):
        """Raise ``epsilon`` by one increment, never beyond ``epsilon_max``."""
        if self.epsilon_increment is None:
            # No annealing configured: epsilon stays at its cap.
            self.epsilon = self.epsilon_max
        else:
            self.epsilon = min(self.epsilon + self.epsilon_increment, self.epsilon_max)

    def plot_cost(self):
        """Plot the loss recorded at every learn step."""
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()

In [None]:
def plot_reward():
    """Plot every reward recorded in the global ``reward_his`` against its step index."""
    steps = np.arange(len(reward_his))
    plt.plot(steps, reward_his)
    plt.xlabel('training steps')
    plt.ylabel('Utility')
    plt.show()

# run()

In [None]:
def run(state,action):
    """Train the global agent ``RL`` for 30 episodes of 2000 environment steps each.

    Every step chooses an action, applies it with ``move()``, records the
    reward in the global ``reward_his`` and stores the transition. Learning
    starts after the first 200 steps and then runs on every fifth step.

    Args:
        state: Observation array handed to the agent. NOTE(review): ``move()``
            mutates the module-level ``state`` in place, so this is presumably
            that same array object -- confirm against the caller.
        action: Only forwarded to ``reset()``; overwritten on the first step.
    """
    steps_per_episode = 2000
    # Counts steps across all episodes; it drives the learning schedule only.
    step = 0
    for _episode in tqdm(range(30)):
        # NOTE(review): reset() is not defined in the cells visible here.
        reset(state, action)
        # The episode length is bounded by its own counter. Testing the global
        # ``step`` for equality with 2000 ends only the first episode and then
        # loops forever, because ``step`` keeps growing.
        for _tick in range(steps_per_episode):
            action = RL.choose_action(state)
            origin, reward = move(action)
            reward_his.append(reward)
            # Flatten (k, m, c) into the single index the network is trained on.
            action_number = action[2] + action[1] * C + action[0] * (M *C)
            RL.store_transition(origin, action_number, reward, state)
            if (step > 200) and (step % 5 == 0):
                RL.learn()
            step += 1
    print('game over')

In [None]:
# NOTE(review): `action` is not assigned in any cell shown before this call --
# confirm it is defined. run() only forwards it to reset() and then overwrites it.
run(state,action)

In [None]:
# Both helpers draw their figure and return None.
plt0 = RL.plot_cost()
# plot_reward is a module-level function, not a DeepQNetwork method, so it is
# called directly rather than through RL.
plt1 = plot_reward()