# Training RL Agent for given Number of Users for a single Server

In [1]:
import random
import pandas as pd
import numpy as np
np.set_printoptions(suppress=True)
import time

## Load Data for training
The dataset was generated using codes available in dataset_generator folder

In [2]:
datafile = 'dataset/dual_s_data.csv'
users_low = 100
users_res = 100
users_high = 500
#number_of_service = 2

#network latency
N_lat = 0.25

latency_threshold = 10 - N_lat #subtract latency due to network from total latency

1. states: RAM(MB), #Cores, BG WL(%), GPU(MB), S1:Users, S2:Users

2. More reward for the number closer to (s1,s2) with latency below given threshold

## RL Environment:

In [3]:
import numpy as np
import gym
from gym import spaces
from gym.utils import seeding


class yolosystem(gym.Env):
    metadata = {'render.modes': ['human']}
    
    def __init__(self, n_actions, filename):
        
        super(yolosystem, self).__init__()
        
        self.n_actions = n_actions #total number of action space after ranging [10, 20, 30 ...]
        self.action_space = spaces.Discrete(self.n_actions) #total number of users in the action space; starts with zero
        self.observation_space = spaces.Box(low=np.array([0,0,0,0,0,0]), high=np.array([11000]*6), shape=(6, ), dtype=np.int32) #<RAM, Core, Workload>
        self.seed()
        self.current_obs = np.array( [3000, 2, 40, 2, 100, 100] ) #current observation = <ram, cores, workload%>

        #Load dataset
        self.df = pd.read_csv(filename)
        # computer percentage of GPU usage from actual use
        self.df['workload_gpu'] = self.df['workload_gpu'].multiply(1/80).round(0).astype(int) #round gpu workload

        #get unique data in set
        self.ram = self.df.ram.unique()
        self.cores = self.df.cores.unique()
        self.workload_cpu = self.df.workload_cpu.unique()
        print(self.df) #print dataset
       
        

    def seed(self, seed=1010):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        assert self.action_space.contains(action) #action should be in action space
        state = self.current_obs
        done = True #Episodes ends after each action

        #compute latecy from the number of users
        reward = self.get_reward(state, action) #linear latency           
#         print(action, reward)
        self.current_obs = self.get_random_state() #go to a random state
        
#         print(self.current_obs)
        return self.current_obs, reward, done, {} #no-states, reward, episode-done, no-info

    def reset(self):
        self.current_obs = self.get_random_state()
        return self.current_obs #current state of the system with no load

    def render(self, mode='human', close=False):
        print(f"Current State:<{self.current_obs}>")
        
    
    #compute latency
    def get_reward(self, state, action):
        #change action to users
        
        u1 = action//5 + 1
        u2 = (action+1) - (u1-1)*5
        #sample time from dataframe
        gram = state[0]
        gcores = state[1]
        gwl_c = state[2]
        gwl_g = state[3]
        gs1 = u1*100
        gs2 = u2*100
#         print("user:", gs1, gs2, "act:", action)

        fetch_state = self.df.loc[ (self.df['ram'] == gram) & (self.df['cores']== gcores) & (self.df['workload_cpu']==gwl_c) & (self.df['workload_gpu']==gwl_g) & (self.df['users_yolo']==gs1) & (self.df['users_mnet']==gs2)]
                
        if fetch_state.empty:
            return -20 

        time1 = fetch_state.sample().iloc[0]['time_yolo'] #fetch time from the dataframe
        time2 = fetch_state.sample().iloc[0]['time_mnet']
        tm = max(time1, time2)
        #add total latencies due to network based on number of u1 and u2
        
        if (tm <= latency_threshold): 
            return  0.01*(gs1 - state[4]) +  0.01*(gs2 - state[5]) + u1 + u2 

        else:
            return -5 - u1 - u2     
        
    
    #get to some random state after taking an action
    def get_random_state(self):
        #generate state randomly
        gram = np.random.choice(self.ram, 1)[0]
        gcores = np.random.choice(self.cores, 1)[0]
        gwl_c = np.random.choice(self.workload_cpu, 1)[0]
        
        #fetch gamma for the state
        fetch_state = self.df.loc[ (self.df['ram'] == gram) & (self.df['cores']== gcores) & (self.df['workload_cpu']==gwl_c) ]
        gwl_g = fetch_state.sample().iloc[0]['workload_gpu'] #fetch workload randmoly
        
        gs1 = random.randrange(50, 550, 50)
        gs2 = random.randrange(50, 550, 50)
        
        return np.array( [gram, gcores, gwl_c, gwl_g, gs1, gs2] )

## Test RL Environment with Baseline3

In [4]:
from stable_baselines3.common.env_checker import check_env
env = yolosystem(25, datafile )
# If the environment don't follow the interface, an error will be thrown
check_env(env, warn=True)

        ram  cores  workload_cpu  workload_gpu  users_yolo  users_mnet  \
0      3000      2            40             2         100         100   
1      3000      2            40             2         100         200   
2      3000      2            40             2         100         300   
3      3000      2            40             2         100         400   
4      3000      2            40             2         100         500   
...     ...    ...           ...           ...         ...         ...   
5995  11000      5            60            10         500         100   
5996  11000      5            60            10         500         200   
5997  11000      5            60            10         500         300   
5998  11000      5            60            10         500         400   
5999  11000      5            60            10         500         500   

      time_yolo  time_mnet  
0     15.215310  17.846291  
1     15.477644  21.690610  
2     15.443997  27.3285

In [5]:
print(env.observation_space)
print(env.action_space)
print(env.action_space.sample())

Box(0, 11000, (6,), int32)
Discrete(25)
4


In [6]:
for i in range(25):
    t = env.get_reward([3000, 2, 40, 2, 500, 500], i)
    print(t)

-7
-8
-9
-10
-11
-8
-9
-10
-11
-12
-9
-10
-11
-12
-13
-10
-11
-12
-13
-14
-11
-12
-13
-14
-15


In [7]:
n_steps = 10
for step in range(n_steps):
    print("Step {}".format(step + 1))
    obs, reward, done, info = env.step(env.action_space.sample())
    print('reward=', reward)
    env.render()

Step 1
reward= -14
Current State:<[5000.    2.   40.    6.  450.  450.]>
Step 2
reward= -11
Current State:<[9000.    5.   40.   10.   50.  400.]>
Step 3
reward= -11
Current State:<[7000.    2.   40.   10.   50.  500.]>
Step 4
reward= -8
Current State:<[9000.    2.   40.   10.  350.  350.]>
Step 5
reward= -10
Current State:<[7000.    2.   60.    2.  350.  250.]>
Step 6
reward= -12
Current State:<[7000.    3.   40.    3.   50.  400.]>
Step 7
reward= -8
Current State:<[3000.    5.   50.    3.  100.  250.]>
Step 8
reward= -8
Current State:<[7000.    4.   60.    6.   50.  450.]>
Step 9
reward= -7
Current State:<[3000.    3.   40.    2.  500.  250.]>
Step 10
reward= -14
Current State:<[5000.    2.   40.    3.  250.  500.]>


## Tensorboard for Training Status

In [8]:
from stable_baselines3.common.monitor import Monitor
import os
# Create log dir
log_dir = './agent_tensorboard/'
os.makedirs(log_dir, exist_ok=True)

env = Monitor(env, log_dir)

In [9]:
from stable_baselines3 import DQN
from stable_baselines3.dqn import MlpPolicy
from stable_baselines3.common.vec_env import DummyVecEnv

# wrap it
env = DummyVecEnv([lambda: env])

## Train RL Model

In [None]:
model = DQN(MlpPolicy, env, verbose=0, tensorboard_log = log_dir, exploration_fraction=0.4, learning_starts=150000,  train_freq=30, target_update_interval=30000, exploration_final_eps=0.07)

In [None]:
begin = time.time()
model.learn(total_timesteps=500000) #reset_num_timesteps=False
end = time.time()
training_time = end-begin

In [None]:
training_time

In [None]:
# Save the agent
model.save(f"edge_agent_thres10")
# model.save(f"edge_agent_{latency_threshold}_lin")
# del model  # delete trained model to demonstrate loading

In [None]:
# Load the trained agent
# from stable_baselines3 import DQN
# model = DQN.load("edge_agent_20_lin")
#return action and state
#model.predict(np.array([2000, 4, 30]), deterministic=True)