In [1]:
import numpy as np
import ray
from ray import tune
import gym
from gym import spaces
from or_gym.utils.env_config import *

We want to enforce constraints on the model so that certain actions are unavailable in certain states. Here, I'll implement a simple knapsack environment and use an action mask to prevent the algorithm from selecting items that would push the knapsack over its weight capacity. There will be three actions available to the algorithm.

- 0: end episode
- 1: accept item
- 2: reject item

If 0 is selected, the episode ends and the agent collects no additional reward. If 1 is selected, the agent packs that item and collects the reward. If 2 is selected, the agent rejects the item and moves to the next. 

If the parametric action selection works properly, the agent should never exceed the capacity of the knapsack, and therefore should never receive the large negative reward (-100) for overpacking.

In [37]:
class ParametricKnapsack(gym.Env):
    """Sequential knapsack environment with an optional action mask.

    At each step the agent is shown one item and picks one of three actions:

    - 0: end the episode (reward 0)
    - 1: accept the item (reward = item value; if the item would push the
      knapsack over ``weight_capacity`` the reward is -100 and the episode
      ends)
    - 2: reject the item (reward 0)

    The episode also ends once ``step_limit`` items have been shown or the
    knapsack is filled exactly to capacity.

    Observations are dicts. ``"knapsack"`` holds
    ``[current_weight, item_value, item_weight]``. When ``mask`` is true the
    dict additionally carries ``"action_mask"`` (a 0 in position 1 marks the
    accept action as infeasible for the current item) and
    ``"avail_actions"`` (always all ones).
    """

    def __init__(self, *args, **kwargs):
        """Set defaults, apply configuration overrides and build the spaces.

        Positional arguments are ignored. Keyword arguments are handed to
        ``assign_env_config`` (imported from ``or_gym.utils.env_config``).
        """
        self.step_limit = 10
        # Placeholder draws; reset() redraws the items for every episode.
        self.item_values = np.random.randint(0, 10, self.step_limit)
        self.item_weights = np.random.randint(1, 5, self.step_limit)
        self.weight_capacity = 20
        self.action_space = spaces.Discrete(3)
        self.mask = True
        # NOTE(review): presumably copies overrides from kwargs / env_config
        # onto self; confirm against or_gym.utils.env_config.
        assign_env_config(self, kwargs)
        if self.mask:
            self.observation_space = spaces.Dict({
                "action_mask": spaces.Box(0, 1, shape=(3,)),
                "avail_actions": spaces.Box(0, 1, shape=(3,)),
                "knapsack": spaces.Box(0, self.weight_capacity, shape=(3,))
            })
        else:
            self.observation_space = spaces.Dict({
                "knapsack": spaces.Box(0, self.weight_capacity, shape=(3,))
            })

        self.reset()

    def reset(self):
        """Start a new episode with freshly drawn items.

        Returns:
            dict: the initial observation.
        """
        self.current_weight, self.current_step = 0, 0
        self.item_values = np.random.randint(0, 10, self.step_limit)
        self.item_weights = np.random.randint(1, 5, self.step_limit)
        # Build the observation through update_state() so the initial mask is
        # derived from the capacity instead of being assumed all-feasible.
        self.update_state()
        return self.state

    def step(self, action):
        """Apply ``action`` to the item currently on offer.

        Args:
            action: 0 (end episode), 1 (accept item) or 2 (reject item).

        Returns:
            tuple: ``(observation, reward, done, info)`` with an empty info
            dict.

        Raises:
            ValueError: if ``action`` is not 0, 1 or 2.
        """
        self.current_weight = self.state["knapsack"][0]
        item_value = self.state["knapsack"][1]
        item_weight = self.state["knapsack"][2]
        done = False
        if action == 0:
            # End episode
            done = True
            reward = 0
        elif action == 1:
            # Accept item
            if self.current_weight + item_weight <= self.weight_capacity:
                self.current_weight += item_weight
                reward = item_value
                # End if capacity is met
                if self.current_weight == self.weight_capacity:
                    done = True
            else:  # Overweight
                reward = -100
                done = True
        elif action == 2:
            # Reject item
            reward = 0
        else:
            # Fail loudly before touching the step counter; otherwise
            # `reward` would be unbound when the result tuple is built.
            raise ValueError("Invalid action: {}".format(action))

        self.current_step += 1
        if self.current_step >= self.step_limit:
            done = True
        self.update_state()
        return self.state, reward, done, {}

    def update_state(self):
        """Rebuild ``self.state`` for the current step.

        The accept action is masked out when the item on offer would take
        the knapsack over ``weight_capacity``.
        """
        # After the final step there is no next item; keep showing the last
        # one so the observation retains its shape.
        step = min(self.current_step, self.step_limit - 1)
        knapsack = np.array([self.current_weight,
                             self.item_values[step],
                             self.item_weights[step]])
        if not self.mask:
            self.state = {"knapsack": knapsack}
            return

        # Float mask in every branch so the dtype does not change mid-episode.
        action_mask = np.ones(3)
        if self.current_weight + knapsack[-1] > self.weight_capacity:
            action_mask[1] = 0
        self.state = {
            "action_mask": action_mask,
            "avail_actions": np.ones(3),
            "knapsack": knapsack
        }

In [38]:
# Sanity-check the environment with a uniformly random policy over 100
# episodes. Actions are sampled from the raw action space, so the action
# mask in the observation is not consulted here.
env = ParametricKnapsack(env_config={"mask": True})
rewards, steps = [], []

for i in range(100):
    env.reset()
    reward, done = 0, False
    while not done:
        action = env.action_space.sample()
        s, r, done, _ = env.step(action)
        reward += r
    # The inner loop exits only once the episode has finished, so the
    # per-episode totals are recorded exactly once here.
    rewards.append(reward)
    steps.append(env.current_step)

In [81]:
s = env.reset()
s

{'action_mask': array([1., 1., 1.]),
 'avail_actions': array([1., 1., 1.]),
 'knapsack': array([0, 5, 4])}

In [90]:
s, r, done, _ = env.step(1)
s, r, done

({'action_mask': array([1, 0, 1]),
  'avail_actions': array([1., 1., 1.]),
  'knapsack': array([20,  8,  2])},
 2,
 True)

In [107]:
env.observation_space.sample()

OrderedDict([('knapsack',
              array([ 6.8426876, 11.413433 , 18.437813 ], dtype=float32))])

In [108]:
def create_env(config_env):
    """Build a ParametricKnapsack from the env config supplied by Tune.

    Args:
        config_env: mapping of attribute overrides (e.g. ``{"mask": True}``).
            It is forwarded through the same ``env_config`` keyword used when
            the class is instantiated directly in this notebook, so settings
            given under ``"env_config"`` in the Tune config reach the
            environment instead of being dropped.

    Returns:
        ParametricKnapsack: a new environment instance.
    """
    return ParametricKnapsack(env_config=config_env or {})

# Register the creator itself rather than a lambda that looks the name up
# later, so redefining `create_env` in another cell cannot change what this
# registration builds.
tune.register_env("ParaKnapsack-v0", create_env)

In [2]:
ray.init(ignore_reinit_error=True)

2020-04-21 17:17:34,878	INFO resource_spec.py:216 -- Starting Ray with 2.98 GiB memory available for workers and up to 1.5 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '192.168.0.11',
 'redis_address': '192.168.0.11:47587',
 'object_store_address': '/tmp/ray/session_2020-04-21_17-17-34_872332_30574/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-04-21_17-17-34_872332_30574/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2020-04-21_17-17-34_872332_30574'}

In [109]:
# Train PPO on the registered knapsack env, stopping after 10 training
# iterations. Only the env name is configured: no env_config is passed and no
# custom model is set, so nothing in this call makes the policy consume the
# "action_mask" entry of the observation.
results = tune.run(
        "PPO",
        stop={"training_iteration": 10},
        config={
            "env": "ParaKnapsack-v0"
        })

2020-04-21 15:51:23,806	INFO ray_trial_executor.py:121 -- Trial PPO_ParaKnapsack-v0_dc90e01c: Setting up new remote runner.


Trial name,status,loc
PPO_ParaKnapsack-v0_dc90e01c,RUNNING,


[2m[36m(pid=27221)[0m 2020-04-21 15:51:25,909	INFO trainer.py:371 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=27221)[0m 2020-04-21 15:51:25,912	INFO trainer.py:512 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
Result for PPO_ParaKnapsack-v0_dc90e01c:
  custom_metrics: {}
  date: 2020-04-21_15-51-34
  done: false
  episode_len_mean: 2.9469026548672566
  episode_reward_max: 45.0
  episode_reward_mean: 4.153392330383481
  episode_reward_min: -80.0
  episodes_this_iter: 1356
  episodes_total: 1356
  experiment_id: b1c32acb32ee4cbb9559a4c4cbe3d7d5
  experiment_tag: '0'
  hostname: ubuntu
  info:
    grad_time_ms: 1871.544
    learner:
      default_policy:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 1.0702439546585083
        entropy_coeff: 0.0
        kl: 0.030406463891267776
        policy_loss: -0.08876977860927

Trial name,status,loc,iter,total time (s),timesteps,reward
PPO_ParaKnapsack-v0_dc90e01c,RUNNING,192.168.0.11:27221,1,5.81321,4000,4.15339


Result for PPO_ParaKnapsack-v0_dc90e01c:
  custom_metrics: {}
  date: 2020-04-21_15-51-40
  done: false
  episode_len_mean: 5.056746532156368
  episode_reward_max: 48.0
  episode_reward_mean: 11.94577553593947
  episode_reward_min: -83.0
  episodes_this_iter: 793
  episodes_total: 3173
  experiment_id: b1c32acb32ee4cbb9559a4c4cbe3d7d5
  experiment_tag: '0'
  hostname: ubuntu
  info:
    grad_time_ms: 1698.42
    learner:
      default_policy:
        cur_kl_coeff: 0.44999998807907104
        cur_lr: 4.999999873689376e-05
        entropy: 0.8514610528945923
        entropy_coeff: 0.0
        kl: 0.03969983384013176
        policy_loss: -0.08992589265108109
        total_loss: 99.16346740722656
        vf_explained_var: 0.10799845308065414
        vf_loss: 99.23554229736328
    load_time_ms: 15.45
    num_steps_sampled: 12000
    num_steps_trained: 11904
    sample_time_ms: 2227.719
    update_time_ms: 147.567
  iterations_since_restore: 3
  node_ip: 192.168.0.11
  num_healthy_workers: 2

Trial name,status,loc,iter,total time (s),timesteps,reward
PPO_ParaKnapsack-v0_dc90e01c,RUNNING,192.168.0.11:27221,3,12.3345,12000,11.9458


Result for PPO_ParaKnapsack-v0_dc90e01c:
  custom_metrics: {}
  date: 2020-04-21_15-51-47
  done: false
  episode_len_mean: 6.947916666666667
  episode_reward_max: 58.0
  episode_reward_mean: 21.432291666666668
  episode_reward_min: -76.0
  episodes_this_iter: 576
  episodes_total: 4426
  experiment_id: b1c32acb32ee4cbb9559a4c4cbe3d7d5
  experiment_tag: '0'
  hostname: ubuntu
  info:
    grad_time_ms: 1665.247
    learner:
      default_policy:
        cur_kl_coeff: 0.675000011920929
        cur_lr: 4.999999873689376e-05
        entropy: 0.6899988055229187
        entropy_coeff: 0.0
        kl: 0.012222486548125744
        policy_loss: -0.041779011487960815
        total_loss: 155.94522094726562
        vf_explained_var: 0.23325222730636597
        vf_loss: 155.97872924804688
    load_time_ms: 9.723
    num_steps_sampled: 20000
    num_steps_trained: 19840
    sample_time_ms: 1965.186
    update_time_ms: 89.762
  iterations_since_restore: 5
  node_ip: 192.168.0.11
  num_healthy_workers

Trial name,status,loc,iter,total time (s),timesteps,reward
PPO_ParaKnapsack-v0_dc90e01c,RUNNING,192.168.0.11:27221,5,18.7334,20000,21.4323


Result for PPO_ParaKnapsack-v0_dc90e01c:
  custom_metrics: {}
  date: 2020-04-21_15-51-53
  done: false
  episode_len_mean: 8.123481781376519
  episode_reward_max: 64.0
  episode_reward_mean: 29.251012145748987
  episode_reward_min: -61.0
  episodes_this_iter: 494
  episodes_total: 5443
  experiment_id: b1c32acb32ee4cbb9559a4c4cbe3d7d5
  experiment_tag: '0'
  hostname: ubuntu
  info:
    grad_time_ms: 1652.311
    learner:
      default_policy:
        cur_kl_coeff: 0.675000011920929
        cur_lr: 4.999999873689376e-05
        entropy: 0.5546323657035828
        entropy_coeff: 0.0
        kl: 0.006457101088017225
        policy_loss: -0.024905499070882797
        total_loss: 127.29461669921875
        vf_explained_var: 0.40242043137550354
        vf_loss: 127.31517028808594
    load_time_ms: 7.276
    num_steps_sampled: 28000
    num_steps_trained: 27776
    sample_time_ms: 1848.378
    update_time_ms: 64.859
  iterations_since_restore: 7
  node_ip: 192.168.0.11
  num_healthy_workers

Trial name,status,loc,iter,total time (s),timesteps,reward
PPO_ParaKnapsack-v0_dc90e01c,RUNNING,192.168.0.11:27221,7,25.112,28000,29.251


Result for PPO_ParaKnapsack-v0_dc90e01c:
  custom_metrics: {}
  date: 2020-04-21_15-51-59
  done: false
  episode_len_mean: 8.908482142857142
  episode_reward_max: 63.0
  episode_reward_mean: 31.316964285714285
  episode_reward_min: -68.0
  episodes_this_iter: 448
  episodes_total: 6364
  experiment_id: b1c32acb32ee4cbb9559a4c4cbe3d7d5
  experiment_tag: '0'
  hostname: ubuntu
  info:
    grad_time_ms: 1644.757
    learner:
      default_policy:
        cur_kl_coeff: 0.675000011920929
        cur_lr: 4.999999873689376e-05
        entropy: 0.447750985622406
        entropy_coeff: 0.0
        kl: 0.00445684464648366
        policy_loss: -0.015289338305592537
        total_loss: 172.96734619140625
        vf_explained_var: 0.37308651208877563
        vf_loss: 172.9796142578125
    load_time_ms: 5.901
    num_steps_sampled: 36000
    num_steps_trained: 35712
    sample_time_ms: 1776.504
    update_time_ms: 51.129
  iterations_since_restore: 9
  node_ip: 192.168.0.11
  num_healthy_workers: 2

Trial name,status,loc,iter,total time (s),timesteps,reward
PPO_ParaKnapsack-v0_dc90e01c,RUNNING,192.168.0.11:27221,9,31.4213,36000,31.317


Result for PPO_ParaKnapsack-v0_dc90e01c:
  custom_metrics: {}
  date: 2020-04-21_15-52-02
  done: true
  episode_len_mean: 9.246543778801843
  episode_reward_max: 64.0
  episode_reward_mean: 34.085253456221196
  episode_reward_min: -70.0
  episodes_this_iter: 434
  episodes_total: 6798
  experiment_id: b1c32acb32ee4cbb9559a4c4cbe3d7d5
  experiment_tag: '0'
  hostname: ubuntu
  info:
    grad_time_ms: 1642.482
    learner:
      default_policy:
        cur_kl_coeff: 0.3375000059604645
        cur_lr: 4.999999873689376e-05
        entropy: 0.38688457012176514
        entropy_coeff: 0.0
        kl: 0.008526529185473919
        policy_loss: -0.019615719094872475
        total_loss: 90.95699310302734
        vf_explained_var: 0.5279351472854614
        vf_loss: 90.97373962402344
    load_time_ms: 5.421
    num_steps_sampled: 40000
    num_steps_trained: 39680
    sample_time_ms: 1750.981
    update_time_ms: 46.346
  iterations_since_restore: 10
  node_ip: 192.168.0.11
  num_healthy_workers:

Trial name,status,loc,iter,total time (s),timesteps,reward
PPO_ParaKnapsack-v0_dc90e01c,TERMINATED,,10,34.5759,40000,34.0853


2020-04-21 15:52:02,933	INFO tune.py:334 -- Returning an analysis object by default. You can call `analysis.trials` to retrieve a list of trials. This message will be removed in future versions of Tune.


In [110]:
df = results.dataframe()
df.head()

Unnamed: 0,episode_reward_max,episode_reward_min,episode_reward_mean,episode_len_mean,episodes_this_iter,timesteps_this_iter,done,timesteps_total,episodes_total,training_iteration,...,info/learner/default_policy/cur_lr,info/learner/default_policy/total_loss,info/learner/default_policy/policy_loss,info/learner/default_policy/vf_loss,info/learner/default_policy/vf_explained_var,info/learner/default_policy/kl,info/learner/default_policy/entropy,info/learner/default_policy/entropy_coeff,config/env,logdir
0,64.0,-70.0,34.085253,9.246544,434,4000,True,40000,6798,10,...,5e-05,90.95699,-0.019616,90.97374,0.527935,0.008527,0.386885,0.0,ParaKnapsack-v0,/home/christian/ray_results/PPO/PPO_ParaKnapsa...


In [2]:
import or_gym
from or_gym.algos.rl_utils import *

In [3]:
env = or_gym.make('VMPacking-v0')

In [4]:
register_env('VMPacking-v0')

In [5]:
# Train PPO on the or_gym 'VMPacking-v0' env with masking requested through
# env_config, stopping after 10 training iterations.
# NOTE(review): the output recorded for this cell ends in a TuneError caused
# by an AssertionError raised inside PPO.train(); the traceback is truncated,
# so the failing assertion is not visible here.
results = tune.run(
    "PPO",
    stop={"training_iteration": 10},
    config={
        "env": "VMPacking-v0",
        "env_config": {"mask": True}
    },
    reuse_actors=True)

2020-04-21 17:18:35,677	INFO resource_spec.py:216 -- Starting Ray with 2.88 GiB memory available for workers and up to 1.46 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-04-21 17:18:36,236	INFO ray_trial_executor.py:121 -- Trial PPO_VMPacking-v0_0b54171e: Setting up new remote runner.


Trial name,status,loc
PPO_VMPacking-v0_0b54171e,RUNNING,


[2m[36m(pid=30881)[0m 2020-04-21 17:18:39,333	INFO trainer.py:371 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=30881)[0m 2020-04-21 17:18:39,335	INFO trainer.py:512 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.


2020-04-21 17:18:43,226	ERROR trial_runner.py:482 -- Error processing event.
Traceback (most recent call last):
  File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/tune/trial_runner.py", line 426, in _process_trial
    result = self.trial_executor.fetch_result(trial)
  File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/tune/ray_trial_executor.py", line 378, in fetch_result
    result = ray.get(trial_future[0], DEFAULT_GET_TIMEOUT)
  File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/worker.py", line 1457, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AssertionError): [36mray::PPO.train()[39m (pid=30881, ip=192.168.0.11)
  File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 619, in ray._raylet.execute_task.function_executor
  File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/agents/trainer.py", line 444, in train
    raise e
  File "/home/christ

Trial name,status,loc
PPO_VMPacking-v0_0b54171e,ERROR,

Trial name,# failures,error file
PPO_VMPacking-v0_0b54171e,1,/home/christian/ray_results/PPO/PPO_VMPacking-v0_0b54171e_2020-04-21_17-18-36glxpfvz9/error.txt


Trial name,status,loc
PPO_VMPacking-v0_0b54171e,ERROR,

Trial name,# failures,error file
PPO_VMPacking-v0_0b54171e,1,/home/christian/ray_results/PPO/PPO_VMPacking-v0_0b54171e_2020-04-21_17-18-36glxpfvz9/error.txt


TuneError: ('Trials did not complete', [PPO_VMPacking-v0_0b54171e])

In [97]:
df = results.dataframe()
df.head()

Unnamed: 0,episode_reward_max,episode_reward_min,episode_reward_mean,episode_len_mean,episodes_this_iter,timesteps_this_iter,done,timesteps_total,episodes_total,training_iteration,...,info/learner/default_policy/total_loss,info/learner/default_policy/policy_loss,info/learner/default_policy/vf_loss,info/learner/default_policy/vf_explained_var,info/learner/default_policy/kl,info/learner/default_policy/entropy,info/learner/default_policy/entropy_coeff,config/env,config/env_config,logdir
0,-10003.661022,-12127.070317,-10395.543755,22.587571,177,4000,True,40000,1671,10,...,75354720.0,-0.077681,75354720.0,-6.537284e-08,0.016969,3.742035,0.0,VMPacking-v0,{'mask': True},/home/christian/ray_results/PPO/PPO_VMPacking-...


In [96]:
class VMPackingEnv(gym.Env):
    """Virtual-machine packing environment with an optional action mask.

    Each step presents one demand ``[0, cpu, mem]`` that must be assigned to
    one of ``n_pms`` physical machines (PMs); the action is the PM index.

    The ``"data_center"`` observation is an ``(n_pms + 1, 3)`` array: one row
    ``[open_flag, cpu_load, mem_load]`` per PM followed by the current demand
    row. When ``mask`` is true the observation also carries
    ``"action_mask"`` (1 where the current demand fits on that PM) and
    ``"avail_actions"`` (all ones).

    Placing a demand on a PM where it does not fit yields a reward of -10000
    and ends the episode. Otherwise the reward is the sum, over open PMs, of
    ``cpu_load + mem_load - 2``. The episode also ends after ``step_limit``
    demands.
    """

    def __init__(self, *args, **kwargs):
        """Set defaults, apply configuration overrides and build the spaces.

        Positional arguments are ignored. Keyword arguments are handed to
        ``assign_env_config`` (imported from ``or_gym.utils.env_config``).
        """
        # NOTE(review): "cup_capacity" looks like a typo for "cpu_capacity".
        # Neither capacity attribute is read in this class (the fit checks
        # compare against 1 directly); the name is kept in case configs use it.
        self.cup_capacity = 1
        self.mem_capacity = 1
        self.t_interval = 20
        self.tol = 1e-5
        # A full day at t_interval-minute resolution would be
        # int(60 * 24 / self.t_interval) steps; a short horizon is used here.
        self.step_limit = 15
        self.n_pms = 50
        # Columns of a PM row that hold the CPU and memory loads.
        self.load_idx = np.array([1, 2])
        # NOTE(review): never used to seed np.random in this class, and as an
        # instance attribute it hides any seed() method inherited from the
        # base class; confirm before relying on either.
        self.seed = 0
        self.mask = True
        # NOTE(review): presumably copies overrides from kwargs / env_config
        # onto self; confirm against or_gym.utils.env_config.
        assign_env_config(self, kwargs)
        self.action_space = spaces.Discrete(self.n_pms)
        if self.mask:
            self.observation_space = spaces.Dict({
                "action_mask": spaces.Box(0, 1, shape=(self.n_pms,)),
                "avail_actions": spaces.Box(0, 1, shape=(self.n_pms,)),
                "data_center": spaces.Box(0, 1, shape=(self.n_pms+1, 3))
            })
        else:
            self.observation_space = spaces.Dict({
                "data_center": spaces.Box(0, 1, shape=(self.n_pms+1, 3))
            })
        self.reset()

    def reset(self):
        """Start a new episode with freshly generated demand.

        Returns:
            dict: the initial observation (all PMs closed and empty).
        """
        self.demand = self.generate_demand()
        self.current_step = 0
        data_center = np.vstack([
            np.zeros((self.n_pms, 3)),
            self.demand[self.current_step]])
        if self.mask:
            self.state = {
                "action_mask": np.ones(self.n_pms),
                "avail_actions": np.ones(self.n_pms),
                "data_center": data_center
            }
        else:
            self.state = {"data_center": data_center}
        self.assignment = {}
        return self.state

    def step(self, action):
        """Assign the current demand to PM ``action``.

        Args:
            action: index of the target PM, ``0 <= action < n_pms``.

        Returns:
            tuple: ``(observation, reward, done, info)`` with an empty info
            dict.

        Raises:
            ValueError: if ``action`` is outside ``[0, n_pms)``.
        """
        done = False
        # Slices are views into the stored observation, so the in-place load
        # update below also changes self.state until update_state() rebuilds it.
        pm_state = self.state["data_center"][:-1]
        demand = self.state["data_center"][-1, 1:]

        if action < 0 or action >= self.n_pms:
            raise ValueError("Invalid action: {}".format(action))

        elif any(pm_state[action, 1:] + demand > 1 + self.tol):
            # Demand doesn't fit into PM
            reward = -10000
            done = True
        else:
            if pm_state[action, 0] == 0:
                # Open PM if closed
                pm_state[action, 0] = 1
            pm_state[action, self.load_idx] += demand
            # Each open PM contributes (cpu + mem - 2), i.e. minus its unused
            # capacity; closed PMs contribute nothing.
            reward = np.sum(pm_state[:, 0] * (pm_state[:,1:].sum(axis=1) - 2))
            self.assignment[self.current_step] = action

        self.current_step += 1
        if self.current_step >= self.step_limit:
            done = True
        self.update_state(pm_state)
        return self.state, reward, done, {}

    def update_state(self, pm_state):
        """Rebuild the observation from ``pm_state`` and the next demand.

        Args:
            pm_state: ``(n_pms, 3)`` array of PM rows after the last action.
        """
        # After the final step there is no next demand; keep showing the last
        # one so the observation retains its shape.
        step = self.current_step if self.current_step < self.step_limit else self.step_limit-1
        data_center = np.vstack([pm_state, self.demand[step]])
        data_center = np.where(data_center>1,1,data_center) # Fix rounding errors
        self.state["data_center"] = data_center
        if self.mask:
            # Use the same tolerance as step() so the mask never forbids an
            # assignment that step() would accept. Stored as float to match
            # the mask produced by reset().
            fits = (pm_state[:, 1:] + self.demand[step, 1:]) <= 1 + self.tol
            self.state["action_mask"] = fits.all(axis=1).astype(float)

    def generate_demand(self):
        """Draw one episode of demand.

        Returns:
            numpy.ndarray: ``(step_limit, 3)`` array with rows
            ``[0, cpu, mem]``, where cpu and mem are uniform on [0, 1).
        """
        cpu_demand = np.random.uniform(0, 1, size=self.step_limit)
        mem_demand = np.random.uniform(0, 1, size=self.step_limit)
        return np.vstack([np.zeros(self.step_limit), cpu_demand, mem_demand]).T

In [97]:
def create_env(config_env):
    """Build a VMPackingEnv from the env config supplied by Tune.

    NOTE: this reuses the name ``create_env`` from the knapsack section of
    the notebook and replaces that definition in the kernel namespace.

    Args:
        config_env: mapping of attribute overrides (e.g. ``{"mask": True}``),
            forwarded through the ``env_config`` keyword so settings given
            under ``"env_config"`` in the Tune config reach the environment
            instead of being dropped.

    Returns:
        VMPackingEnv: a new environment instance.
    """
    return VMPackingEnv(env_config=config_env or {})

# Register the creator itself rather than a lambda that looks the name up
# later, so this registration always builds a VMPackingEnv.
tune.register_env("VMPacking-v2", create_env)

In [98]:
# Smoke-test PPO on the locally defined VMPackingEnv ("VMPacking-v2") for a
# single training iteration. No env_config or custom model is set, so the
# environment runs with its defaults and nothing in this call makes the
# policy consume the "action_mask" entry of the observation.
results = tune.run(
        "PPO",
        stop={"training_iteration": 1},
        config={
            "env": "VMPacking-v2"
        })

2020-04-22 09:07:07,426	INFO ray_trial_executor.py:121 -- Trial PPO_VMPacking-v2_8d0c9586: Setting up new remote runner.


Trial name,status,loc
PPO_VMPacking-v2_8d0c9586,RUNNING,


[2m[36m(pid=23676)[0m 2020-04-22 09:07:09,427	INFO trainer.py:371 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
[2m[36m(pid=23676)[0m 2020-04-22 09:07:09,429	INFO trainer.py:512 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
Result for PPO_VMPacking-v2_8d0c9586:
  custom_metrics: {}
  date: 2020-04-22_09-07-18
  done: true
  episode_len_mean: 9.825552825552826
  episode_reward_max: -73.1885195576232
  episode_reward_mean: -8601.61104317446
  episode_reward_min: -10121.984126281812
  episodes_this_iter: 407
  episodes_total: 407
  experiment_id: a5ce0042a5184bbfae50ea342a45191a
  experiment_tag: '0'
  hostname: ubuntu
  info:
    grad_time_ms: 2551.644
    learner:
      default_policy:
        cur_kl_coeff: 0.20000000298023224
        cur_lr: 4.999999873689376e-05
        entropy: 3.8979239463806152
        entropy_coeff: 0.0
        kl: 0.014262373559176922
        policy_los

Trial name,status,loc,iter,total time (s),timesteps,reward
PPO_VMPacking-v2_8d0c9586,TERMINATED,,1,6.46594,4000,-8601.61




Trial name,status,loc,iter,total time (s),timesteps,reward
PPO_VMPacking-v2_8d0c9586,TERMINATED,,1,6.46594,4000,-8601.61


2020-04-22 09:07:18,486	INFO tune.py:334 -- Returning an analysis object by default. You can call `analysis.trials` to retrieve a list of trials. This message will be removed in future versions of Tune.


In [99]:
df = results.dataframe()
df.head()

Unnamed: 0,episode_reward_max,episode_reward_min,episode_reward_mean,episode_len_mean,episodes_this_iter,timesteps_this_iter,done,timesteps_total,episodes_total,training_iteration,...,info/learner/default_policy/cur_lr,info/learner/default_policy/total_loss,info/learner/default_policy/policy_loss,info/learner/default_policy/vf_loss,info/learner/default_policy/vf_explained_var,info/learner/default_policy/kl,info/learner/default_policy/entropy,info/learner/default_policy/entropy_coeff,config/env,logdir
0,-73.18852,-10121.984126,-8601.611043,9.825553,407,4000,True,4000,407,1,...,5e-05,70013250.0,-0.054377,70013250.0,-3.268642e-08,0.014262,3.897924,0.0,VMPacking-v2,/home/christian/ray_results/PPO/PPO_VMPacking-...


In [5]:
from or_gym.algos.vm_packing.heuristics import *

In [89]:
env = VMPackingEnv()

In [91]:
env.observation_space.sample()['avail_actions']

32

In [8]:
def first_fit_heuristic(env):
    """Run one episode of ``env`` driven by ``first_fit_step``.

    Written for the VMPacking-v0/v1 environments; the check on
    ``env.spec.id`` is disabled so the locally defined environment can be
    used as well.

    Args:
        env: environment with gym-style ``reset()`` and 4-tuple ``step()``.

    Returns:
        tuple: ``(actions, rewards)``, the per-step actions and rewards.
    """
    actions, rewards = [], []
    state, done = env.reset(), False
    while not done:
        action = first_fit_step(state)
        state, reward, done, _ = env.step(action)
        actions.append(action)
        rewards.append(reward)

    return actions, rewards

def first_fit_step(state):
    """Choose a bin for the current item using the first-fit rule.

    Args:
        state: observation dict whose ``'data_center'`` entry is an array
            with one ``[open_flag, load_1, load_2]`` row per bin followed by
            the item row ``[_, demand_1, demand_2]``.

    Returns:
        int: index of the lowest-numbered open bin the item fits into. If no
        bin is open yet, 0. If no open bin can hold the item, the index just
        after the highest open bin.
    """
    s_bins = state['data_center'][:-1]
    s_item = state['data_center'][-1, 1:]
    open_bins = np.where(s_bins[:, 0] == 1)[0]
    if len(open_bins) < 1:
        # Open first bin for item
        return 0

    # np.where returns indices in ascending order, so returning on the first
    # hit gives first fit; scanning on would pick the last fitting bin.
    for b in open_bins:
        if all(s_bins[b, [1, 2]] + s_item <= 1):
            return int(b)

    # Nothing fits: move on to the bin after the highest open one.
    # NOTE: if every bin is already open this index is one past the last bin.
    return int(np.max(open_bins) + 1)

In [16]:
# Run the first-fit heuristic for up to 10000 episodes and stop at the first
# episode whose total reward is positive. Nothing is printed if none is found.
for i in range(10000):
    env.reset()
    actions, rewards = first_fit_heuristic(env)
    tot_rewards = sum(rewards)
    if not tot_rewards > 0:
        continue
    print('Positive rewards found')
    break

In [72]:
# Roll out a uniformly random policy for up to N episodes and stop as soon as
# any single step returns a positive reward, leaving `env`, `state`, `action`
# and `r` in the namespace for inspection.
env = VMPackingEnv()
N = 1000
count = 0
while count < N:
    env.reset()
    p_found, done = False, False
    rewards = []
    while not done:
        action = env.action_space.sample()
        state, r, done, _ = env.step(action)
        rewards.append(r)
        if r > 0:
            print('Positive rewards found')
            p_found = True
            break
    count += 1
    # Propagate the inner break so the search stops at the offending episode.
    if p_found:
        break

In [57]:
done

False

In [47]:
r

1.245679701610709

In [51]:
pm_state = state['data_center'][:-1]

In [52]:
np.sum(pm_state[:, 0] * (pm_state[:,1:].sum(axis=1) - 2))

0.0

In [49]:
env.assignment

{0: 6, 1: 6}

In [50]:
env.state

{'action_mask': array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1]),
 'avail_actions': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
 'data_center': array([[0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [1.        , 1.        , 1.        ],
        [0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [0.        , 0.        , 0.        ],
        [0.        , 0.98100702, 0.64886836]])}

In [53]:
env.demand[1]

array([0.        , 0.8077121 , 0.78294696])

In [54]:
env.demand[0]

array([0.        , 0.76834515, 0.8866755 ])

In [55]:
demand = state["data_center"][-1, 1:]

In [56]:
demand

array([0.98100702, 0.64886836])

In [58]:
action

6

In [69]:
# Scratch re-run of the branch logic from VMPackingEnv.step against the
# current env.state, to see which branch the last sampled action takes.
# Relies on `env` and `action` already existing in the kernel namespace.
pm_state = env.state["data_center"][:-1]
demand = env.state["data_center"][-1, 1:]

if action < 0 or action >= env.n_pms:
    raise ValueError("Invalid action: {}".format(action))

# Compares against exactly 1, i.e. without the env's `tol` allowance.
elif any(pm_state[action, 1:] + demand > 1):
    # Demand doesn't fit into PM
    reward = -10000
    done = True
else:
    # Fit branch: only reports; `reward` and `done` keep their earlier values.
    print('Pack')

In [70]:
reward

-10000

In [68]:
any(x)

True