In [1]:
import or_gym
from gym import spaces
import numpy as np
from collections import Iterable
from or_gym.algos.rl_utils import *
from ray.rllib import agents
import time

# Find the NAN values!

Check with random actions to see if Nan or $\pm$ inf arise.

In [2]:
def check_values(x, msg=''):
    if not isinstance(x, Iterable):
        x = np.array([x])
    if type(x) is tuple:
        x = np.array(x).flatten()
    if any(np.isnan(s)):
        raise ValueError('{}\n{}'.format(msg, x))
    elif any(x==np.inf):
        raise ValueError('{}\n{}'.format(msg, x))
    elif any(x==-np.inf):
        raise ValueError('{}\n{}'.format(msg, x))
    else:
        return x

In [3]:
env_name = 'InvManagement-v0'
env = or_gym.make('InvManagement-v0')

In [4]:
N = 100
rewards = []
for i in range(N):
    R = []
    s = env.reset()
    done = False
    while done == False:
        check_values(s, 'Error found in state.')
        action = env.sample_action()
        check_values(action, 'Error found in action.')
#         action = np.zeros(action.shape)
        s, r, done, _ = env.step(action)
        R.append(r)
        check_values(r, 'Error found in reward')
        if done:
            rewards.append(max(R))

In [5]:
max(rewards)

37.189072499999995

# See if Nan's arise via Ray

In [6]:
ray.init(ignore_reinit_error=True)
trainer = agents.a3c.A3CTrainer(env=create_env(env_name),
    config={
        'env_config': {
            'mask': True
        },
    'model': {
        'fcnet_activation': 'elu',
        'fcnet_hiddens': [128, 128]
        }
    })

2020-05-06 17:54:56,051	INFO resource_spec.py:216 -- Starting Ray with 4.64 GiB memory available for workers and up to 2.34 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).
2020-05-06 17:54:56,634	INFO trainer.py:371 -- Tip: set 'eager': true or the --eager flag to enable TensorFlow eager execution
2020-05-06 17:54:56,651	INFO trainer.py:512 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2020-05-06 17:55:15,938	INFO trainable.py:102 -- _setup took 19.287 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [None]:
trainer = agents.ddpg.DDPGTrainer(env=create_env(env_name),
    config={
        'env_config': {
            'mask': True
        },
    'model': {
        'fcnet_activation': 'elu',
        'fcnet_hiddens': [128, 128]
        }
    })

training = True
n_episodes = 10000
batch = 0
rewards, eps, eps_total = [], [], []
t_start = time.time()
while training:
    t_batch = time.time()
    results = trainer.train()
    rewards.append(results['episode_reward_mean'])
    eps.append(results['episodes_this_iter'])
    eps_total.append(results['episodes_total'])
    batch += 1
    t_end = time.time()
    if sum(eps) >= n_episodes:
        training = False
        break
    if batch % 10 == 0:
        t = t_end - t_batch
        t_tot = t_end - t_start
        print("\rEpisode: {}\tMean Rewards: {:.1f}\tEpisodes/sec: {:.2f}\tTotal Time: {:.1f}s".format(
            eps_total[-1], rewards[-1], eps[-1]/t, t_tot), end="")

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Episode: 333	Mean Rewards: -347.0	Episodes/sec: 0.72	Total Time: 387.6s

In [None]:
trainer.compute_action(env.reset())

In [70]:
x = trainer.get_policy()

In [17]:
env2 = trainer.env_creator(config={
        'env_config': {
#             'env': 'InvManagement-v0',
            'env': 'Knapsack-v0',
            'mask': True
        },
    'model': {
        'fcnet_activation': 'elu',
        'fcnet_hiddens': [128, 128]
        }
    })

In [5]:
env.action_space

Box(3,)

In [12]:
env.action_space.contains(np.array([1, 1, 90]))

False

In [11]:
env.supply_capacity

array([100,  90,  80])

In [13]:
spaces.Box(low=np.zeros(3), high=env.supply_capacity, shape=(3,))

AssertionError: box requires scalar bounds. 

In [17]:
spaces.Tuple((
    spaces.Box(0, env.supply_capacity[0], shape=(1,)),
    spaces.Box(0, env.supply_capacity[1], shape=(1,))))

Tuple(Box(1,), Box(1,))

In [7]:
x = spaces.Tuple(tuple([spaces.Box(0, i, shape=(1,)) for i in env.supply_capacity]))

In [19]:
type(np.array(x.sample())) is type(np.array([]))

True

In [8]:
action = x.sample()

In [13]:
if type(action) != type(np.array([])):
    R = np.array(action).flatten().astype(int)

# get inventory at hand and pipeline inventory at beginning of the period
n = env.period
L = env.lead_time
I = env.I[n,:].copy() # inventory at start of period n
T = env.T[n,:].copy() # pipeline inventory at start of period n
m = env.num_stages # number of stages

c = env.supply_capacity # capacity
        
# available inventory at the m+1 stage (note: last stage has unlimited supply)
Im1 = np.append(I[1:], np.Inf) 

# place replenishment order
# R = action.astype(int)
R[R<0] = 0 # force non-negativity
if n>=1: # add backlogged replenishment orders to current request
    R = R + env.B[n-1,1:]
Rcopy = R.copy() # copy original replenishment quantity
R[R>=c] = c[R>=c] # enforce capacity constraint
R[R>=Im1] = Im1[R>=Im1] # enforce available inventory constraint
env.R[n,:] = R # store R[n]

In [11]:
R

array([[ 1],
       [51],
       [ 6]])