# How to Use Deep Reinforcement Learning to Improve your Supply Chain

Full write up available [here](https://www.datahubbs.com/how-to-use-deep-reinforcement-learning-to-improve-your-supply-chain/).

Note: Ray is not a dependency of OR-Gym. We want OR-Gym to stand independently of other RL libraries as much as possible.

Later versions of Ray introduced breaking changes that affect this environment in particular. To avoid conflicts, please run:
- `pip install ray==1.0.0`
- `pip install ray[rllib]`
- `pip install ray[tune]`
- `pip install tensorflow==2.3.0`



In [None]:
import or_gym
from or_gym.utils import create_env

# import ray
# from ray.rllib.agents.ppo import PPOTrainer
# from ray import tune
from stable_baselines3 import PPO
from stable_baselines3.common.monitor import Monitor
from stable_baselines3.common.callbacks import StopTrainingOnMaxEpisodes

import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import gridspec

In [None]:

# Environment and RL configuration settings.
env_name = 'InvManagement-v1'
# env_name = "Knapsack-v0"
env_config = {}  # Change environment parameters here

# Forward env_config so that any overrides set above actually reach the
# environment (an empty dict leaves the environment defaults in place).
# The Monitor wrapper is what later exposes `env.get_episode_rewards()`.
env = Monitor(env=or_gym.make(env_name, env_config=env_config))
# env = gym.make("LunarLander-v2")

In [None]:
# Build the PPO agent and train it until the episode budget is exhausted.

# Training is terminated by this callback after `max_episodes` episodes.
callback_max_episodes = StopTrainingOnMaxEpisodes(
    max_episodes=5000,
    verbose=1,
)
model = PPO("MlpPolicy", env, verbose=1)
# `total_timesteps` is expected to be an int, so use a very large integer
# instead of np.inf; the callback, not the step budget, is meant to end
# the run.
model.learn(int(1e10), callback=callback_max_episodes)

In [None]:
# Pull the reward history recorded by the Monitor wrapper around the
# training environment (presumably one total reward per finished episode —
# confirm against the stable-baselines3 Monitor documentation).
rwd_arr = env.get_episode_rewards()

In [None]:
# Plot the recorded training rewards, one point per entry of rwd_arr.
plt.style.use("ggplot")
fig, ax = plt.subplots()
ax.plot(rwd_arr)
ax.set_xlabel("episodes")
ax.set_ylabel("reward")
ax.set_title(f"PPO training on {env_name}")
plt.tight_layout()

In [None]:
# Unpack values from each training iteration.
# NOTE(review): `results` is not defined in any visible cell; presumably it
# is the list of per-iteration result dicts from the commented-out
# Ray/RLlib training loop — confirm before running this cell.
rewards = np.hstack(
    [iteration['hist_stats']['episode_reward'] for iteration in results])
learner_stats = [
    iteration['info']['learner']['default_policy'] for iteration in results]
pol_loss = [stats['policy_loss'] for stats in learner_stats]
vf_loss = [stats['vf_loss'] for stats in learner_stats]

# Trailing-window statistics: each window holds the current episode plus up
# to `p` preceding ones (shorter windows at the start of training).
p = 100
windows = [rewards[max(0, i - p):i + 1] for i in range(len(rewards))]
mean_rewards = np.array([np.mean(window) for window in windows])
std_rewards = np.array([np.std(window) for window in windows])

# Layout: rewards on the left half (both rows), the two losses stacked on
# the right half.
fig = plt.figure(constrained_layout=True, figsize=(20, 10))
gs = fig.add_gridspec(2, 4)

# Smoothed reward curve with a +/- one standard deviation band.
ax0 = fig.add_subplot(gs[:, :-2])
episode_idx = np.arange(len(mean_rewards))
lower_band = mean_rewards - std_rewards
upper_band = mean_rewards + std_rewards
ax0.fill_between(episode_idx, lower_band, upper_band,
                 label='Standard Deviation', alpha=0.3)
ax0.plot(mean_rewards, label='Mean Rewards')
ax0.set(ylabel='Rewards', xlabel='Episode', title='Training Rewards')
ax0.legend()

# Policy loss per iteration.
ax1 = fig.add_subplot(gs[0, 2:])
ax1.plot(pol_loss)
ax1.set(ylabel='Loss', xlabel='Iteration', title='Policy Loss')

# Value-function loss per iteration.
ax2 = fig.add_subplot(gs[1, 2:])
ax2.plot(vf_loss)
ax2.set(ylabel='Loss', xlabel='Iteration', title='Value Function Loss')

plt.show()

# Derivative-Free Optimization

In [None]:
from scipy.optimize import minimize

In [None]:
def base_stock_policy(policy, env):
    '''
    Implements a re-order up-to policy. This means that for
    each node in the network, if the inventory at that node
    falls below the level denoted by the policy, we will
    re-order inventory to bring it to the policy level.

    For example, policy at a node is 10, current inventory
    is 5: the action is to order 5 units.

    Parameters
    ----------
    policy : array-like
        Order-up-to level per node; must have the same length as
        ``env.init_inv``.
    env : object
        Environment exposing ``init_inv``, ``period``, ``I``, ``T``, ``B``
        and ``c``. (Presumably on-hand inventory, pipeline inventory,
        backlog and capacity respectively -- confirm against the
        environment definition.)

    Returns
    -------
    numpy.ndarray
        Order quantity per node: the non-negative shortfall against the
        policy level, capped by ``env.c`` and by the current ``env.I``
        value of the next node (the last node has no such cap).

    Raises
    ------
    AssertionError
        If ``policy`` and ``env.init_inv`` differ in length.
    '''
    assert len(policy) == len(env.init_inv), (
        'Policy should match number of nodes in network' +
        '({}, {}).'.format(len(policy), len(env.init_inv)))

    period = env.period

    # Get echelon inventory levels. There is no previous-period entry of
    # env.B to subtract in period 0.
    position = env.I[period] + env.T[period]
    if period != 0:
        position = position - env.B[period - 1, :-1]
    inv_ech = np.cumsum(position)

    # Get unconstrained actions: only order when below the policy level.
    unc_actions = policy - inv_ech
    unc_actions = np.where(unc_actions > 0, unc_actions, 0)

    # Ensure that actions can be fulfilled by checking constraints.
    # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
    inv_const = np.hstack([env.I[period, 1:], np.inf])
    actions = np.minimum(env.c, np.minimum(unc_actions, inv_const))
    return actions

def dfo_func(policy, env, *args):
    '''
    Runs an episode based on current base-stock model
    settings. This allows us to use our environment for the
    DFO optimizer.

    Parameters
    ----------
    policy : array-like
        Candidate re-order levels, passed to ``base_stock_policy``.
    env : object
        Environment to simulate; it is reset before the episode starts.
    *args
        Ignored; accepted so extra optimizer arguments do not break the
        call.

    Returns
    -------
    float
        Negative of the probability-weighted sum of per-step rewards,
        divided by ``env.num_periods`` (negated so a minimizer maximizes
        profit).
    '''
    env.reset()  # Ensure env is fresh
    rewards = []
    done = False
    while not done:
        action = base_stock_policy(policy, env)
        step_out = env.step(action)
        if len(step_out) == 5:
            # Gymnasium-style step:
            # (obs, reward, terminated, truncated, info).
            _, reward, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            # Classic Gym-style step: (obs, reward, done, info).
            _, reward, done, _ = step_out
        rewards.append(reward)

    rewards = np.array(rewards)
    # Weight each step's reward by the probability of the demand values
    # stored in env.D under the environment's demand distribution.
    prob = env.demand_dist.pmf(env.D, **env.dist_param)

    # Return negative of expected profit
    return -1 / env.num_periods * np.sum(prob * rewards)
  
def optimize_inventory_policy(env_name, fun,
        init_policy=None, env_config=None, method='Powell'):
    '''
    Searches for re-order levels by minimizing ``fun`` over an
    environment created from ``env_name``.

    Parameters
    ----------
    env_name : str
        Name passed to ``or_gym.make``.
    fun : callable
        Objective called by the optimizer as ``fun(policy, env)``.
    init_policy : array-like, optional
        Starting point for the optimizer. Defaults to a vector of ones
        with ``env.num_stages - 1`` entries.
    env_config : dict, optional
        Environment configuration passed to ``or_gym.make``. Defaults to
        an empty dict.
    method : str
        Optimization method name passed to ``scipy.optimize.minimize``.

    Returns
    -------
    policy : numpy.ndarray
        Optimizer solution clipped at zero and rounded to integers.
    out : scipy.optimize.OptimizeResult
        Raw optimizer output.
    '''
    # Build a fresh dict per call instead of sharing a mutable default.
    if env_config is None:
        env_config = {}

    env = or_gym.make(env_name, env_config=env_config)

    if init_policy is None:
        init_policy = np.ones(env.num_stages - 1)

    # Optimize policy; extra arguments are passed as an explicit tuple.
    out = minimize(fun=fun, x0=init_policy, args=(env,),
        method=method)

    # Policy must be a non-negative integer. np.maximum returns a new
    # array, so out.x itself is left untouched.
    policy = np.round(np.maximum(out.x, 0), 0).astype(int)

    return policy, out

In [None]:
# Optimize the base-stock levels on the same environment settings that are
# used for the evaluation below, so both refer to one problem instance.
policy, out = optimize_inventory_policy(env_name, dfo_func,
    env_config=env_config)
print("Re-order levels: {}".format(policy))
print("DFO Info:\n{}".format(out))

# Evaluate the optimized policy by accumulating the total reward of each of
# `eps` simulated episodes.
env = or_gym.make(env_name, env_config=env_config)
eps = 1000
rewards = []
for i in range(eps):
    env.reset()
    reward = 0
    done = False
    while not done:
        action = base_stock_policy(policy, env)
        step_out = env.step(action)
        if len(step_out) == 5:
            # Gymnasium-style step:
            # (obs, reward, terminated, truncated, info).
            s, r, terminated, truncated, _ = step_out
            done = terminated or truncated
        else:
            # Classic Gym-style step: (obs, reward, done, info).
            s, r, done, _ = step_out
        reward += r
    rewards.append(reward)