Example from [Grid Dynamics](https://blog.griddynamics.com/deep-reinforcement-learning-for-supply-chain-and-price-optimization).

In [1]:
import math 
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import pandas as pd
import collections
import json
from copy import deepcopy
from qbstyles import mpl_style
import gym
from gym import spaces
%matplotlib inline
mpl_style(dark=False)

In [19]:
# Scratch cell: build a small array containing NaNs and inspect the upper
# bounds of the environment's action space.
# NOTE(review): `env` is only created in a later cell, so this cell fails on
# "Restart Kernel -> Run All"; `x` is not used by any other visible cell.
x = np.array([np.nan, np.nan, 1])
env.action_space.high

array([20, 20, 20, 20], dtype=int16)

In [20]:
# Build Multi-Echelon Environment
class SupplyChainEnv(gym.Env):
    """Multi-echelon supply chain: one factory feeding several warehouses.

    Action (length ``warehouse_num + 1``): ``action[0]`` is the number of units
    produced at the factory, ``action[1:]`` are the units shipped from the
    factory to each warehouse.

    Observation (float32 vector): factory stock, per-warehouse stock, the
    flattened demand history and the current timestep, in the order recorded
    in ``self.state_indices``.

    Parameters
    ----------
    config : dict or None
        Optional overrides for ``warehouse_num``, ``demand_history_len``,
        ``T``, ``d_max``, ``d_var``, ``unit_price`` and ``unit_cost``.
        An empty dict (or ``None``) keeps the defaults.
    """

    def __init__(self, config):
        config = config or {}

        self.warehouse_num = config.get('warehouse_num', 3)
        self.demand_history_len = config.get('demand_history_len', 4)
        self.min_obs, self.max_obs = -10000, 10000
        self.T = config.get('T', 26)            # an episode ends when t == T - 1
        self.d_max = config.get('d_max', 5)
        self.d_var = config.get('d_var', 2)
        self.unit_price = config.get('unit_price', 100)
        self.unit_cost = config.get('unit_cost', 40)

        # Index 0 is the factory, indices 1..warehouse_num are the warehouses.
        echelons = np.arange(1, self.warehouse_num + 2)
        self.storage_capacities = 10 * echelons
        self.storage_costs = 2 * echelons
        self.transportation_costs = 5 * echelons[:-1]
        self.penalty_unit_cost = self.unit_price

        self.action_space = spaces.Box(low=0.0, high=20.0,
            shape=(self.warehouse_num + 1,), dtype=np.int16)

        # The observation shape is derived from an actual initial state.
        self.reset()
        self.observation_space = spaces.Box(self.min_obs, self.max_obs,
            shape=(self.state.shape), dtype=np.float32)

        self.state_indices = {
            'factory_stock': 0,
            'warehouse_stock': np.arange(1, self.warehouse_num + 1),
            'demand_history': np.arange(self.warehouse_num + 1,
                self.warehouse_num * (1 + self.demand_history_len) + 1),
            'timestep': self.warehouse_num * (1 + self.demand_history_len) + 1
        }

    def _generate_demand(self, j, t):
        """Rounded sinusoidal demand for warehouse index ``j`` at time ``t``.

        ``np.random.randint`` is called once per invocation, so when ``j`` is
        an array every element receives the same noise term.
        """
        seasonal = self.d_max/2*np.sin(2*np.pi*(t + 2*j) / self.T*2)
        noise = np.random.randint(0, self.d_var)
        return np.round(self.d_max/2 + seasonal + noise)

    def _initialize_demand(self):
        """Reset the demand history to ``demand_history_len`` all-zero entries."""
        self.demand_history = collections.deque(
            (np.zeros(self.warehouse_num)
             for _ in range(self.demand_history_len)),
            maxlen=self.demand_history_len)

    def generate_demand(self):
        """Return the demand vector (one entry per warehouse) for ``self.t``."""
        # np.fromfunction calls the lambda once with the full index array.
        return np.fromfunction(
            lambda x: self._generate_demand(x + 1, self.t),
            (self.warehouse_num,), dtype=int)

    def init_state(self):
        """Reset the clock, the demand history and all stock levels to zero."""
        self.t = 0
        self._initialize_demand()
        self.factory_stock = 0
        self.warehouse_stock = np.repeat(
            self.factory_stock, self.warehouse_num)

    def get_state(self):
        """Assemble the observation vector from the current attributes.

        The result is clipped to the observation bounds and cast to float32 so
        that its dtype agrees with ``observation_space``.
        """
        state = np.concatenate(
            ([self.factory_stock],
              self.warehouse_stock,
              np.hstack(self.demand_history),
             [self.t]))
        return self._check_bounds(state).astype(np.float32)

    def reset(self):
        """Start a new episode and return the initial observation."""
        self.init_state()
        self.state = self.get_state()
        return self.state

    def get_stock_levels(self):
        """Return stock levels as ``[factory, warehouse_1, ..., warehouse_n]``."""
        return np.concatenate(
            ([self.factory_stock], self.warehouse_stock))

    def _check_bounds(self, state):
        """Clip ``state`` element-wise into ``[min_obs, max_obs]``."""
        return np.clip(state, self.min_obs, self.max_obs)

    def _check_action(self, action):
        """Return ``action`` as a float array with NaN entries replaced.

        Each NaN component is replaced by the upper bound of the *same*
        component of the action space (previously the whole ``high`` array was
        substituted for a single element, producing a ragged array).
        """
        action = np.asarray(action, dtype=np.float64)
        return np.where(np.isnan(action), self.action_space.high, action)

    def _raise_on_invalid_reward(self, reward, total_revenue,
            total_production_cost, total_storage_cost, total_penalty_cost,
            total_transportation_cost, action):
        """Raise ``ValueError`` with a cost breakdown if ``reward`` is NaN/Inf."""
        if np.isfinite(reward):
            return
        label = "NaN" if np.isnan(reward) else "Inf"
        raise ValueError(
            (label + " found in reward.\nState = {}\nReward = {}").format(
                self.state, reward)
            + "\nRev = {}\tProd Cost = {}\tStorage Cost = {}".format(
                total_revenue, total_production_cost, total_storage_cost)
            + "\nPenalty Cost = {}\tTransportation Cost = {}".format(
                total_penalty_cost, total_transportation_cost)
            + "\nActions = {}".format(action))

    def step(self, action, demand=None):
        """Advance the simulation by one period.

        Parameters
        ----------
        action : array-like of length ``warehouse_num + 1``
            Production quantity followed by the shipment to each warehouse.
        demand : np.ndarray, optional
            Demand per warehouse for this period; sampled with
            ``generate_demand`` when omitted.

        Returns
        -------
        (state, reward, done, info)
            ``done`` is True once ``t`` reaches ``T - 1``; ``info`` is empty.

        Raises
        ------
        ValueError
            If the computed reward is NaN or infinite.
        """
        if demand is None:
            demand = self.generate_demand()
        action = self._check_action(action)

        # Costs are evaluated on the stock levels held *before* this update.
        total_revenue = self.unit_price * demand.sum()
        total_production_cost = self.unit_cost * action[0]
        total_storage_cost = np.dot(self.storage_costs,
            np.maximum(self.get_stock_levels(),
                np.zeros(self.warehouse_num + 1)))
        # Negative stock is penalised at penalty_unit_cost per unit.
        total_penalty_cost = -self.penalty_unit_cost * (np.sum(
            np.minimum(self.warehouse_stock, np.zeros(self.warehouse_num)))
            + np.minimum(self.factory_stock, 0))
        total_transportation_cost = np.dot(self.transportation_costs,
            action[1:])
        reward = (total_revenue - total_production_cost - total_storage_cost
                  - total_penalty_cost - total_transportation_cost)
        self._raise_on_invalid_reward(reward, total_revenue,
            total_production_cost, total_storage_cost, total_penalty_cost,
            total_transportation_cost, action)

        # Update stock levels; anything above the storage capacity is dropped.
        self.factory_stock = min(
            self.factory_stock + action[0] - np.sum(action[1:]),
            self.storage_capacities[0])
        self.warehouse_stock = np.minimum(
            self.warehouse_stock + action[1:] - demand,
            self.storage_capacities[1:]).astype(np.float64)

        # NOTE(review): the observation is assembled *before* the clock and the
        # demand history advance, so it carries the previous timestep and does
        # not yet include this period's demand. The ordering is kept as-is
        # because the notebook's parity check against SimpleSupplyChain relies
        # on it.
        self.state = self.get_state()
        self.t += 1
        self.demand_history.append(demand)

        return self.state, reward, self.t == self.T - 1, {}

In [21]:
# Instantiate the environment with an empty config, i.e. its default settings.
env = SupplyChainEnv({})

In [22]:
from sc_env import SimpleSupplyChain

# Parity check: drive SupplyChainEnv and the reference SimpleSupplyChain with
# the same random actions and the same demand, then report per-episode how
# often states and rewards agree.
env1 = SupplyChainEnv({})
env2 = SimpleSupplyChain({})

total_rewards = []
ep_matching_states = []
ep_matching_rewards = []
for i in range(100):
    # Ensure same outputs from different models
    env1.reset()
    env2.reset()
    states, rewards = [], []
    # detailed_states keeps the side-by-side state vectors for debugging.
    detailed_states, matching_rewards = [], []
    done = False
    while not done:
        action = env2.action_space.sample()
        s2, r2, done2, _ = env2.step(action)
        # Reuse the demand the reference env just drew so both see the same one.
        demand = env2.supply_chain.demand_history[-1].copy()
        s1, r1, done1, _ = env1.step(action, demand)
        states.append(s1 == s2)
        detailed_states.append(np.vstack([s1, s2]).T)
        matching_rewards.append(r1 == r2)
        rewards.append(r2)
        # The reference environment decides when the episode ends.
        done = bool(done2)

    # Percentage of matching state components / rewards within this episode.
    ep_matching_states.append(np.mean(states) * 100)
    ep_matching_rewards.append(np.mean(matching_rewards) * 100)
    total_rewards.append(sum(rewards))

print(np.mean(ep_matching_states))
print(np.mean(ep_matching_rewards))
print(np.mean(total_rewards))

100.0
100.0
-603536.8


In [6]:
import ray
from ray import tune
from ray.rllib.utils import try_import_tf

import ray.rllib.agents.ddpg as ddpg
from ray.rllib import agents
from ray.tune.logger import pretty_print

In [7]:
# try_import_tf is imported from ray.rllib.utils; its result is kept as `tf`.
tf = try_import_tf()
    
# Uncomment to tear down an already-running Ray instance first:
# ray.shutdown()
# ignore_reinit_error=True is passed so that re-running this cell while Ray is
# already initialised is tolerated (see the ray.init documentation).
ray.init(ignore_reinit_error=True)

def train_ddpg(iters=10):
    """Train a DDPG agent on ``SupplyChainEnv`` and checkpoint every iteration.

    Parameters
    ----------
    iters : int, optional
        Number of ``trainer.train()`` calls to run (default 10).

    Returns
    -------
    (trainer, result)
        The trainer and the result dict of the last iteration, or ``None`` as
        the result when ``iters`` is 0.
    """
    config = ddpg.DEFAULT_CONFIG.copy()
    config.update({
        "log_level": "WARN",
        "actor_hiddens": [512, 512],
        "critic_hiddens": [512, 512],
        "gamma": 0.95,
        "timesteps_per_iteration": 1000,
        "target_network_update_freq": 5,
        "buffer_size": 10000,
    })

    trainer = ddpg.DDPGTrainer(config=config, env=SupplyChainEnv)
    result = None
    for _ in range(iters):
        result = trainer.train()
        print(pretty_print(result))
        checkpoint = trainer.save()
        print("Checkpoint saved at", checkpoint)
    return trainer, result

2020-05-12 09:16:27,149	INFO resource_spec.py:216 -- Starting Ray with 6.01 GiB memory available for workers and up to 3.01 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


In [23]:
# Run the DDPG training loop; keeps the trainer and the result dict returned
# for the last training iteration.
trainer, result = train_ddpg()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


custom_metrics: {}
date: 2020-05-11_18-29-25
done: false
episode_len_mean: 25.0
episode_reward_max: -545228.857820034
episode_reward_mean: -545228.857820034
episode_reward_min: -545228.857820034
episodes_this_iter: 1
episodes_total: 1
experiment_id: e72308bfadaf4ce5ad2335f68d334500
hostname: ubuntu
info:
  grad_time_ms: .nan
  learner: {}
  max_exploration: 1.0
  min_exploration: 1.0
  num_steps_sampled: 1000
  num_steps_trained: 0
  num_target_updates: 166
  opt_peak_throughput: 0.0
  opt_samples: .nan
  replay_time_ms: .nan
  sample_time_ms: 1.337
  update_time_ms: 0.001
iterations_since_restore: 1
node_ip: 192.168.0.11
num_healthy_workers: 0
off_policy_estimator: {}
perf:
  cpu_util_percent: 24.666666666666668
  ram_util_percent: 30.600000000000005
pid: 4110
policy_reward_max: {}
policy_reward_mean: {}
policy_reward_min: {}
sampler_perf:
  mean_env_wait_ms: 0.21058124500316577
  mean_inference_ms: 0.7920177071006386
  mean_processing_ms: 0.22159947024716006
time_since_restore: 1.639

custom_metrics: {}
date: 2020-05-11_18-33-41
done: false
episode_len_mean: 25.0
episode_reward_max: -545228.857820034
episode_reward_mean: -545228.857820034
episode_reward_min: -545228.857820034
episodes_this_iter: 0
episodes_total: 1
experiment_id: e72308bfadaf4ce5ad2335f68d334500
hostname: ubuntu
info:
  grad_time_ms: 21.717
  learner:
    default_policy:
      max_q: -5558.4970703125
      mean_q: -14923014.0
      min_q: -27734554.0
  max_exploration: 1.0
  min_exploration: 1.0
  num_steps_sampled: 7000
  num_steps_trained: 1408000
  num_target_updates: 1166
  opt_peak_throughput: 11787.819
  opt_samples: 256.0
  replay_time_ms: 18.314
  sample_time_ms: 2.567
  update_time_ms: 0.002
iterations_since_restore: 7
node_ip: 192.168.0.11
num_healthy_workers: 0
off_policy_estimator: {}
perf:
  cpu_util_percent: 20.441176470588236
  ram_util_percent: 35.76323529411763
pid: 4110
policy_reward_max: {}
policy_reward_mean: {}
policy_reward_min: {}
sampler_perf:
  mean_env_wait_ms: 0.2105812450

In [8]:
import matplotlib.pyplot as plt

In [13]:
def set_config(default_config, config_dict=None):
    """Return a deep copy of ``default_config`` with overrides applied.

    Parameters
    ----------
    default_config : dict
        Baseline configuration; it is never modified.
    config_dict : dict, optional
        Overrides. Keys that do not exist in ``default_config`` are ignored,
        so one override dict can be shared between algorithms. When both the
        default and the override value for a key are dicts, the override is
        merged into the nested default instead of replacing it.

    Returns
    -------
    dict
        The merged configuration.
    """
    config = deepcopy(default_config)
    if not isinstance(config_dict, dict):
        return config

    for key, value in config_dict.items():
        if key not in config:
            # Unknown keys are skipped on purpose (see docstring).
            continue
        if isinstance(config[key], dict) and isinstance(value, dict):
            # Merge nested settings from config_dict[key]; the previous code
            # looked the nested key names up at the top level of config_dict.
            config[key].update(value)
        else:
            config[key] = value

    return config

def train_agent(algo='a3c', iters=10, config_dict=None):
    """Train an RLlib agent of type ``algo`` on ``SupplyChainEnv``.

    Parameters
    ----------
    algo : str
        Name of a module under ``ray.rllib.agents`` (e.g. ``'a3c'``).
    iters : int
        Number of ``trainer.train()`` calls.
    config_dict : dict, optional
        Overrides applied to the algorithm's ``DEFAULT_CONFIG`` through
        ``set_config``.

    Returns
    -------
    (trainer, results)
        The trainer and the list of result dicts, one per iteration.

    Raises
    ------
    AttributeError
        If ``algo`` is not an attribute of ``agents`` or no matching trainer
        class is found in that module.
    """
    if not hasattr(agents, algo):
        raise AttributeError('No attribute {}'.format(algo))

    agent = getattr(agents, algo)
    config = set_config(agent.DEFAULT_CONFIG, config_dict)

    # Trainer classes are usually named '<ALGO>Trainer'; fall back to a
    # case-insensitive match for modules that capitalise differently.
    trainer_name = algo.upper() + 'Trainer'
    trainer_cls = getattr(agent, trainer_name, None)
    if trainer_cls is None:
        candidates = [name for name in dir(agent)
                      if name.lower() == trainer_name.lower()]
        if not candidates:
            raise AttributeError('No attribute {}'.format(trainer_name))
        trainer_cls = getattr(agent, candidates[0])
    trainer = trainer_cls(config, env=SupplyChainEnv)

    results = []
    for n in range(iters):
        result = trainer.train()
        results.append(result)
        # Report progress every tenth iteration only, to keep the output short.
        if (n + 1) % 10 == 0:
            print("Iter:\t{}\tMean Rewards:\t{:.1f}".format(n+1, result['episode_reward_mean']))

    return trainer, results

In [23]:
# Overrides handed to train_agent, which applies them through set_config.
# NOTE(review): actor_hiddens / critic_hiddens / buffer_size look like DDPG
# settings; set_config skips keys that are missing from the algorithm's
# DEFAULT_CONFIG, so they may have no effect for 'a3c' — confirm.
config_dict = {'actor_hiddens': [128, 128],
               'critic_hiddens': [128, 128],
               'timesteps_per_iteration': 1000,
               'buffer_size': 10000}

# Train for 200 iterations; progress is printed every 10th iteration.
agent, results = train_agent('a3c', 200, config_dict)



Iter:	10	Mean Rewards:	-626198.1
[2m[36m(pid=19380)[0m Exception in thread Thread-1:
[2m[36m(pid=19380)[0m Traceback (most recent call last):
[2m[36m(pid=19380)[0m   File "/home/christian/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
[2m[36m(pid=19380)[0m     self.run()
[2m[36m(pid=19380)[0m   File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 173, in run
[2m[36m(pid=19380)[0m     raise e
[2m[36m(pid=19380)[0m   File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 170, in run
[2m[36m(pid=19380)[0m     self._run()
[2m[36m(pid=19380)[0m   File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 193, in _run
[2m[36m(pid=19380)[0m     item = next(rollout_provider)
[2m[36m(pid=19380)[0m   File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 340, in _env_run

RayTaskError(RuntimeError): [36mray::RolloutWorker.sample()[39m (pid=19380, ip=192.168.0.11)
  File "python/ray/_raylet.pyx", line 629, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 630, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 519, in ray._raylet.deserialize_args
ray.exceptions.RayTaskError: [36mray::RolloutWorker.sample()[39m (pid=19380, ip=192.168.0.11)
  File "python/ray/_raylet.pyx", line 636, in ray._raylet.execute_task
  File "python/ray/_raylet.pyx", line 619, in ray._raylet.execute_task.function_executor
  File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/rollout_worker.py", line 471, in sample
    batches = [self.input_reader.next()]
  File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 56, in next
    batches = [self.get_data()]
  File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 201, in get_data
    raise RuntimeError("Sampling thread has died")
RuntimeError: Sampling thread has died

[2m[36m(pid=19379)[0m Exception in thread Thread-1:
[2m[36m(pid=19379)[0m Traceback (most recent call last):
[2m[36m(pid=19379)[0m   File "/home/christian/anaconda3/lib/python3.6/threading.py", line 916, in _bootstrap_inner
[2m[36m(pid=19379)[0m     self.run()
[2m[36m(pid=19379)[0m   File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 173, in run
[2m[36m(pid=19379)[0m     raise e
[2m[36m(pid=19379)[0m   File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 170, in run
[2m[36m(pid=19379)[0m     self._run()
[2m[36m(pid=19379)[0m   File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 193, in _run
[2m[36m(pid=19379)[0m     item = next(rollout_provider)
[2m[36m(pid=19379)[0m   File "/home/christian/anaconda3/lib/python3.6/site-packages/ray/rllib/evaluation/sampler.py", line 340, in _env_runner
[2m[36m(pid=19379)[0m     

2020-05-12 11:29:32,736	ERROR worker.py:1056 -- listen_error_messages_raylet: Connection closed by server.
2020-05-12 11:29:32,739	ERROR import_thread.py:97 -- ImportThread: Connection closed by server.


[2m[33m(pid=raylet)[0m *** Aborted at 1589300972 (unix time) try "date -d @1589300972" if you are using GNU date ***
[2m[33m(pid=raylet)[0m PC: @                0x0 (unknown)
[2m[33m(pid=raylet)[0m *** SIGTERM (@0x3e8000056d8) received by PID 19364 (TID 0x7faf07d7b740) from PID 22232; stack trace: ***
[2m[33m(pid=raylet)[0m     @     0x7faf072f0390 (unknown)
[2m[33m(pid=raylet)[0m     @     0x7faf06bfe9f3 epoll_wait
[2m[33m(pid=raylet)[0m     @           0x418e1c boost::asio::detail::epoll_reactor::run()
[2m[33m(pid=raylet)[0m     @           0x4194b9 boost::asio::detail::scheduler::run()
[2m[33m(pid=raylet)[0m     @           0x409c5f main
[2m[33m(pid=raylet)[0m     @     0x7faf06b17830 __libc_start_main
[2m[33m(pid=raylet)[0m     @           0x40ed81 (unknown)


2020-05-12 11:29:32,748	ERROR worker.py:956 -- print_logs: Connection closed by server.


In [None]:
# Learning curve: mean episode reward reported by each training iteration.
mean_rewards = [i['episode_reward_mean'] for i in results]
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(mean_rewards)
ax.set_xlabel('Training iteration')
ax.set_ylabel('Mean episode reward')
ax.set_title('Mean episode reward per training iteration')
plt.show()

In [74]:
# Random-policy baseline: play 10000 episodes with uniformly sampled actions
# and record the total reward collected in each one.
env = SupplyChainEnv({})
rewards = []
for _ in range(10000):
    env.reset()
    episode_return, finished = 0, False
    while not finished:
        _obs, step_reward, finished, _info = env.step(env.action_space.sample())
        episode_return += step_reward
    # The loop only exits once the episode is over, so record its return here.
    rewards.append(episode_return)

print(np.mean(rewards), np.min(rewards))

-608386.6926 -923634.0


In [75]:
# Best total episode reward among the random-action rollouts collected above.
np.max(rewards)

-306666.0