In [1]:
from IMP_CW_env import MESCEnv
import numpy as np 
from scipy.optimize import minimize

In [7]:
def optimize_inventory_policy(env, reward_fcn, policy_fcn, initial_policy=None, method='Powell'):

    if initial_policy is None:
        initial_policy = np.ones(env.n_actions*2,dtype=env.retailers[-1].I.dtype)
        initial_policy[1::2] = [retailer.order_quantity_limit for retailer in env.retailers] + [dc.order_quantity_limit for dc in env.DCs]
        initial_policy[0::2] = (1/3) * initial_policy[1::2] 

    results = minimize(fun = reward_fcn,
                       x0 = initial_policy,
                       args = (policy_fcn, env),
                       method = method,
                       )
    
    policy = results.x.copy()
    # Enforce policy is composed of non-integer positive numbers
    policy = np.clip(np.round(policy), a_min = 0, a_max = None)

    return policy, results

def reward_fcn(policy_param, policy_function, env):
    '''
    Runs an episode and computes the negative of the reward function.
    Reward function = expected profit
    '''
    rewards = []
    episode_terminated = False
    env.reset()
    if any(policy_param[0::2] >= policy_param[1::2]):
        return 1e8
    else: 
        while episode_terminated == False:
            action = policy_function(policy_param, env)
            state , reward, episode_terminated, _ = env.step(action)
            rewards.append(reward)

            return -1/env.n_periods * np.sum(env.prob_per_scenario*rewards)

def policy_function_sS(policy_param, env):
    # Check that given x0 is a valid policy
    assert len(policy_param) == (len(env.retailers)+len(env.DCs))*2, "(s,S) policy should match the number of entities*2. \nMismatch {} vs {}".format(len(policy_param), (len(env.retailers)+len(env.DCs))*2)
    
    # Compute the action (order quantity)
    action = np.zeros(env.n_actions, dtype=env.retailers[-1].I.dtype)
    order_quantity_limit = np.array([retailer.order_quantity_limit for retailer in env.retailers] + [dc.order_quantity_limit for dc in env.DCs])
    for i, state in enumerate(env.state[:-1]):
        if state <= policy_param[i*2]:
            action[i] = np.minimum(policy_param[i*2+1] - state , order_quantity_limit[i])
        else:
            action[i] = 0

    return action.astype(env.retailers[-1].I.dtype) 

In [8]:
n_retailers = 2
n_DCs = 1
n_suppliers = 1
supply_chain_structure = [n_retailers , n_DCs, n_suppliers]

In [9]:
inv_env = MESCEnv(supply_chain_structure)
optimal_policy, results = optimize_inventory_policy(inv_env, reward_fcn, policy_function_sS)
print("Re-order levels: ", optimal_policy)

Re-order levels:  [  7.  26.  10.  11.  38. 104.]
