In [None]:
from IMP_CW_env import MESCEnv
import numpy as np 
from scipy.optimize import minimize, basinhopping

from HeuristicPolicy_file import HeuristicPolicy, Optimizer
from utils import plot_reward_distribution

import matplotlib.pyplot as plt
%matplotlib inline

# **Training environment definition**

In [None]:
# Number of nodes in each echelon of the training supply chain.
n_retailers, n_DCs, n_suppliers = 2, 1, 1

# Nested layout passed to MESCEnv: retailers and DCs are wrapped in
# single-element lists, the supplier count is a bare integer.
supply_chain_structure = [
    [n_retailers],
    [n_DCs],
    n_suppliers,
]

# **Policy optimization**
### Instantiate objects

In [None]:
# Training environment for the supply chain defined above.
# The horizon is 4*7 = 28 periods (presumably four weeks of daily periods -- confirm in MESCEnv).
env_train = MESCEnv(
    supply_chain_structure,
    num_periods=4 * 7,
)

# Heuristic (s,S) policy object whose parameters are optimized below.
sSpolicy = HeuristicPolicy()

### **Run optimization**

In [None]:
from scipy.optimize import Bounds

# Box constraints for the six policy parameters, ordered
# [s_r1, S_r1, s_r2, S_r2, s_DC, S_DC]. Every parameter is bounded below by 0
# and above by the order-quantity limit of the facility it belongs to.
bounds = Bounds(
    lb=np.zeros(6),
    ub=[
        facility.order_quantity_limit
        for facility in (env_train.retailers[0], env_train.retailers[1], env_train.DCs[0])
        for _ in range(2)  # one entry for s, one for S
    ],
)

In [20]:
# Wrap the policy objective together with the training environment; the wrapper's
# `calculate_reward` is what gets minimized and its `callback` is invoked by the solver.
optimizer = Optimizer(sSpolicy.objective_fcn, env_train)

# No solver-specific options are supplied.
options = None

# Bounded Powell search over the six (s,S) parameters.
results = sSpolicy.optimize_policy(
    env_train,
    optimizer.calculate_reward,
    method='Powell',
    bounds=bounds,
    callback=optimizer.callback,
    options=options,
)

301.5 301.5
1206.5 905.0
1587.5 381.0
2192.5 605.0
2575.5 383.0
2958.0 382.5
3348.5 390.5
3643.0 294.5
4547.5 904.5
5252.0 704.5
5859.5 607.5
6263.5 404.0
6284.5 21.0
6695.5 411.0
6977.0 281.5
7868.5 891.5
8561.5 693.0
9160.0 598.5
9557.0 397.0
9955.5 398.5
10355.5 400.0
10339.5 -16.0
11248.5 909.0
11931.0 682.5
12520.5 589.5
12911.5 391.0
13308.5 397.0
13710.5 402.0
501.0 501.0
1205.5 704.5
2011.5 806.0
2120.5 109.0
2633.5 513.0
2823.0 189.5
3412.0 589.0
3912.0 500.0
4620.0 708.0
5432.0 812.0
5838.5 406.5
6039.0 200.5
6242.5 203.5
6830.0 587.5
7323.5 493.5
8025.0 701.5
8835.5 810.5
9237.0 401.5
9741.0 504.0
9640.0 -101.0
10253.0 613.0
10746.5 493.5
11448.0 701.5
12253.5 805.5
12653.5 400.0
13147.5 494.0
13039.5 -108.0
13645.5 606.0
-4.0 -4.0
796.0 800.0
1403.0 607.0
2017.0 614.0
2537.0 520.0
2753.5 216.5
3277.0 523.5
3289.5 12.5
4085.5 796.0
4687.5 602.0
5296.5 609.0
5811.5 515.0
6022.5 211.0
6841.0 818.5
6539.0 -302.0
7356.0 817.0
7954.0 598.0
8559.0 605.0
9070.0 511.0
9277.5 207.5
1

KeyboardInterrupt: 

### Get solution

In [None]:
# Ask the optimizer wrapper for its best recorded solution
# (presumably this populates `optimizer.best_x` / `optimizer.best_f` -- confirm in Optimizer).
optimizer.get_best_solution()

# Convert the continuous solution into non-negative integer policy levels:
# round down first, then clamp anything below zero up to zero.
optimal_policy_sS = np.maximum(np.floor(optimizer.best_x), 0.).astype(int)

In [None]:
# Summary of the optimization run. The objective is printed as-is and also
# negated, since the reported "maximum reward" is -1 times the objective value.
print(f'Converged: {results.success}')
print(
    f'Number of iterations: {optimizer.iter} \n'
    f'Best objective function value: {optimizer.best_f:.2f}\n'
    f'Maximum reward: {-1*optimizer.best_f:.2f}\n'
    f'Optimal (s,S) policy: {optimal_policy_sS}'
)

### Visualize reward evolution

In [None]:
# Plot the learning curve recorded by the optimizer wrapper during the run above
# (presumably reward/objective per iteration -- confirm in Optimizer.plot_learning_curve).
optimizer.plot_learning_curve()

### Compare solution to Bayesian Optimization

In [None]:
from bayes_opt import BayesianOptimization

# Search box for Bayesian optimization: each policy parameter ranges from 0 to
# the (integer-cast) order-quantity limit of the facility it belongs to.
# Key order mirrors the policy vector [s_r1, S_r1, s_r2, S_r2, s_DC, S_DC].
pbounds = {
    param_name: (0, int(facility.order_quantity_limit))
    for param_name, facility in (
        ('s_r1', env_train.retailers[0]),
        ('S_r1', env_train.retailers[0]),
        ('s_r2', env_train.retailers[1]),
        ('S_r2', env_train.retailers[1]),
        ('s_DC', env_train.DCs[0]),
        ('S_DC', env_train.DCs[0]),
    )
}

def objectiveBO(s_r1,S_r1,s_r2,S_r2,s_DC,S_DC):
    """Objective for Bayesian optimization: mean episode reward of an (s,S) policy.

    The six arguments are packed into the policy vector
    ``[s_r1, S_r1, s_r2, S_r2, s_DC, S_DC]`` and rolled out on the module-level
    ``env_train`` using ``sSpolicy.policy_fcn``.

    Parameters
    ----------
    s_r1, S_r1 : float
        (s, S) pair for retailer 1.
    s_r2, S_r2 : float
        (s, S) pair for retailer 2.
    s_DC, S_DC : float
        (s, S) pair for the distribution center.

    Returns
    -------
    float
        ``-1e8`` if any pair violates ``s < S`` (infeasible policy); otherwise
        the mean total reward over ``num_runs`` episodes.
    """
    policy_param = np.array([s_r1, S_r1, s_r2, S_r2, s_DC, S_DC])

    # Feasibility depends only on the parameters, so check it once up front
    # instead of resetting the environment before rejecting the candidate.
    # Even indices hold the s values, odd indices the matching S values.
    if np.any(policy_param[0::2] >= policy_param[1::2]):
        return -1e8

    num_runs = 3  # episodes averaged per candidate to smooth out demand noise
    total_reward_list = []
    for _ in range(num_runs):
        env_train.reset()
        total_reward = 0
        episode_terminated = False

        # Roll out one full episode under the candidate policy.
        while not episode_terminated:
            action = sSpolicy.policy_fcn(policy_param, env_train)
            _, reward, episode_terminated, _ = env_train.step(action)
            total_reward += reward

        total_reward_list.append(total_reward)

    return np.mean(total_reward_list)

# Use a dedicated name for the Bayesian optimizer so the SciPy-based `optimizer`
# created earlier (still needed by the "Get solution" / learning-curve cells)
# is not overwritten when this cell runs.
bo_optimizer = BayesianOptimization(
    f=objectiveBO,
    pbounds=pbounds,
    random_state=0,  # fixed seed for reproducible BO proposals
)
bo_optimizer.maximize(
    init_points=25,
    n_iter=25,
)

# Best observation found by BO: {'target': ..., 'params': {...}}.
bo_best = bo_optimizer.max

# Read the parameters by name rather than relying on dict ordering: depending on
# the bayes_opt version the params dict may be sorted alphabetically, which would
# scramble the [s_r1, S_r1, s_r2, S_r2, s_DC, S_DC] layout the policy expects.
bo_param_order = ('s_r1', 'S_r1', 's_r2', 'S_r2', 's_DC', 'S_DC')
bo_best_values = [bo_best['params'][name] for name in bo_param_order]

# Round down to integers and clamp at zero, as done for the SciPy solution.
best_param_BO = np.clip(np.floor(bo_best_values),a_min=0.,a_max=None).astype(int)
print(f"BO optimization results\n- Best return: {bo_best['target']}\n- Best param: {best_param_BO}")

# **(s,S) Policy evaluation**

Note that here the policy is tested in the same environment where it was trained.

### Test dataset of demands
The dataset contains 100 demand samples for testing, each spanning `env_train.n_periods` days.

In [None]:
import pickle

# Load the pre-generated test demands from disk.
# NOTE: pickle can execute arbitrary code while deserializing -- only load files
# from a trusted source.
with open("test_demand_dataset.pickle", 'rb') as dataset_file:
    test_demand_dataset = pickle.load(dataset_file)

### Policy evaluation
Set the optimal policy parameters before running the evaluation.

In [None]:
# Install the integer (s,S) levels found by the SciPy optimization as the
# parameters of the policy object that is evaluated next.
sSpolicy.policy_param = optimal_policy_sS

In [None]:
# Evaluate the configured policy on the test demand dataset using the training
# environment, collecting the resulting rewards
# (presumably one total reward per test sample -- confirm in HeuristicPolicy.evaluate_policy).
reward_list = sSpolicy.evaluate_policy(env_train, test_demand_dataset)
# Visualize the distribution of the collected rewards.
plot_reward_distribution(reward_list)