# 행동제약이 존재하는 재고관리문제에서 강화학습 에이전트의 조정방안 연구
DLCD (Decentralized Learning & Centralized Decision making) optimzation model  
예산 제약을 가진 공급망의 재고관리문제를 Q-Learning 및 이차계획법을 이용해 풀이

- Environment : supply chain (consisted of one supplier and two vendors)
- States: current inventory, current backlog
- Agents: vendors
- Action: outgoing order
- Reward: total supply chain cost (= inventory cost + backlog cost)

Warehouse Simulation 출처: https://github.com/AbhinavJhanwar/Reinforcement-Learning

### Preset and parameters

In [1]:
# import required packages
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import _pickle as pickle
import math
import time
np.random.seed(42)

In [2]:
# local directory
current_path = os.getcwd()
data_DIR = os.path.join(current_path, 'data')
file_DIR = os.path.join(current_path, 'output')


# data path
# named 'csv' for training and 'csv_test' for testing
csv_A = os.path.join(data_DIR, 'martA.csv')
csv_B = os.path.join(data_DIR, 'martB.csv')
csv_test_A = os.path.join(data_DIR, 'testA.csv')
csv_test_B = os.path.join(data_DIR, 'testB.csv')

In [3]:
# budget
# we are considering 2-echlon scm with one supplier and two vendors
# one supplier manages multiple vendors with single budget
beginning_budget = 96000*1
closing_budget = 0


# set flag for training or testing
train = True
# train = False


# set the number of episodes for training and testing
episodes = (50 if train else 30)


# create a folder for each budget we are experimenting with
output_DIR = os.path.join(file_DIR, 'dlcd_{}/'.format(beginning_budget))
if train:
    try:
        if not os.path.exists(output_DIR):
            print('Create folder:', output_DIR)
            os.makedirs(output_DIR)
    except OSError:
        print('Error: Creating Folder', output_DIR)

In [4]:
# item dicationary to change column names
item_dict = {'A':'orange', 'B':'egg', 'C':'hotdog', 'D': 'cheese', 'E':'pork'}


# set the items for each mart
# item name, inventory, backlog, incoming_delivery, item cost
items_A = [('orange', 400, 0, 10, 50),
           ('egg', 400, 0, 10, 60),
           ('hotdog', 400, 0, 10, 45)]

items_B = [('orange', 500, 0, 10, 50),
           ('egg', 500, 0, 10, 60),
           ('cheese', 500, 0, 10, 55),
           ('pork', 500, 0, 10, 65)]


# holding cost and backlog cost for each mart
# holding cost per unit is the cost incurred to store leftover inventory
# backlog cost per unit is the cost incurred by unsatisfied demands
hb_cost_A = [0.5, 1.0]
hb_cost_B = [0.5, 1.0]


# item cost
costs = {'orange': 50, 'egg': 60, 'hotdog': 45, 'cheese': 55, 'pork': 65}


# minimum cost of inventory
min_cost = {'orange': 0, 'egg': 0, 'hotdog': 0, 'cheese': 0, 'pork': 0}


# maximum inventory
max_inventory_A =  {'orange': 1000, 'egg': 1000, 'hotdog': 1000}
max_inventory_B = {'orange': 1000, 'egg': 1000, 'cheese': 1000, 'pork': 1000}


# all possible actions at any state of warehouse
# action means order quantity which can be purchased per item
all_possible_actions = {'orange': [0, 10, 20, 30, 40, 50, 60, 70, 80],
                        'egg': [0, 10, 20, 30, 40, 50, 60, 70, 80],
                        'hotdog': [0, 10, 20, 30, 40, 50, 60, 70, 80],
                        'cheese': [0, 10, 20, 30, 40, 50, 60, 70, 80],
                        'pork': [0, 10, 20, 30, 40, 50, 60, 70, 80]}


# number of levels to divide the total state
# into -n_bins (for backlog) to n_bins (for inventory)
n_bins = 100


# all possible states at any state of warehouse
all_states = list(range(-n_bins, n_bins+1))


# define delivery delay from upstream in weeks
time_delay = 1


# get random incoming delivery or not
# if False, incoming delivery is outgoing order
random_incoming_delivery = False


# total number of weeks to consider an episode is over
number_of_weeks = 24


# learning rate and discount factor
alpha, gamma = 0.4, 0.9

### Define functions and environment

In [5]:
class Vendor:
    def __init__(self, vendor_name, items, max_inventory):
        # initialize various parameters of items
        # [current week, next week]
        self.name = vendor_name
        self.item = []
        self.inventory = {}
        self.backlog = {}
        self.incoming_delivery = {}
        self.outgoing_delivery = {}
        self.incoming_order = {}
        self.outgoing_order = {}
        self.upstream_pending_delivery = {}
        self.cost = {}
        self.max_inventory = max_inventory
        
        for item_name, inventory, backlog, incoming_delivery, item_cost in items:
            self.item.append(item_name)
            self.inventory[item_name] = [inventory, -1]
            self.backlog[item_name] = [backlog, -1]
            self.incoming_delivery[item_name] = incoming_delivery
            self.outgoing_delivery[item_name] = -1
            self.incoming_order[item_name] = -1
            self.cost[item_name] = item_cost
            # considering upstream
            self.outgoing_order[item_name] = [10]*time_delay
            self.upstream_pending_delivery[item_name] = 0
    
    
    def set_cost(self, hb_cost):
        # set cost parameters of mart
        # cost might vary depending on the situation of each mart
        self.holding_cost = hb_cost[0]
        self.backlog_cost = hb_cost[1]    
    
    
    def get_reward(self, item_name):
        # a single reward for an item in a particular state
        # inventory cost = holding cost + backlog cost
        holding_cost = max(self.inventory[item_name][1]*self.holding_cost, min_cost[item_name])
        backlog_cost = self.backlog[item_name][1]*self.backlog_cost
        return -(holding_cost+backlog_cost)
    
    
    def get_rewards(self):
        # total reward of items in a particular state
        # total inventory cost of a mart
        rewards = [self.get_reward(item_name) for item_name in self.item]
        return sum(rewards)
    
    
    def get_cost(self):
        # total order cost in a particular state
        # total order cost of a mart
        cost = [self.cost[item_name]*self.outgoing_order[item_name][0] for item_name in self.item]
        return sum(cost)
    
    
    def current_state(self):
        # get the current state of an environment
        # the actual inventory is in the category betwenn -n_bins and n_bins
        states = {}
        for item_name in self.item:
            s = int((self.inventory[item_name][1]-self.backlog[item_name][1])*n_bins/self.max_inventory[item_name])
            states[item_name] = min(n_bins, max(-n_bins, s))
        return states
    
    
    def update_state(self):
        # state transition
        # get outgoing delivery for current state, inventory and backlog of next state
        for item_name in self.item:
            if self.incoming_order[item_name] >= self.inventory[item_name][0]+self.incoming_delivery[item_name]:
                self.outgoing_delivery[item_name] = self.inventory[item_name][0]+self.incoming_delivery[item_name]
                self.backlog[item_name][1] = self.backlog[item_name][0]+self.incoming_order[item_name]-self.outgoing_delivery[item_name]
                self.inventory[item_name][1] = 0
            
            elif self.incoming_order[item_name] <= self.inventory[item_name][0]+self.incoming_delivery[item_name]-self.backlog[item_name][0]:
                self.outgoing_delivery[item_name] = self.incoming_order[item_name]+self.backlog[item_name][0]
                self.backlog[item_name][1] = 0
                self.inventory[item_name][1] = self.inventory[item_name][0]+self.incoming_delivery[item_name]-self.outgoing_delivery[item_name]
            
            elif self.incoming_order[item_name] > self.inventory[item_name][0]+self.incoming_delivery[item_name]-self.backlog[item_name][0]:
                self.outgoing_delivery[item_name] = self.inventory[item_name][0]+self.incoming_delivery[item_name]
                self.backlog[item_name][1] = self.backlog[item_name][0]+self.incoming_order[item_name]-self.outgoing_delivery[item_name]
                self.inventory[item_name][1] = 0
    
    
    def clock_tick(self):
        # modify the state of the environment as per the action taken by the agent
        # return the cost of current state of environment        
        costs = {}
        for item_name in self.item:
            # get a cost of current state for the given action (outgoing order)
            costs[item_name] = self.get_reward(item_name)
            
            # get incoming delivery for the next cycle
            if random_incoming_delivery:
                self.incoming_delivery[item_name], self.upstream_pending_delivery[item_name] = random_delivery(self.outgoing_order[item_name][0], self.upstream_pending_delivery[item_name])
            else:
                self.incoming_delivery[item_name] = self.outgoing_order[item_name][0]
                
            # update next week values to current week
            self.backlog[item_name][0] = self.backlog[item_name][1]
            self.inventory[item_name][0] = self.inventory[item_name][1]
            
            # update outgoing order based on time delay
            for i in range(time_delay-1):
                self.outgoing_delivery[item_name][i] = self.outgoing_order[item_name][i+1]
        return costs
    
    
    def data_log(self):
        # values of current state
        log = {}
        s = 'State_{}'.format(self.name)
        for item_name in self.item:
            u_i = 'Current_Inventory_{}_{}'.format(self.name, item_name)
            b_o = 'Backlog_Orders_{}_{}'.format(self.name, item_name)
            i_d = 'Incoming_Delivery_{}_{}'.format(self.name, item_name)
            o_d = 'Outgoing_Delivery_{}_{}'.format(self.name, item_name)
            o_o = 'Outgoing_Order_{}_{}'.format(self.name, item_name)
            i_o = 'Incoming_Order_{}_{}'.format(self.name, item_name)
            c_i = 'Closing_Inventory_{}_{}'.format(self.name, item_name)
            c_b = 'Closing_Backlog_{}_{}'.format(self.name, item_name)
            p_d = 'Pending_Delivery_{}_{}'.format(self.name, item_name)
            
            log.update({
                s: self.current_state(),
                u_i: self.inventory[item_name][0],
                b_o: self.backlog[item_name][0],
                i_d: self.incoming_delivery[item_name],
                o_d: self.outgoing_delivery[item_name],
                o_o: self.outgoing_order[item_name][time_delay-1],
                i_o: self.incoming_order[item_name],
                c_i: self.inventory[item_name][1],
                c_b: self.backlog[item_name][1],
                p_d: self.upstream_pending_delivery[item_name]
                })
        return log

In [6]:
class SupplyChain:
    def __init__(self, beginning_budget, closing_budget):
        # create instance using class Vendor
        self.mart_A = Vendor('Atlanta', items_A, max_inventory_A)
        self.mart_B = Vendor('Boston', items_B, max_inventory_B)
        self.marts = [self.mart_A, self.mart_B]
        self.items = list(set(self.mart_A.item+self.mart_B.item))
        
        # one supplier manages multiple vendors with single budget
        # [current week, next week]
        self.beginning_budget = [beginning_budget, -1]
        self.closing_budget = [closing_budget, -1]
        self.total_cost = 0
        self.week = 0
    
    
    def budget_state(self):
        # get the current budget of environment
        return max(0, int(self, self.beginning_budget[1]))
    
    
    def update_budget(self):
        # budget state transition
        # compare budget and order cost
        cost = sum([mart.get_cost() for mart in self.marts])
        
        if self.beginning_budget-cost >= 0:
            self.closing_budget[0] = self.beginning_budget[0]-cost
            self.beginning_budget[1] = self.closing_budget[0]
        else:
            self.closing_budget[0] = 0
            self.beginning_budget[1] = self.closing_budget[0]
    
    
    def clock_tick(self):
        # modify the budget state of an environment
        self.beginning_budget[0] = self.beginning_budget[1]
        self.closing_budget[0] = self.closing_budget[1]
    
    
    def year_over(self):
        # one episode is the year-end number of weeks
        # return True or False
        return self.week == number_of_weeks
    
    
    def all_states(self):
        # get all the possible states of an environment
        return list(range(-n_bins, n_bins+1))
    
    
    def update_data_log(self, warehouse_log):
        # update the dataframe
        for mart in self.marts:
            self.total_cost += mart.get_rewards()
            
        log = {'Episode': episode, 'Total_Cost': abs(self.total_cost), 'Week': self.week}
        if train is False:
            log.update({'Beginning_Budget': self.beginning_budget[0], 'Closing_Budget':self.closing_budget[0]})
        
        for mart in self.marts:
            log.update(mart.data_log())
        warehouse_log = warehouse_log.append(log, ignore_index=True)
        return warehouse_log
            

In [7]:
def Warehouse():
    # create and return the environment for the reinforcement learning
    env = SupplyChain(beginning_budget=beginning_budget, closing_budget=closing_budget)
    
    # set holding cost and backlog cost
    env.mart_A.set_cost(hb_cost_A)
    env.mart_B.set_cost(hb_cost_B)
    return env

In [8]:
def random_delivery(delivery, pending):
    # return the number of deliveries from upstream and pending over
    # deliveries will be shipped randomly according to the circumstances of the stream
    delivery_new = np.random.randint(low=pending, high=delivery+pending+1)
    pending_new = delivery+pending-delivery_new
    return delivery_new, pending_new


def random_action(action, item_name, eps=0.1):
    # return a random action based on epsilon greedy algrithm
    # exploitation by 1-eps and exploration by eps
    p = np.random.random()
    if p < (1-eps):
        return action
    else:
        return np.random.choice(all_possible_actions[item_name])


def correct_action(action, item_name):
    # correct optimized action to the range of all posiible actions
    a = round(action, -1)
    if a <= 0:
        return 0
    elif a > all_possible_actions[item_name][-1]:
        return all_possible_actions[item_name][-1]
    else:
        return a


def get_incoming_order_stats(data):
    # get the mean and std of an item
    # exclude columns that causes errors with commas
    incoming_order_stats = {}
    df = pd.read_csv(data, error_bad_lines=False)
    for col in df.columns:
        c = item_dict[col.replace('Incoming_Order_', '')]
        mean, std = df[[col]].mean(), df[[col]].std()
        incoming_order_stats[c] = [mean[0], std[0]]
    return incoming_order_stats


def get_incoming_order(incoming_order_stats):
    # get the random customer order from normal distribution
    incoming_order = {}
    for k, v in incoming_order_stats.items():
        x = int(np.random.normal(v[0], v[1])) # mean, std
        incoming_order[k] = x
    return incoming_order


def initial_Q():
    # set initial Q values to 0 (Q-table)
    Q = np.zeros((len(all_states), len(all_possible_actions)))
    return Q


def read_Q(items):
    # read Q-table
    # Q values were stored seperately in the pickle file per item
    Q_dict = {}
    for item in items:
        filename = 'Q_A_{}.pickle'.format(item[0])
        with open(filename, 'rb') as file:
            Q = pickle.load(file)
            Q_dict[item[0]] = Q
    return Q_dict


def max_dict(d):
    # return the action which causes maximum value in certain state
    # action (argmax), value (max)
    max_key = np.argmax(d)
    max_value = np.max(d)
    return max_key, max_value

# exercise
arr = [0.3, 0.8, 1.1, 0.1]
max_dict(arr)

(2, 1.1)

In [9]:
# set initial Q values for all the states and actions
if train:
    Q_A = {items[0]: initial_Q() for items in items_A}
    Q_B = {items[0]: initial_Q() for items in items_B}
else:
    Q_A = read_Q(items_A)
    Q_B = read_Q(items_B)

# print(Q_A['orange'])
print(Q_A['orange'].shape)

(201, 5)


In [10]:
# updated_counts: to check how oftend Q(s) has been updated
# sa_count : state action count is used for apdaptive leaning rate
# set initial count to 1
warehouse = Warehouse()

sa_count_A, updated_counts_A = {}, {}
for name in warehouse.mart_A.item:
    sa_count_A[name], updated_counts_A[name] = {}, {}
    for s in all_states:
        sa_count_A[name][s] = {}
        for a in all_possible_actions[name]:
            sa_count_A[name][s][a] = 1
            
sa_count_B, updated_counts_B = {}, {}
for name in warehouse.mart_B.item:
    sa_count_B[name], updated_counts_B[name] = {}, {}
    for s in all_states:
        sa_count_B[name][s] = {}
        for a in all_possible_actions[name]:
            sa_count_B[name][s][a] = 1

# sample
print(sa_count_A['orange'][-100])
# print(sa_count_A['orange'][100])
print(updated_counts_A)

{0: 1, 10: 1, 20: 1, 30: 1, 40: 1, 50: 1, 60: 1, 70: 1, 80: 1}
{'orange': {}, 'egg': {}, 'hotdog': {}}


In [11]:
# get incoming order data with mean and std
if train:
    incoming_order_stats_A = get_incoming_order_stats(csv_A)
    incoming_order_stats_B = get_incoming_order_stats(csv_B)
else:
    incoming_order_stats_A = get_incoming_order_stats(csv_test_A)
    incoming_order_stats_B = get_incoming_order_stats(csv_test_B)

print(incoming_order_stats_A)
print(get_incoming_order(incoming_order_stats_A))

{'orange': [55.832432432432434, 25.95800585668644], 'egg': [55.31621621621622, 26.14524542580262], 'hotdog': [54.28058968058968, 26.608405933134595]}
{'orange': 68, 'egg': 51, 'hotdog': 71}


In [12]:
# generate dataframe
warehouse = Warehouse()

log = {'Episode': [], 'Week': [], 'Total_Cost': []}
for mart in warehouse.marts:
    s = 'State_{}'.format(mart.name)
    log.update({s:[]})
for mart in warehouse.marts:
    for name in mart.item:
        u_i = 'Current_Inventory_{}_{}'.format(mart.name, name)
        b_o = 'Backlog_Orders_{}_{}'.format(mart.name, name)
        i_d = 'Incoming_Delivery_{}_{}'.format(mart.name, name)
        o_d = 'Outgoing_Delivery_{}_{}'.format(mart.name, name)
        o_o = 'Outgoing_Order_{}_{}'.format(mart.name, name)
        i_o = 'Incoming_Order_{}_{}'.format(mart.name, name)
        c_i = 'Closing_Inventory_{}_{}'.format(mart.name, name)
        c_b = 'Closing_Backlog_{}_{}'.format(mart.name, name)
        p_d = 'Pending_Delivery_{}_{}'.format(mart.name, name)
        log.update({u_i: [], b_o: [], i_d: [], o_d: [], o_o: [], i_o: [], c_i: [], c_b: [], p_d: [],})
log = pd.DataFrame(log)

if train:
    warehouse_log = log.copy()
else:
    warehouse_log_test = log.copy()
    warehouse_log_test['Beginning_Budget'] = []
    warehouse_log_test['Closing_Budget'] = []

print(len(warehouse_log.columns))
print(len(warehouse.mart_A.data_log().keys())+len(warehouse.mart_B.data_log().keys())+3)

68
68


### Q-Learning

In [13]:
# n is for decaying epsilon
# count the number of over budget and updated action
start = time.time()
n, overbudget, updated = 1, 0, 0


# train or test for the number of episodes
for episode in tqdm(range(episodes)):
    
    # decaying epsilon for explore or exploit of choosing action
    if train:
        if episode%200 == 0:
            eps = 1/n
            n += 1
    else:
        eps = 0
        
    # initial environment setting when the episode is started
    warehouse = Warehouse()
    
    # get incoming order
    incoming_order_A = get_incoming_order(incoming_order_stats_A)
    incoming_order_B = get_incoming_order(incoming_order_stats_B)
    
    # update outgoing delivery and next inventory, backlog
    # inventory - order + demand
    warehouse.mart_A.update_state()
    warehouse.mart_B.update_state()
    
    # get current state of warehouse
    # inventory - backlog
    state_A = warehouse.mart_A.current_state()
    state_B = warehouse.mart_B.current_state()
    
    # get budget state
    if train is False:
        budget = warehouse.budget_state()
    
    # choose an action based on max Q value of current state
    a_A = {item_name: max_dict(Q_A[item_name][state_A[item_name]])[0] for item_name in warehouse.mart_A.item}
    a_B = {item_name: max_dict(Q_B[item_name][state_B[item_name]])[0] for item_name in warehouse.mart_B.item}
    
    # environment that changes over the week
    # loop until one episode is over
    while not warehouse.year_over():
        
        # get outgoing order and action for the current state        
        action_A = {item_name: random_action(a_A[item_name], item_name, eps) for item_name in warehouse.mart_A.item}
        action_B = {item_name: random_action(a_B[item_name], item_name, eps) for item_name in warehouse.mart_B.item}
        
        for item_name in warehouse.mart_A.item:
            warehouse.mart_A.outgoing_order[item_name][time_delay-1] = action_A[item_name]
        for item_name in warehouse.mart_B.item:
            warehouse.mart_B.outgoing_order[item_name][time_delay-1] = action_B[item_name]
        
        #### update action by budget
        
        # update the warehouse dataframe
        if train:
            warehouse_log = warehouse.update_data_log(warehouse_log)
        else:
            warehouse_log_test = warehouse.update_data_log(warehouse_log_test)
        
        
        # get cost of current week and the update next week values as current week
        r_A = warehouse.mart_A.clock_tick()
        r_B = warehouse.mart_B.clock_tick()
        
        # increment week to take the current state to next state or time period
        warehouse.week += 1
        
        # get incoming order
        incoming_order_A = get_incoming_order(incoming_order_stats_A)
        incoming_order_B = get_incoming_order(incoming_order_stats_B)
        
        # update outgoing delivery and next inventory, backlog
        # inventory - order + demand
        warehouse.mart_A.update_state()
        warehouse.mart_B.update_state()
        
        # update next week budget as current week
        if train is False:
            warehouse.clock_tick()
        
        # get next state of warehouse
        # inventory - backlog
        state1_A = warehouse.mart_A.current_state()
        state1_B = warehouse.mart_B.current_state()
        
        # get next budget state
        if train is False:
            budget1 = warehouse.budget_state()
        
        # get next action since Q(s, a) depends on Q(s', a')
        # if s' not in policy then it's a terminal state which means all Q values are 0
        # the difference between SARSA if that Q-Learning uses max[a']{Q(s', a')} in update
        # even it we do not end up taking this action in the next step
        a1_A = {item_name: max_dict(Q_A[item_name][state1_A[item_name]])[0] for item_name in warehouse.mart_A.item}
        a1_B = {item_name: max_dict(Q_B[item_name][state1_B[item_name]])[0] for item_name in warehouse.mart_B.item}
        
        max_Q1_A = {item_name: max_dict(Q_A[item_name][state1_A[item_name]])[1] for item_name in warehouse.mart_A.item}
        max_Q1_B = {item_name: max_dict(Q_B[item_name][state1_B[item_name]])[1] for item_name in warehouse.mart_B.item}
        
        
        # Q[s, a] = Q[s, a]+alpha*(r+gamma*max[a']){Q{s', a'}-Q[s, a]}
        # use alpha as adaptive learning rate like AdaGrad and RMSprop in DNN
        # in this way when epsilon decreases for each episode it may miss the states which have never occur
        # adaptive alpha will be high for such states and hence keeping the balance
        if train:
            for item_name in warehouse.mart_A.item:
                sa_count, s, action, a = sa_count_A[item_name], state_A[item_name], action_A[item_name], a_A[item_name]
                Q, r, max_Q1 = Q_A[item_name], r_A[item_name], max_Q1_A[item_name]
                
                sa_count[s][action] += 0.005
                Q[s][a] = Q[s][a]+(alpha/sa_count[s][action])*(r+gamma*max_Q1-Q[s][a])
                updated_counts_A[item_name][s] = updated_counts_A[item_name].get(s, 0)+1
            
            for item_name in warehouse.mart_B.item:
                sa_count, s, action, a = sa_count_B[item_name], state_B[item_name], action_B[item_name], a_B[item_name]
                Q, r, max_Q1 = Q_B[item_name], r_B[item_name], max_Q1_B[item_name]
                
                sa_count[s][action] += 0.005
                Q[s][a] = Q[s][a]+(alpha/sa_count[s][action])*(r+gamma*max_Q1-Q[s][a])
                updated_counts_B[item_name][s] = updated_counts_B[item_name].get(s, 0)+1
        
        
        # set next state as current state
        state_A, state_B = state1_A, state1_B
        
        # update next budget as current budget
        if train is False:
            budget = budget1
        
        # set next action as current action
        a_A, a_B = a1_A, a1_B


end = time.time()

100%|██████████| 50/50 [00:20<00:00,  2.41it/s]


In [14]:
# if train:
#     # determine the policy from Q*
#     # initialize policy, V
#     policy, V = {}, {}
#     for s in range(-n_bins, n_bins+1):
#         policy[s] = all_possible_actions[max_dict(Q[s])[0]]

#     # what's the proportion of time we spend updating each part of Q?
#     #print("update counts:")
#     total = np.sum(list(update_counts.values()))
#     for k in update_counts.keys():
#         update_counts[k] =  round(float(update_counts[k])*100 / total, 2)
#     #print(update_counts)
#     print('Total States Updated:', len(update_counts), 'out of:', len(states))
#     print('Action Space Size:', len(all_possible_actions)*len(states))
    
#     with open('policy.pickle', 'wb') as file:
#         pickle.dump(policy, file)
    
#     with open('Q_value.pickle', 'wb') as file:
#         pickle.dump(Q, file)

In [15]:
# check simulation time and save dataframe
print('Time:', end-start)

if train:
    warehouse_log.to_csv(output_DIR+'warehouse_log.csv')
else:
    print('over budget:', overbudget)
    print('updated action:', updated)
    warehouse_log_test.to_csv(output_DIR+'warehouse_log_test.csv')

Time: 20.767979860305786
