In [205]:
# =====> MDP file


from gym import spaces
import numpy as np

capacity = 50  # maximum capacity of the warehouse (units)
transport_cost = 5  # constant part of the order cost (e.g. fuel), charged once per order
variable_cost = 4  # variable part of the order cost, charged per unit ordered
holding_cost = 0.0025  # holding cost per unit of inventory
selling_price = 4.2  # per-unit selling price, including the 5% profit
return_cost = transport_cost  # flat fee charged when an order overflows the warehouse


# Poisson demand rate for each weekday.
lamda_mon = 16
lamda_tue = 31
lamda_wed = 15
lamda_thu = 32
lamda_fri = 30
lamda_sat = 8
lamda_sun = 42

# Weekday index (0 = Monday ... 6 = Sunday) -> demand rate.
day_mapping = {
    0: lamda_mon,
    1: lamda_tue,
    2: lamda_wed,
    3: lamda_thu,
    4: lamda_fri,
    5: lamda_sat,
    6: lamda_sun
}
# Weekday index -> short weekday label (index 4 was misspelled 'fir').
daily_mapping = {0: 'mon', 1: 'tue', 2: 'wed', 3: 'thu', 4: 'fri', 5: 'sat', 6: 'sun'}


In [206]:
# Scratch check: the built-in min() accepts several positional arguments.
min(3, 5, 7)

3

In [245]:
class InventoryEnv():
    """Warehouse inventory environment used for tabular Q-learning.

    A state is the tuple ``(inventory, day)``: ``inventory`` is the stock on
    hand (0..capacity) and ``day`` is a weekday index (0..6, the keys of
    ``day_mapping``). An action is the number of units ordered (0..capacity).
    """

    def __init__(self):
        """Build the action space and draw a uniformly random start state."""
        self.action_space = spaces.Discrete(capacity + 1)
        self.inventory = np.random.choice(np.arange(0, capacity + 1))
        self.day = np.random.choice((0, 1, 2, 3, 4, 5, 6))
        self.state = (self.inventory, self.day)
        self.reset()

    @staticmethod
    def _next_day(day):
        """Return the weekday index following ``day`` (6 wraps around to 0)."""
        return day + 1 if day < 6 else 0

    def demand(self, day):
        """Sample a Poisson demand whose rate is looked up in ``day_mapping``."""
        return np.random.poisson(day_mapping.get(day))

    def transition(self, current_state, action_taken, demand):
        """Return the state reached from ``current_state``.

        Sales are served from the current stock first (never below zero), then
        the ordered units arrive; anything above ``capacity`` is dropped.
        """
        stock_after_sales = max(current_state[0] - demand, 0)
        # Stock at end of day, after sales and after the order has arrived.
        stock_eod = min(stock_after_sales + action_taken, capacity)
        return (stock_eod, self._next_day(current_state[1]))

    def reward(self, inventory, action_taken, demand):
        """Return the one-step reward.

        reward = sales income - order cost - holding cost - opportunity cost
                 - return cost + money back for returned units

        Args:
            inventory: stock on hand before sales.
            action_taken: number of units ordered.
            demand: realised demand.
        """
        # Income: only what is in stock can be sold.
        expected_income = selling_price * min(inventory, demand)

        # Order cost: fixed transport fee (only if something is ordered) plus
        # the per-unit cost.
        fixed_fee = transport_cost if action_taken > 0 else 0
        order_cost = fixed_fee + variable_cost * action_taken

        # Holding cost scales with the stock held.
        total_holding_cost = holding_cost * inventory

        # Opportunity cost: profit lost on demand that could not be served.
        unit_profit = selling_price - variable_cost
        opportunity_cost = unit_profit * max(demand - inventory, 0)

        # Units that do not fit in the warehouse are sent back: a flat return
        # fee is charged and their purchase cost is refunded. The stock after
        # sales is clamped at zero, as in transition().
        stock_after_sales = max(inventory - demand, 0)
        overflow = max(stock_after_sales + action_taken - capacity, 0)
        total_return_cost = return_cost if overflow > 0 else 0
        money_back = variable_cost * overflow

        # The original expression subtracted the per-unit constant
        # `holding_cost` here instead of the computed total.
        return (expected_income - order_cost - total_holding_cost -
                opportunity_cost - total_return_cost + money_back)

    def initial_step(self, current_state, action):
        """Advance one day from ``current_state`` without computing a reward.

        Returns:
            The next state ``(inventory, day)``.
        """
        # The action must be a valid member of the discrete action space.
        assert self.action_space.contains(action)
        # Demand is drawn for the day being entered, not the current one.
        demand = self.demand(self._next_day(current_state[1]))
        return self.transition(current_state, action, demand)

    def step(self, current_state, action):
        """Advance one day from ``current_state``.

        Returns:
            ``(next_state, reward)``.
        """
        assert self.action_space.contains(action)
        # Demand is drawn for the day being entered, not the current one.
        next_day_demand = self.demand(self._next_day(current_state[1]))
        next_state = self.transition(current_state, action, next_day_demand)
        reward = self.reward(current_state[0], action, next_day_demand)
        return next_state, reward

    def reset(self):
        """Return the start state drawn in ``__init__``.

        This does not re-sample; create a new ``InventoryEnv`` to get a fresh
        random start state.
        """
        return self.state

# Q Learning using MDP above:

In [246]:
import collections
import pickle
import numpy as np
import pandas as pd
import random

In [247]:
# Q-table: state -> {action: Q-value}.
Q_dict = collections.defaultdict(dict)
# Q-value history of a few sampled (state, action) pairs: state -> {action: [values]}.
States_track = collections.defaultdict(dict)
# Total episode reward recorded for a fixed set of start states.
tracked_start_states = [(15, 0), (25, 1), (20, 2), (30, 3), (35, 4), (10, 5), (50, 6)]
rewards_tracked = {start_state: [] for start_state in tracked_start_states}

# Report the size of each container.
for container in (Q_dict, rewards_tracked, States_track):
    print(len(container))

0
7
0


In [248]:
def _load_checkpoint(path):
    """Unpickle and return the object stored at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # checkpoint files produced by this notebook.
    with open(path, 'rb') as checkpoint_file:
        return pickle.load(checkpoint_file)


# Replace the empty containers with the previously saved training state.
Q_dict = _load_checkpoint('Results_15million/Policy.pkl')
rewards_tracked = _load_checkpoint('Results_15million/Rewards.pkl')
States_track = _load_checkpoint('Results_15million/States_tracked.pkl')

# Report the size of each restored container.
for restored in (Q_dict, rewards_tracked, States_track):
    print(len(restored))

357
7
7


In [249]:
def valid_actions(state):
    """Return the list of order quantities available in `state`.

    The action set is the same for every state: all integers from 0 to
    `capacity` inclusive. `state` is accepted for interface compatibility and
    is not inspected.

    Returns:
        A new list ``[0, 1, ..., capacity]``.
    """
    return list(range(capacity + 1))

In [250]:
def add_to_dict(state, valid_act):
    """Register `state` in the Q-table with a Q-value of 0.0 for each action.

    States that are already present are left untouched.
    """
    if state in Q_dict:
        return
    Q_dict[state] = {candidate: 0.0 for candidate in valid_act}

In [251]:
def save_obj(obj, name):
    """Pickle `obj` to the file '<name>.pkl' with the highest pickle protocol."""
    target_path = name + '.pkl'
    with open(target_path, 'wb') as out_file:
        pickle.dump(obj, out_file, pickle.HIGHEST_PROTOCOL)

In [252]:
def initialise_tracking_states():
    """Start an empty Q-value history for a fixed sample of (state, action) pairs.

    Each entry of `sample_q_values` is ``((inventory, day), action)``. For
    every pair, ``States_track[state][action]`` is set to a new empty list,
    replacing any history already stored there.
    """
    # Q values ((current_state_inventory, day), action)
    sample_q_values = [((15, 0), 12), ((25, 1), 25), ((20, 2), 10),
                       ((30, 3), 30), ((35, 4), 15), ((10, 5), 18),
                       ((50, 6), 10)]
    for state, action in sample_q_values:
        # setdefault keeps this working when States_track is a plain dict
        # rather than a defaultdict.
        States_track.setdefault(state, {})[action] = []


# The function was originally defined under this misspelled name; keep it as
# an alias so existing references continue to work.
initiali_tracking_states = initialise_tracking_states

In [253]:
def save_tracking_states():
    """Append the current Q-value of each tracked (state, action) pair to its history.

    Pairs that are not present in the Q-table are skipped.
    """
    for tracked_state, history_by_action in States_track.items():
        if tracked_state not in Q_dict:
            continue
        q_row = Q_dict[tracked_state]
        for tracked_action, history in history_by_action.items():
            if tracked_action in q_row:
                history.append(q_row[tracked_action])

In [254]:
def epsilon_greedy(state, time, midpoint=7500000, scale=1700000):
    """Choose an action for `state` with an epsilon-greedy policy.

    The exploration probability follows a decreasing logistic curve in `time`:
    ``epsilon = 1 - 1 / (1 + exp((midpoint - time) / scale))``, which equals
    0.5 at ``time == midpoint``.

    Args:
        state: key into `Q_dict`; it must already be present in the table.
        time: training progress counter driving the decay of epsilon.
        midpoint: value of `time` at which epsilon is 0.5.
        scale: width of the decay; larger values decay more slowly.

    Returns:
        The action with the highest Q-value (exploitation), or a uniformly
        random action in 0..capacity (exploration).
    """
    epsilon = 1 - 1 / (1 + np.exp((midpoint - time) / scale))
    z = np.random.random()
    if z > epsilon:
        # Exploitation: best known action for this state.
        action = max(Q_dict[state], key=Q_dict[state].get)
    else:
        # Exploration: uniformly random order quantity.
        action = np.random.choice(np.arange(0, capacity + 1))
    return action

### Training

In [255]:
EPISODES = 15 * 10 ** 6     # upper bound of the episode range used for training
STEPS = 2                   # learning steps taken in each episode
# STEPS = 30                # alternative: 30 steps, i.e. 30 days
LR = 0.01                   # learning rate of the Q-update
GAMMA = 0.91                # weight of the next state's best Q-value in the Q-update

threshold = 2 * 10 ** 3           # episodes between saves of rewards and tracked Q-values
policy_threshold = 30 * 10 ** 3   # episodes between saves of the Q-table

In [256]:
# Episode index at which training resumes.
# NOTE(review): presumably matches the checkpoint loaded earlier in the
# notebook -- confirm before re-running.
START_EPISODE = 6393884

for episode in range(START_EPISODE, EPISODES):
    # A new environment per episode: its constructor draws a random start state.
    env = InventoryEnv()
    initial_state = env.state
    add_to_dict(initial_state, valid_actions(initial_state))

    # The first move is uniformly random and gets no Q-update; it only moves
    # the environment to the state the learning steps start from.
    reward = None
    curr_action = np.random.choice(np.arange(0, capacity + 1))
    curr_state = env.initial_step(initial_state, curr_action)
    add_to_dict(curr_state, valid_actions(curr_state))

    total_reward = 0
    for time_step in range(STEPS):
        # Exploitation vs exploration; epsilon is driven by the episode index.
        curr_action = epsilon_greedy(curr_state, episode)
        next_state, reward = env.step(curr_state, curr_action)
        add_to_dict(next_state, valid_actions(next_state))

        # Q-learning update towards reward + GAMMA * best next Q-value.
        best_next_q = max(Q_dict[next_state].values())
        td_target = reward + (GAMMA * best_next_q)
        Q_dict[curr_state][curr_action] += LR * (
            td_target - Q_dict[curr_state][curr_action])

        total_reward += reward
        curr_state = next_state

    # Record the episode's total reward if it began in a tracked start state.
    if initial_state in rewards_tracked:
        rewards_tracked[initial_state].append(total_reward)

    # Set up the tracked (state, action) histories once, at episode threshold - 1.
    if episode == threshold - 1:
        initialise_tracking_states()

    # Every `threshold` episodes: persist rewards and the tracked Q-values.
    if (episode + 1) % threshold == 0:
        save_obj(rewards_tracked, 'Rewards')
        save_tracking_states()
        save_obj(States_track, 'States_tracked')

    # Every `policy_threshold` episodes: persist the Q-table itself.
    if (episode + 1) % policy_threshold == 0:
        save_obj(Q_dict, 'Policy')

# Final checkpoint once training has finished.
save_obj(rewards_tracked, 'Rewards')
save_obj(States_track, 'States_tracked')
save_obj(Q_dict, 'Policy')

In [257]:
# Write the rewards, the tracked Q-value histories and the Q-table to disk.
for checkpoint_obj, checkpoint_name in ((rewards_tracked, 'Rewards'),
                                        (States_track, 'States_tracked'),
                                        (Q_dict, 'Policy')):
    save_obj(checkpoint_obj, checkpoint_name)

In [183]:
# Sanity check: list the actions returned for the state (30, 3).
valid_actions((30, 3))

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50]