In [1]:
# This notebook explores the optimal steady-state reinforcement-learning policy under a fixed Poisson demand,
# using Markov chain properties and value iteration.
# The result is used as a benchmark to see how well our actor-critic agent performs.

In [2]:
from dual_sourcing_game import DualSourcing
import numpy as np
import matplotlib.pyplot as plt


Matplotlib created a temporary config/cache directory at /var/folders/_x/fh3t8wcj3_xbs4fddcxmbpc00000gn/T/matplotlib-2akz9u6e because the default path (/Users/hansshen/.matplotlib) is not a writable directory; it is highly recommended to set the MPLCONFIGDIR environment variable to a writable directory, in particular to speed up the import of Matplotlib and to better support multiprocessing.


In [13]:
class value_iter:
    """State-space scaffolding for value iteration on a dual-sourcing environment.

    A state is written ``[inventory, reg_pipeline, exp_pipeline]`` where
    ``inventory`` lies in ``[-max_inventory, max_inventory]`` and the two
    pipelines are lists of length ``env.rl`` and ``env.el`` whose entries lie in
    ``[0, max_order]``.  States are numbered by reading
    ``(inventory + max_inventory, *reg_pipeline, *exp_pipeline)`` as the digits
    of a mixed-radix number (pipeline digits in base ``max_order + 1``), so
    index 0 is ``[-max_inventory, [0]*rl, [0]*el]`` and index 1 is
    ``[-max_inventory, [0]*rl, [0]*(el-1) + [1]]``.

    The environment object must expose ``max_inventory``, ``max_order``, ``rl``,
    ``el``, ``poisson_lambda`` and ``fixed_demand``.

    Attributes:
        env: the environment passed in.
        lr: the ``lr`` argument, stored unchanged.
        state_space_dim: number of distinct states.
        action_space_dim: ``state_space_dim * (max_order + 1) ** 2``.
        markov_matrix: ``state_space_dim`` x ``action_space_dim`` list of lists,
            initialised to 0.
        demand: list of ``[probability, demand]`` pairs covering the Poisson
            demand distribution up to the truncation tolerance.
    """

    def __init__(self, dual_sourcing_env, lr, delta_prob_matrix=0.01, delta_val=0.1):
        """Size the state/action spaces and tabulate the truncated demand distribution.

        Args:
            dual_sourcing_env: environment exposing the attributes listed on the class.
            lr: stored as ``self.lr``.
            delta_prob_matrix: maximum probability mass allowed to be left out of
                the tabulated demand distribution; must be positive.
            delta_val: accepted for interface compatibility; currently unused.

        Raises:
            ValueError: if ``delta_prob_matrix`` is not positive (the truncation
                loop could otherwise never reach its stopping condition).
        """
        if delta_prob_matrix <= 0:
            raise ValueError("delta_prob_matrix must be positive")
        self.env = dual_sourcing_env
        self.lr = lr

        base = self.env.max_order + 1
        n_slots = self.env.rl + self.env.el
        self.state_space_dim = (self.env.max_inventory * 2 + 1) * base ** n_slots
        self.action_space_dim = self.state_space_dim * base ** 2
        # NOTE(review): this dense table grows as state_space_dim**2 * base**2
        # entries; for large configurations it may not fit in memory.
        self.markov_matrix = [[0] * self.action_space_dim for _ in range(self.state_space_dim)]

        # Tabulate P(X = k) for X ~ Poisson(poisson_lambda) until the omitted
        # tail mass is at most delta_prob_matrix.  The pmf is advanced with the
        # recurrence P(k+1) = P(k) * lambda / (k + 1), which avoids factorials
        # (np.math is no longer available in NumPy >= 2.0).
        lam = self.env.poisson_lambda
        demand = []
        total_prob = 0
        X = 0
        prob = np.exp(-lam)
        # `prob > 0` stops the loop if the pmf term underflows to zero, since
        # no further progress towards the tolerance is possible after that.
        while 1 - total_prob > delta_prob_matrix and prob > 0:
            demand.append([prob, self.env.fixed_demand + X])
            total_prob += prob
            X += 1
            prob = prob * lam / X
        self.demand = demand

    # Auxiliary functions: converting between an index in the state space and
    # the [inventory, reg pipeline vector, exp pipeline vector] format.
    def state_index(self, state):
        """Return the integer index of ``state`` (``[inv, reg_orders, exp_orders]``)."""
        base = self.env.max_order + 1
        digits = [state[1][i] for i in range(self.env.rl)]
        digits += [state[2][i] for i in range(self.env.el)]
        # Horner evaluation of the mixed-radix number, most significant digit first.
        index = state[0] + self.env.max_inventory
        for digit in digits:
            index = index * base + digit
        return index

    def state_representation(self, index):
        """Return ``[inv, reg_orders, exp_orders]`` for a state index (inverse of ``state_index``)."""
        base = self.env.max_order + 1
        n_slots = self.env.rl + self.env.el
        inv_digit, remainder = divmod(index, base ** n_slots)
        inv = inv_digit - self.env.max_inventory
        digits = []
        for power in range(n_slots - 1, -1, -1):
            digit, remainder = divmod(remainder, base ** power)
            digits.append(digit)
        return [inv, digits[:self.env.rl], digits[self.env.rl:]]

    def generate_state(self):
        """Yield every state as ``[inv, reg_orders, exp_orders]``, in increasing index order."""
        rl = self.env.rl
        choices = [list(range(-self.env.max_inventory, self.env.max_inventory + 1))]
        choices += [list(range(self.env.max_order + 1)) for _ in range(rl + self.env.el)]
        for flat in self.generate_state_driver([], choices):
            yield [flat[0], flat[1:1 + rl], flat[1 + rl:]]

    def generate_state_driver(self, cur, data):
        """Yield each combination of one element per list in ``data``, followed by ``cur``.

        Combinations are flat lists produced in lexicographic order of ``data``;
        with empty ``data`` the only item yielded is ``cur`` itself.
        """
        if not data:
            yield cur
        else:
            for elem in data[0]:
                for rest in self.generate_state_driver(cur, data[1:]):
                    yield [elem] + rest
            

In [11]:
# Environment settings handed to DualSourcing; the pipeline lists inside
# 'starting_state' have lengths 3 and 1, the same numbers as the two lead times.
config = {
    'regular_leadtime': 3,
    'express_leadtime': 1,
    'regular_cost': 3,
    'express_cost': 4,
    'max_order': 3,
    'max_inventory': 10,
    'store_cost': 2.6,
    'back_cost': 5,
    'y': 0.95,
    'starting_state': [5, [0] * 3, [0] * 1],
}
game = DualSourcing(config, episode_len=5000)
# Build the value-iteration helper on this environment with lr=1.
v = value_iter(game, 1)