In [1]:
from dataclasses import dataclass
from rl.chapter3.simple_inventory_mdp_cap import InventoryState, SimpleInventoryMDPCap
from rl.distribution import Choose, Categorical, Constant
from rl.dynamic_programming import policy_iteration_result
from rl.markov_process import NonTerminal
from rl.markov_decision_process import FiniteMarkovDecisionProcess, MarkovDecisionProcess
from rl.policy import RandomPolicy, DeterministicPolicy, UniformPolicy
from typing import Dict, Tuple


import itertools
import matplotlib.pyplot as plt
import numpy as np

In [2]:
@dataclass
class DynamicLearningRate:
    """Decaying step-size schedule ``alpha / (1 + ((n - 1) / H) ** beta)``.

    Attributes:
        alpha: Numerator of the schedule (the step size when the decay term is 0).
        beta: Exponent applied to the scaled update count.
        H: Scale dividing the update count before exponentiation.
    """

    alpha: float
    beta: float
    H: int

    def evaluate(self, n: int):
        """Return the step size for update number ``n``.

        With a positive ``beta``, ``n = 1`` yields exactly ``alpha``.
        """
        scaled_count = (n - 1) / self.H
        denominator = 1 + scaled_count ** (self.beta)
        return self.alpha / denominator

class TabularMonteCarloGLIE:
    """Placeholder class: declares an ``mdp`` annotation and nothing else.

    No behaviour is implemented here; the notebook's training logic lives in
    module-level cells rather than in this class.
    """

    mdp: MarkovDecisionProcess

In [3]:
# Parameters forwarded to SimpleInventoryMDPCap below.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# The MDP instance shared by the later cells (policy construction, simulation
# and policy iteration all read `si_mdp`).
si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = SimpleInventoryMDPCap(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)


# Start-state distribution passed to `simulate_actions` in the training loop,
# built over all non-terminal states (presumably a uniform choice -- confirm
# against rl.distribution.Choose).
init_dist = Choose(si_mdp.non_terminal_states)


In [4]:
def assemble_eps_policy(
    k: int,
    Q_value_func: Dict[Tuple[NonTerminal[InventoryState], int], float],
    mdp: MarkovDecisionProcess,
) -> RandomPolicy:
    """Build the epsilon-greedy policy for iteration ``k`` with ``eps = 1 / k``.

    The result mixes two policies through a ``Categorical`` distribution:
    with weight ``1 - eps`` the action that maximises ``Q_value_func`` in the
    current state, and with weight ``eps`` a ``UniformPolicy`` over the
    actions ``mdp`` allows in that state.

    Args:
        k: Iteration index, must be >= 1 (``k = 1`` is fully exploratory).
        Q_value_func: Action-value table keyed by ``(NonTerminal state, action)``;
            it must hold an entry for every action of every non-terminal state.
        mdp: The process supplying ``non_terminal_states`` and ``actions``.
            Only finite MDPs expose ``non_terminal_states``.

    Returns:
        A ``RandomPolicy`` that samples one of the two policies per decision.

    Raises:
        ValueError: If ``k`` is smaller than 1 (``eps`` would leave ``[0, 1]``).
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    eps = 1 / float(k)

    # Use the `mdp` argument (not a notebook global) so the function works for
    # whichever process the caller passes in. The policy callbacks are handed
    # the unwrapped state, hence the re-wrapping in NonTerminal.
    unif_policy = UniformPolicy(lambda s: mdp.actions(NonTerminal(s)))

    # Greedy action per state. `max` returns the first maximiser, matching the
    # tie-breaking of np.argmax, and keeps the action in its native type.
    argmax_policy_mapping = {}
    for state in mdp.non_terminal_states:
        argmax_policy_mapping[state] = max(
            mdp.actions(state),
            key=lambda action: Q_value_func[(state, action)],
        )
    argmax_policy = DeterministicPolicy(lambda s: argmax_policy_mapping[NonTerminal(s)])

    policy_mapping = Categorical({argmax_policy: 1 - eps, unif_policy: eps})
    return RandomPolicy(policy_mapping)


In [13]:
# Initial iteration index; the policy built for iteration k uses eps = 1 / k,
# and iterations run while k < k_lim.
k = 1

# Truncation tolerance and discount factor.
tol = 1.0e-10
gamma = 0.9

# Smallest step count n with gamma ** n <= tol (for 0 < gamma < 1): rewards
# further than this from an update contribute less than `tol` of their weight.
# Note: `.astype(int)` yields a NumPy integer, not a built-in int.
num_steps: int = np.ceil(np.log(tol) / np.log(gamma)).astype(int)
# Number of leading steps of each trace whose returns are used for updates.
num_updates = 4_000

# Each simulated trace is cut to this length, so every updated step has at
# least `num_steps` later rewards behind it.
trace_length = num_updates + num_steps
k_lim = 200

# Step-size schedule evaluated on the per-(state, action) visit count.
dlr = DynamicLearningRate(alpha = 0.03, beta = 0.5, H = 1_000)

In [14]:
# Tabular action-value estimates, reset to zero for every (state, action) pair
# each time this cell runs.
Q_value_func = {
    (state, action): 0.0
    for state in si_mdp.non_terminal_states
    for action in si_mdp.actions(state)
}

# Visit count per (state, action); feeds the decaying step size `dlr`.
state_action_counter = {
    (state, action): 0
    for state in si_mdp.non_terminal_states
    for action in si_mdp.actions(state)
}


# The loop owns its counter. Relying on a `k` initialised in another cell made
# this cell non-idempotent: a re-run reset the tables above but skipped
# training because `k` was already `k_lim`.
for k in range(1, k_lim):

    # Epsilon-greedy policy for this iteration (eps = 1 / k), then one
    # simulated trace truncated to `trace_length` steps.
    pol = assemble_eps_policy(k=k, Q_value_func=Q_value_func, mdp=si_mdp)
    trace = si_mdp.simulate_actions(start_states=init_dist, policy=pol)

    trace_vector = list(itertools.islice(trace, trace_length))
    reward_vector = np.array([s.reward for s in trace_vector])
    state_action_vector = [(s.state, s.action) for s in trace_vector]

    # Discounted return from each step via the backward recursion
    # G[t] = r[t] + gamma * G[t + 1]. This equals the previous
    # "multiply by gamma ** t, reverse-cumsum, divide by gamma ** t"
    # formula but cannot underflow to 0 / inf for long traces or small gamma.
    G = [0.0] * len(trace_vector)
    tail_return = 0.0
    for t in range(len(trace_vector) - 1, -1, -1):
        tail_return = reward_vector[t] + gamma * tail_return
        G[t] = tail_return

    # Every-visit update on the first `num_updates` steps only; the remaining
    # steps exist solely to supply the tail of those returns.
    for idx, state_action in enumerate(state_action_vector[:num_updates]):
        state_action_counter[state_action] += 1
        step_size = dlr.evaluate(state_action_counter[state_action])
        Q_value_func[state_action] += step_size * (
            G[idx] - Q_value_func[state_action]
        )


In [15]:
# Display the action-value table produced by the training loop, keyed by
# (state, action).
Q_value_func


{(NonTerminal(state=InventoryState(on_hand=0, on_order=0)),
  0): -41.76730144415136,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=0)),
  1): -34.864994559823174,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=0)),
  2): -35.80445346068443,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=1)),
  0): -31.97826805620582,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=1)),
  1): -27.427378392670715,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=2)),
  0): -28.463053851294546,
 (NonTerminal(state=InventoryState(on_hand=1, on_order=0)),
  0): -32.94571005887549,
 (NonTerminal(state=InventoryState(on_hand=1, on_order=0)),
  1): -28.635090707475776,
 (NonTerminal(state=InventoryState(on_hand=1, on_order=1)),
  0): -28.84214548415869,
 (NonTerminal(state=InventoryState(on_hand=2, on_order=0)),
  0): -29.69701922603516}

In [8]:
# Reference solution from policy iteration on the same MDP and discount
# factor -- presumably the baseline to compare the learned Q_value_func against.
opt_vf_pi, opt_policy_pi = policy_iteration_result(
    si_mdp,
    gamma=gamma
)

print(opt_policy_pi)

For State InventoryState(on_hand=0, on_order=0): Do Action 1
For State InventoryState(on_hand=0, on_order=1): Do Action 1
For State InventoryState(on_hand=0, on_order=2): Do Action 0
For State InventoryState(on_hand=1, on_order=0): Do Action 1
For State InventoryState(on_hand=1, on_order=1): Do Action 0
For State InventoryState(on_hand=2, on_order=0): Do Action 0



In [9]:
# Display the value function returned by policy iteration, keyed by state.
opt_vf_pi

{NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.894855781630035,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.660960231637507,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.991900091403533,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.660960231637507,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.991900091403533,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.991900091403533}