In [1]:
from dataclasses import dataclass
from rl.chapter3.simple_inventory_mdp_cap import InventoryState, SimpleInventoryMDPCap
from rl.distribution import Choose, Categorical, Constant, Distribution
from rl.dynamic_programming import policy_iteration_result
from rl.markov_process import NonTerminal
from rl.markov_decision_process import FiniteMarkovDecisionProcess, MarkovDecisionProcess
from rl.policy import RandomPolicy, DeterministicPolicy, UniformPolicy
from typing import Dict, Tuple, TypeVar


import itertools
import matplotlib.pyplot as plt
import numpy as np

# Generic type parameters used in the annotations below:
# S parameterizes states (e.g. Distribution[S], NonTerminal[S]) and
# A parameterizes actions (e.g. MarkovDecisionProcess[S, A]).
S = TypeVar('S')
A = TypeVar('A')

In [2]:
@dataclass
class DynamicLearningRate:
    """Decaying step-size schedule ``alpha / (1 + ((n - 1) / H) ** beta)``.

    Attributes:
        alpha: Step size returned for the first update (``n == 1``).
        beta: Exponent applied to the scaled update count ``(n - 1) / H``.
        H: Scale of the update count; at ``n == H + 1`` the step size
            is ``alpha / 2``.
    """

    alpha: float
    beta: float
    H: int

    def evaluate(self, n: int) -> float:
        """Return the step size for the ``n``-th update (1-based).

        Args:
            n: Update count, starting at 1.

        Returns:
            ``alpha / (1 + ((n - 1) / H) ** beta)``.

        Raises:
            ValueError: If ``n < 1``. A non-positive count makes the base of
                the power negative, which for a fractional ``beta`` would
                silently produce a complex number instead of a step size.
        """
        if n < 1:
            raise ValueError(f"n must be a positive update count, got {n}")
        return self.alpha / (1 + ((n - 1) / self.H) ** self.beta)


@dataclass
class TabularQLearning:
    """Tabular Q-learning whose behaviour policy is a ``UniformPolicy``
    over the actions the MDP offers in each state.

    Attributes:
        gamma: Discount factor applied to the bootstrapped next-state value.
        mdp: Process to learn on. The tables are built by enumerating
            ``mdp.non_terminal_states`` and ``mdp.actions(state)``.
        init_state_distribution: Distribution the starting state is
            sampled from (sampled once per ``Q_learning`` call).
        dlr: Step-size schedule, evaluated on the per-(state, action)
            visit count.
    """

    gamma: float
    # Annotated with the general MDP type, but ``Q_learning`` enumerates
    # ``non_terminal_states``, so a finite MDP is required in practice.
    mdp: MarkovDecisionProcess[S, A]
    init_state_distribution: Distribution[S]
    dlr: DynamicLearningRate

    def Q_learning(self, n_steps: int = 10_000):
        """Run Q-learning along one sampled trajectory, yielding the Q-table.

        The generator yields the table *before* each update, so the first
        item is the all-zero table. The very same dict object is yielded
        every time and mutated in place: copy it if a snapshot is needed.
        Once the generator is exhausted, the last yielded object also
        contains the final update.

        Iteration stops after ``n_steps`` updates or as soon as the
        trajectory reaches a state that is not a ``NonTerminal``.

        Args:
            n_steps: Maximum number of updates to perform.

        Yields:
            Dict mapping ``(state, action)`` to the current Q-value estimate.
        """
        mu_policy = UniformPolicy(lambda s: self.mdp.actions(NonTerminal(s)))

        Q_value_func = {
            (state, action): 0.0
            for state in self.mdp.non_terminal_states
            for action in self.mdp.actions(state)
        }

        # Visit counts share the Q-table's keys, so they always describe
        # ``self.mdp`` and never a module-level MDP that happens to exist.
        state_action_counter = dict.fromkeys(Q_value_func, 0)

        state: NonTerminal[S] = self.init_state_distribution.sample()
        step: int = 0
        while isinstance(state, NonTerminal) and step < n_steps:
            yield Q_value_func
            action = mu_policy.act(state).sample()
            key = (state, action)

            # Increment first: the schedule is 1-based in the visit count.
            state_action_counter[key] += 1
            next_state, reward = self.mdp.step(state, action).sample()

            if isinstance(next_state, NonTerminal):
                max_q_return = max(
                    Q_value_func[(next_state, a)]
                    for a in self.mdp.actions(next_state)
                )
            else:
                # Terminal successors have no entries in the table; there is
                # nothing to bootstrap from, so their value is taken as 0.
                max_q_return = 0.0

            learning_rate = self.dlr.evaluate(state_action_counter[key])
            Q_value_func[key] += learning_rate * (
                reward + self.gamma * max_q_return - Q_value_func[key]
            )
            step += 1
            state = next_state


In [3]:
# Parameters of the capacity-constrained inventory problem.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = SimpleInventoryMDPCap(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)

# The starting state is drawn by `Choose` from the MDP's non-terminal states.
init_dist = Choose(si_mdp.non_terminal_states)

# Step-size schedule handed to the learner.
dlr = DynamicLearningRate(alpha=0.03, beta=0.5, H=10_000)

tql = TabularQLearning(
    gamma=0.9,
    mdp=si_mdp,
    init_state_distribution=init_dist,
    dlr=dlr,
)

# Drain the generator; `q_func` stays bound to the last yielded table.
for q_func in tql.Q_learning(n_steps=100_000):
    pass

q_func

{(NonTerminal(state=InventoryState(on_hand=0, on_order=0)),
  0): -41.25969540088942,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=0)),
  1): -34.719251536030235,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=0)),
  2): -34.97607893474551,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=1)),
  0): -31.17706975885415,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=1)),
  1): -27.42196901248525,
 (NonTerminal(state=InventoryState(on_hand=0, on_order=2)),
  0): -27.887500023927497,
 (NonTerminal(state=InventoryState(on_hand=1, on_order=0)),
  0): -32.88645535319848,
 (NonTerminal(state=InventoryState(on_hand=1, on_order=0)),
  1): -28.590209779474293,
 (NonTerminal(state=InventoryState(on_hand=1, on_order=1)),
  0): -28.992961832671966,
 (NonTerminal(state=InventoryState(on_hand=2, on_order=0)),
  0): -29.841091248318016}