In [1]:
from dataclasses import dataclass
from typing import Tuple, Dict, Mapping, Tuple
from rl.dynamic_programming import policy_iteration_result
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.policy import FiniteDeterministicPolicy
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical
from scipy.stats import poisson


In [2]:
@dataclass(frozen=True)
class InventoryState:
    on_hand_1: int
    on_order_1: int

    on_hand_2: int
    on_order_2: int

    def inventory_position(self) -> Tuple[int, int]:
        return (self.on_hand_1 + self.on_order_1, self.on_hand_2 + self.on_order_2)

    def ip1(self) -> int:
        return self.on_hand_1 + self.on_order_1

    def ip2(self) -> int:
        return self.on_hand_2 + self.on_order_2


InvOrderMapping = Mapping[
    InventoryState, Mapping[int, Categorical[Tuple[InventoryState, float]]]
]


class DoubleInventoryMDPCap(FiniteMarkovDecisionProcess[InventoryState, int]):
    def __init__(
        self,
        capacities: Tuple[int, int],
        poisson_lambdas: Tuple[float, float],
        holding_costs: Tuple[float, float],
        stockout_costs: Tuple[float, float],
        order_cost: float,
        transfer_cost: float,
    ):
        self.capacities: Tuple[int, int] = capacities
        self.holding_costs: Tuple[float, float] = holding_costs
        self.stockout_costs: Tuple[float, float] = stockout_costs
        self.order_cost: float = order_cost
        self.transfer_cost: float = transfer_cost

        self.poisson_lambdas = poisson_lambdas
        self.poisson_distr_1 = poisson(self.poisson_lambdas[0])
        self.poisson_distr_2 = poisson(self.poisson_lambdas[1])
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[
            InventoryState,
            Dict[Tuple[int, int], Categorical[Tuple[InventoryState, float]]],
        ] = {}

        for alpha1 in range(self.capacities[0] + 1):
            for beta1 in range(self.capacities[0] + 1 - alpha1):
                for alpha2 in range(self.capacities[1] + 1):
                    for beta2 in range(self.capacities[1] - alpha2):
                        state: InventoryState = InventoryState(
                            alpha1, beta1, alpha2, beta2
                        )

                        ip: Tuple[int, int] = state.inventory_position()

                        base_reward: float = (
                            -self.holding_costs[0] * alpha1
                            - self.holding_costs[1] * alpha2
                        )
                        # will be held overnight
                        d1: Dict[
                            Tuple[int, int], Categorical[Tuple[InventoryState, float]]
                        ] = {}

                        # transfer refers to the amount transferred from the perspective of shop 1
                        for transfer in range(-alpha1, alpha2 + 1):

                            for order1 in range(
                                self.capacities[0]
                                - state.inventory_position()[0]
                                - transfer
                                + 1
                            ):
                                for order2 in range(
                                    self.capacities[1]
                                    - state.inventory_position()[1]
                                    + transfer
                                    + 1
                                ):
                                    running_reward = (
                                        base_reward
                                        - transfer_cost * (transfer != 0)
                                        - order_cost * (order1 != 0 + order2 != 0)
                                    )

                                    sr_probs_dict: Dict[
                                        Tuple[InventoryState, float], float
                                    ] = {
                                        (
                                            InventoryState(
                                                ip[0] + transfer - i,
                                                order1,
                                                ip[1] - transfer - j,
                                                order2,
                                            ),
                                            running_reward,
                                        ): self.poisson_distr_1.pmf(i)
                                        * self.poisson_distr_2.pmf(j)
                                        for i in range(ip[0] - transfer)
                                        for j in range(ip[1] + transfer)
                                    }

                                    #### Covering cases of stockout ####
                                    stockout_prob_1: float = (
                                        1
                                        - self.poisson_distr_1.cdf(ip[0] - transfer - 1)
                                    )
                                    stockout_prob_2: float = (
                                        1
                                        - self.poisson_distr_2.cdf(ip[1] + transfer - 1)
                                    )
                                    expected_stockout_cost_1 = self.stockout_costs[
                                        0
                                    ] * (
                                        stockout_prob_1
                                        * (self.poisson_lambdas[0] - ip[0])
                                        + ip[0] * self.poisson_distr_1.pmf(ip[0])
                                    )

                                    expected_stockout_cost_2 = self.stockout_costs[
                                        1
                                    ] * (
                                        stockout_prob_2
                                        * (self.poisson_lambdas[1] - ip[1])
                                        + ip[1] * self.poisson_distr_1.pmf(ip[1])
                                    )

                                    #### First stockout but not the second ####
                                    sr_probs_dict.update(
                                        {
                                            (
                                                InventoryState(
                                                    0,
                                                    order1,
                                                    ip[1] - transfer - j,
                                                    order2,
                                                ),
                                                running_reward
                                                - expected_stockout_cost_1,
                                            ): stockout_prob_1
                                            * self.poisson_distr_2.pmf(j)
                                            for j in range(ip[1] - transfer)
                                        }
                                    )

                                    #### Second stockout but not the first ####
                                    sr_probs_dict.update(
                                        {
                                            (
                                                InventoryState(
                                                    ip[0] + transfer - i,
                                                    order1,
                                                    0,
                                                    order2,
                                                ),
                                                running_reward
                                                - expected_stockout_cost_2,
                                            ): stockout_prob_2
                                            * self.poisson_distr_1.pmf(i)
                                            for i in range(ip[0] + transfer)
                                        }
                                    )

                                    #### Both stock out ####

                                    sr_probs_dict[
                                        InventoryState(0, order1, 0, order2),
                                        running_reward
                                        - expected_stockout_cost_1
                                        - expected_stockout_cost_2,
                                    ] = (
                                        stockout_prob_1 * stockout_prob_2
                                    )

                                    d1[(order1, order2, transfer)] = Categorical(
                                        sr_probs_dict
                                    )
                        d[state] = d1
            return d


In [3]:
capacities = (7, 10)
poisson_lambdas = (3, 1.5)
holding_costs = (2.0, 1.0)
stockout_costs = (10.0, 7.5)
order_cost = 4.0
transfer_cost = 1.0

di_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = DoubleInventoryMDPCap(
    capacities=capacities,
    poisson_lambdas=poisson_lambdas,
    holding_costs=holding_costs,
    stockout_costs=stockout_costs,
    order_cost=order_cost,
    transfer_cost=transfer_cost
)


In [4]:
policy_iteration_result(mdp=di_mdp, gamma=0.9)

({NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=0)): -42.066730125012384,
  NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=1)): -34.165572948084204,
  NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=2)): -32.34076286173448,
  NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=3)): -32.10159806366085,
  NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=4)): -33.19618791865703,
  NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=5)): -34.78772859479463,
  NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=6)): -35.22600411650624,
  NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=7)): -35.918955402225066,
  NonTerminal(state=InventoryState(on_hand_1=0, on_order_1=0, on_hand_2=0, on_order_2=8)): -36.7396821390842,