# Assignment 3
https://github.com/joshkmartinez/RL-book/tree/master/A3

# 1

$$ S = \mathbb{R}
\newline
A = \mathbb{R}
\newline
s'\sim N(s,\sigma^2)
\newline
\text{Goal: Minimize this infinite-horizon Expected Discounted-Sum of Costs}
\newline
C = \sum \gamma e^{as'}
\newline
\text{Myopic case: } \gamma = 0
\newline
C = e^{as'}
\newline
\text{and so: } E[C] = E[e^{as'}]
\newline
as' \sim N(as,a^2\sigma^2)
\newline
E[C]=e^{as+((a^2\sigma^{2})/2)}
\newline
\text{Minimizing the expectation:}
\newline
\underset{A}{\mathrm{min}} \text{ } as + ((a^2\sigma^{2})/2)
\newline
\frac{\partial}{\partial a} ( as + ((a^2\sigma^{2})/2) ) = s+a\sigma^2
\newline
\therefore \text{the optimal action is } a=-s/\sigma^2
\newline \newline
\text{Plugging this back into the expected cost: } e^{(-s^2/\sigma^2)+\frac{s^2\sigma^2}{\sigma^4}}
\newline
E[C] = e^{\frac{-s^2}{2\sigma^2}}
$$

# 2
$${B}^*(V)(s)=\max_{a \in A}R(s, a)+\gamma \sum_{s' \in N} P(s, a, s') \cdot V(s')
\newline
V_0(s) =
\begin{bmatrix}
        10\\
        1\\
        0
\end{bmatrix}
\newline
\gamma = 1
\newline
\text{First 2 value iteration updates:}
\newline
V_1(s) = \begin{bmatrix}
        \max_{a \in A}R(s_1, a)+\gamma \sum_{s' \in N} P(s_1, a, s') \cdot V(s')\\
        \max_{a \in A}R(s_2, a)+\gamma \sum_{s' \in N} P(s_2, a, s') \cdot V(s')\\
        0
    \end{bmatrix} = \begin{bmatrix}
        11.2\\
        4.3\\
        0
\end{bmatrix}
\newline
V_2(s) = \begin{bmatrix}
        \max_{a \in A}R(s_1, a)+\gamma \sum_{s' \in N} P(s_1, a, s') \cdot V(s')\\
        \max_{a \in A}R(s_2, a)+\gamma \sum_{s' \in N} P(s_2, a, s') \cdot V(s')\\
        0
    \end{bmatrix} = \begin{bmatrix}
        12.82\\
        5.89\\
        0
\end{bmatrix}
\newline \newline
\pi_k \text{for } k>2 \text{ is the same as } \pi_2 
\newline
\text{Since } V_t 
\text{ is increasing, the transition probabilities from both states show that this will lead the optimal policy to stay the same.}
\newline
0.1(11.2)+0.4(4.3)>2
\newline
0.2(11.2)>2
\newline
\therefore \pi_2 \text{is the ODP}
$$

# 4

In [41]:
import sys
sys.path.append("../")
import numpy as np
from time import time 
from scipy.stats import poisson
import matplotlib.pyplot as plt
from dataclasses import dataclass
from typing import Mapping, Dict, Tuple
from rl.distribution import Categorical
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.dynamic_programming import policy_iteration, value_iteration, value_iteration_result
import pdb

In [42]:
@dataclass(frozen=True)
class InventoryState:
    on_hand_1: int
    on_order_1: int
    on_hand_2: int
    on_order_2: int

    def inventory_position_1(self) -> int:
        return self.on_hand_1 + self.on_order_1

    def inventory_position_2(self) -> int:
        return self.on_hand_2 + self.on_order_2


InvOrderMapping = Mapping[
    InventoryState,
    Mapping[Tuple[int, int, int], Categorical[Tuple[InventoryState, float]]]
]


class InventoryControl(FiniteMarkovDecisionProcess[InventoryState, Tuple[int, int, int]]):

    def __init__(
        self,
        c1: int,
        c2: int,
        lambda1: float,
        lambda2: float,
        h1: float,
        h2: float,
        c_stockout1: float,
        c_stockout2: float,
        c_transport1: float,
        c_transport2: float
    ):
        self.c1: int = c1
        self.lambda1: float = lambda1
        self.h1: float = h1
        self.c_stockout1: float = c_stockout1
        self.c_transport1: float = c_transport1
        self.demand_distribution_1 = poisson(lambda1)
        self.c2: int = c2
        self.lambda2: float = lambda2
        self.h2: float = h2
        self.c_stockout2: float = c_stockout2
        self.c_transport2: float = c_transport2
        self.demand_distribution_2 = poisson(lambda2)
        super().__init__(self.get_action_transition_reward_map())

    def get_action_transition_reward_map(self) -> InvOrderMapping:
        d: Dict[InventoryState, Dict[Tuple[int, int, int], Categorical[Tuple[InventoryState, float]]]] = {}

        # for both stores...
        for alpha_1 in range(self.c1 + 1):
            for beta_1 in range(self.c1 + 1 - alpha_1):
                for alpha_2 in range(self.c2 + 1):
                    for beta_2 in range(self.c2 + 1 - alpha_2):
                        state: InventoryState = InventoryState(on_hand_1=alpha_1, on_order_1=beta_1, on_hand_2=alpha_2, on_order_2=beta_2)
                        # each stores inventory
                        ip_1: int = state.inventory_position_1()
                        ip_2: int = state.inventory_position_2()
                        base_reward: float = - self.h1 * alpha_1 - self.h2 * alpha_2
                        d1: Dict[Tuple[int, int, int], Categorical[Tuple[InventoryState, float]]] = {}

                        # for both of the stores' orders
                        for order_1 in range(self.c1 - ip_1 + 1):
                            for order_2 in range(self.c2 - ip_2 + 1):
                                
                                for transfer in range(
                                    -min(alpha_2, self.c1 - beta_1 - alpha_1),
                                    min(alpha_1, self.c2 - beta_2 - alpha_2) + 1
                                ):
                                    
                                    prob_1: float = 1 - self.demand_distribution_1.cdf(ip_1 - 1)
                                    prob_2: float = 1 - self.demand_distribution_2.cdf(ip_2 - 1)
                                    prob_dict: Dict[Tuple[InventoryState, float], float] = {}

                                    K_1 = 0.
                                    K_2 = 0.
                                    if order_1 == 0 or order_2 ==0:
                                        K_1 = self.c_transport1
                                        
                                    if transfer != 0:
                                        K_2 = self.c_transport2
                                    
                                    K_1 = (self.c_transport1 * min(1, order_1)) + (self.c_transport1 * min(1, order_2))

                                    # for every inventory position of both stores
                                    for i in range(ip_1):
                                        for j in range(ip_2):
                                            if i < (alpha_1 + beta_1) - transfer and j < (transfer + alpha_2 + beta_2 ):
                                                prob_dict[(InventoryState(ip_1 - i, order_1 - transfer, ip_2 - j, order_2 + transfer), float(base_reward - K_1 - K_2))] = self.demand_distribution_1.pmf(i) * self.demand_distribution_2.pmf(j)
                                            
                                            elif j < (transfer + alpha_2 + beta_2):
                                                prob_dict[(InventoryState(0, order_1 - transfer, ip_2 - j, order_2 + transfer), float(base_reward - self.c_stockout1 * (prob_1 * (self.lambda1 - ip_1) + ip_1 * self.demand_distribution_1.pmf(ip_1)) - K_1 - K_2))] = prob_1 * self.demand_distribution_2.pmf(j)
                                            
                                            elif i < (alpha_1 + beta_1 - transfer):
                                                prob_dict[(InventoryState(ip_1 - i, order_1 - transfer, 0, order_2 + transfer), float(base_reward - self.c_stockout2 * (prob_2 * (self.lambda2 - ip_2) + ip_2 * self.demand_distribution_2.pmf(ip_2)) - K_1 - K_2))] = prob_2 * self.demand_distribution_1.pmf(i)

                                    # both stores have inventory less than transfered and ordered
                                    prob_dict[(InventoryState(0, order_1 - transfer, 0, order_2 + transfer), float(base_reward - self.c_stockout1 * (prob_1 * (self.lambda1 - ip_1) + ip_1 * self.demand_distribution_1.pmf(ip_1)) - self.c_stockout2 * (prob_2 * (self.lambda2 - ip_2) + ip_2 * self.demand_distribution_2.pmf(ip_2)) - K_1 - K_2))] = prob_1 * prob_2

                                    d1[(order_1, order_2, transfer)] = Categorical(prob_dict)

                        d[state] = d1
        return d
     

In [43]:
mdp: FiniteMarkovDecisionProcess[InventoryState, Tuple[int, int, int]] =\
    InventoryControl(
        c1 = 4,
        c2 = 3,
        lambda1 = 2,
        lambda2 = 3,
        h1 = 3,
        h2 = 2,
        c_stockout1 = 13,
        c_stockout2 = 10,
        c_transport1 = 6,
        c_transport2 = 3
    )
    
opt_vf_vi, opt_policy_vi = value_iteration_result(mdp, 0.7)

In [49]:
for s, a in opt_policy_vi.action_for.items():
    print((s.on_hand_1, s.on_order_1, a[0])+(s.on_order_1, s.on_order_2, a[1]))

(0, 0, 4, 0, 0, 3)
(0, 0, 4, 0, 1, 2)
(0, 0, 4, 0, 2, 1)
(0, 0, 0, 0, 3, 0)
(0, 0, 0, 0, 0, 0)
(0, 0, 0, 0, 1, 0)
(0, 0, 0, 0, 2, 0)
(0, 0, 0, 0, 0, 0)
(0, 0, 0, 0, 1, 0)
(0, 0, 0, 0, 0, 0)
(0, 1, 3, 1, 0, 3)
(0, 1, 3, 1, 1, 2)
(0, 1, 0, 1, 2, 0)
(0, 1, 0, 1, 3, 0)
(0, 1, 0, 1, 0, 0)
(0, 1, 0, 1, 1, 0)
(0, 1, 0, 1, 2, 0)
(0, 1, 0, 1, 0, 0)
(0, 1, 0, 1, 1, 0)
(0, 1, 0, 1, 0, 0)
(0, 2, 2, 2, 0, 3)
(0, 2, 0, 2, 1, 0)
(0, 2, 0, 2, 2, 0)
(0, 2, 0, 2, 3, 0)
(0, 2, 0, 2, 0, 0)
(0, 2, 0, 2, 1, 0)
(0, 2, 0, 2, 2, 0)
(0, 2, 0, 2, 0, 0)
(0, 2, 0, 2, 1, 0)
(0, 2, 0, 2, 0, 0)
(0, 3, 0, 3, 0, 0)
(0, 3, 0, 3, 1, 0)
(0, 3, 0, 3, 2, 0)
(0, 3, 0, 3, 3, 0)
(0, 3, 0, 3, 0, 0)
(0, 3, 0, 3, 1, 0)
(0, 3, 0, 3, 2, 0)
(0, 3, 0, 3, 0, 0)
(0, 3, 0, 3, 1, 0)
(0, 3, 0, 3, 0, 0)
(0, 4, 0, 4, 0, 0)
(0, 4, 0, 4, 1, 0)
(0, 4, 0, 4, 2, 0)
(0, 4, 0, 4, 3, 0)
(0, 4, 0, 4, 0, 0)
(0, 4, 0, 4, 1, 0)
(0, 4, 0, 4, 2, 0)
(0, 4, 0, 4, 0, 0)
(0, 4, 0, 4, 1, 0)
(0, 4, 0, 4, 0, 0)
(1, 0, 0, 0, 0, 0)
(1, 0, 0, 0, 1, 0)
(1, 0, 0, 0,

This optimal policy makes intuative sense since when both stores are empty the best action is to stock both stores, and then transfer inventory if needed.