In [3]:
from rl.monte_carlo import mc_control
from typing import Iterable, Iterator, Tuple, TypeVar, Callable, Optional, Dict,Mapping

from rl.distribution import Distribution
from rl.function_approx import FunctionApprox
import rl.markov_process as mp
import rl.markov_decision_process as markov_decision_process
from rl.markov_decision_process import (MarkovDecisionProcess)
from rl.returns import returns
from rl.markov_decision_process import FinitePolicy, TransitionStep
from rl.distribution import (Bernoulli, Constant, Categorical, Choose,
                             Distribution, FiniteDistribution)
from IPython.display import clear_output
from pprint import pprint
import numpy as np
import math

## Import simple_inventory_mdp_cap as an example

In [4]:
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap, InventoryState
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.markov_decision_process import FinitePolicy, StateActionMapping
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical, Constant
# Parameters of the capacity-constrained inventory problem.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
user_gamma = 0.9

# Build the finite MDP that every later cell works with.
si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = SimpleInventoryMDPCap(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)

from rl.dynamic_programming import value_iteration_result

# Dynamic-programming baseline: value iteration on the exact model.
print(
    "MDP Value Iteration Optimal Value Function and Optimal Policy",
    "--------------",
    sep="\n",
)
opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
pprint(opt_vf_vi)
print(opt_policy_vi, end="\n\n")

MDP Value Iteration Optimal Value Function and Optimal Policy
--------------
{InventoryState(on_hand=1, on_order=0): -28.660950216301437,
 InventoryState(on_hand=0, on_order=1): -27.66095021630144,
 InventoryState(on_hand=0, on_order=2): -27.991890076067463,
 InventoryState(on_hand=0, on_order=0): -34.89484576629397,
 InventoryState(on_hand=1, on_order=1): -28.991890076067467,
 InventoryState(on_hand=2, on_order=0): -29.991890076067463}
For State InventoryState(on_hand=0, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=1):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
  Do Action 0 with Probability 1.000




## Implement tabular Q-learning with the function Qlearning_tabular

In [7]:
# Generic type variables used in the annotations below:
# S stands for the state type, A for the action type.
S = TypeVar('S')
A = TypeVar('A')

# Define a helper function to get policy from Q
def greedy_epsilon_policy(
        q: Mapping[S, Mapping[A, float]],
        epsilon: float
) -> FinitePolicy[S, A]:
    """Build the epsilon-greedy policy induced by a tabular Q-function.

    For every state in ``q`` each available action receives probability
    ``epsilon / (number of actions)``, and the action with the highest
    Q-value receives the remaining ``1 - epsilon`` on top of that. Ties are
    broken in favour of the first maximal action in the mapping's
    iteration order.

    Args:
        q: Tabular action-value function, ``q[state][action] -> value``.
        epsilon: Exploration probability (``0`` gives the greedy policy).

    Returns:
        A ``FinitePolicy`` with one action distribution per state of ``q``.
    """
    # Follows the structure of rl.markov_decision_process.policy_from_q,
    # but stores the policy explicitly in a map.
    policy_map: Dict[S, Optional[FiniteDistribution[A]]] = {}

    for state, action_values in q.items():
        if not action_values:
            # A state without actions has no greedy action; max() would
            # raise ValueError here. The map's value type is Optional, so
            # None is stored instead.
            # NOTE(review): presumably None marks a terminal state for
            # FinitePolicy -- confirm against the library.
            policy_map[state] = None
            continue

        greedy_action = max(action_values, key=action_values.get)
        explore_prob = epsilon / len(action_values)
        probabilities = {action: explore_prob for action in action_values}
        probabilities[greedy_action] += 1 - epsilon
        policy_map[state] = Categorical(probabilities)

    return FinitePolicy(policy_map)



def Qlearning_tabular(
    simulator: Callable[[S, A], Tuple[S, float]],
    state_distribution: Distribution[S],
    gamma: float,
    initial_q: Mapping[S, Mapping[A, float]],
    learning_rate_func: Callable[[int], float],
    tolerance: float = 1e-6,
    nstop: Optional[int] = None
) -> Iterator[Tuple[Dict[S, Dict[A, float]], FinitePolicy[S, A]]]:
    """Tabular Q-learning control with an epsilon-greedy behaviour policy.

    Runs an endless sequence of episodes. In episode ``k`` (1-based) the
    behaviour policy is epsilon-greedy w.r.t. the current Q-table with
    ``epsilon = 1 / k``. After every step the visited entry is updated with

        Q(s, a) += alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))

    where ``alpha = learning_rate_func(n)`` and ``n`` is the number of times
    the pair ``(s, a)`` has been visited so far.

    The implementation assumes there are no terminal states: every state
    returned by ``simulator`` must have an entry in the Q-table.

    Args:
        simulator: Maps ``(state, action)`` to a sampled
            ``(next_state, reward)`` pair.
        state_distribution: Distribution the start state of each episode is
            sampled from.
        gamma: Discount factor.
        initial_q: Initial Q-table, ``initial_q[state][action] -> value``.
            It is copied and never modified.
        learning_rate_func: Step size as a function of the visit count of
            the updated ``(state, action)`` pair.
        tolerance: For ``0 < gamma < 1`` an episode is cut off after
            ``round(log(tolerance) / log(gamma))`` steps, i.e. roughly when
            ``gamma ** steps`` drops below ``tolerance``.
        nstop: Episode length used when ``gamma`` is not in ``(0, 1)``;
            ignored otherwise.

    Yields:
        After every episode, a pair ``(q, policy)``: the current Q-table and
        the epsilon-greedy policy derived from it. ``q`` is the live table
        (the same dict object on every iteration, updated further by later
        episodes); copy it if a snapshot is needed.

    Raises:
        ValueError: If ``gamma`` is not in ``(0, 1)`` and ``nstop`` is not
            given (raised when the first item is requested).
    """
    # Work on a private copy so the caller's initial table is not mutated.
    q: Dict[S, Dict[A, float]] = {
        state: dict(action_values)
        for state, action_values in initial_q.items()
    }
    # Number of times each (state, action) pair has been updated.
    visit_count: Dict[Tuple[S, A], int] = {}

    if 0 < gamma < 1:
        max_steps = round(math.log(tolerance) / math.log(gamma))
    elif nstop is not None:
        max_steps = nstop
    else:
        # Without discounting there is no natural truncation horizon.
        raise ValueError(
            "nstop must be provided when gamma is not in the interval (0, 1)"
        )

    trace_count = 0      # number of episodes generated so far
    while True:
        trace_count += 1
        epsilon = 1 / trace_count

        state = state_distribution.sample()      # initialize S

        step_count = 0
        while step_count < max_steps:
            step_count += 1

            # Choose A from S with the epsilon-greedy policy derived from Q.
            # Only the current state's action distribution is needed, so the
            # policy is built for that single state instead of the whole
            # table.
            behaviour = greedy_epsilon_policy({state: q[state]}, epsilon)
            action = behaviour.act(state).sample()
            visit_count[(state, action)] = \
                visit_count.get((state, action), 0) + 1

            next_state, reward = simulator(state, action)

            # Off-policy target: bootstrap from the greedy value of S'.
            td_target = reward + gamma * max(q[next_state].values())
            alpha = learning_rate_func(visit_count[(state, action)])
            q[state][action] += alpha * (td_target - q[state][action])
            state = next_state

        # Derive the reported policy from the fully updated table so that
        # the yielded pair is consistent (and defined even for 0 steps).
        policy = greedy_epsilon_policy(q, epsilon)
        yield q, policy

## Simulate simple inventory with Q-learning

In [8]:
from rl.function_approx import learning_rate_schedule

# Hyper-parameters handed to learning_rate_schedule to build the step-size
# function used by the Q-learning updates.
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5

learning_rate_func: Callable[[int], float] = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=half_life,
    exponent=exponent,
)

def simulator(state: S, action: A) -> Tuple[S, float]:
    """Sample one transition of the inventory MDP ``si_mdp``.

    Args:
        state: Current state.
        action: Action taken in ``state``.

    Returns:
        One sample drawn from ``si_mdp.step(state, action)``; the Q-learning
        loop unpacks it as ``(next_state, reward)``.
    """
    return si_mdp.step(state, action).sample()

states = si_mdp.states()
# Each episode starts from a state drawn uniformly from all states.
state_distribution = Categorical({state: 1 / len(states) for state in states})
# Constant initial guess for every (state, action) pair.
initial_q = {
    state: {action: -3.0 for action in si_mdp.actions(state)}
    for state in states
}

# Use the same discount factor as the value-iteration baseline so that the
# two results are directly comparable.
x = Qlearning_tabular(
    simulator=simulator,
    state_distribution=state_distribution,
    gamma=user_gamma,
    initial_q=initial_q,
    learning_rate_func=learning_rate_func,
)

import itertools

# Number of episodes to run before reading off the result.
# NOTE(review): no random seed is set in the visible cells, so the exact
# numbers below will differ from run to run.
num_episodes = 10000
*_, (q_final, p_final) = itertools.islice(x, num_episodes)

# State-value function of the final policy: V(s) = sum_a pi(a|s) * Q(s, a).
V = {
    state: sum(
        p_final.act(state).probability(action) * action_value
        for action, action_value in action_values.items()
    )
    for state, action_values in q_final.items()
}

pprint(V)
print(p_final)

{InventoryState(on_hand=1, on_order=0): -28.61970258219844,
 InventoryState(on_hand=0, on_order=1): -27.63213966504941,
 InventoryState(on_hand=0, on_order=2): -27.8579055432372,
 InventoryState(on_hand=0, on_order=0): -34.91922886200373,
 InventoryState(on_hand=1, on_order=1): -28.92941518317391,
 InventoryState(on_hand=2, on_order=0): -29.96029094945271}
For State InventoryState(on_hand=0, on_order=0):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 1.000
  Do Action 2 with Probability 0.000
For State InventoryState(on_hand=0, on_order=1):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
  Do Action 0 

We see very good agreement with the DP (value iteration) results here, for all kinds of initialization.