In [5]:
from rl.monte_carlo import mc_control
from typing import Iterable, Iterator, Tuple, TypeVar, Callable, Optional, Dict,Mapping

from rl.distribution import Distribution
from rl.function_approx import FunctionApprox
import rl.markov_process as mp
import rl.markov_decision_process as markov_decision_process
from rl.markov_decision_process import (MarkovDecisionProcess)
from rl.returns import returns
from rl.markov_decision_process import FinitePolicy, TransitionStep
from rl.distribution import (Bernoulli, Constant, Categorical, Choose,
                             Distribution, FiniteDistribution)
from IPython.display import clear_output
from pprint import pprint
import numpy as np

## Import simple_inventory_mdp_cap as an example

In [6]:
from rl.chapter3.simple_inventory_mdp_cap import SimpleInventoryMDPCap, InventoryState
from rl.markov_decision_process import FiniteMarkovDecisionProcess
from rl.markov_decision_process import FinitePolicy, StateActionMapping
from rl.markov_process import FiniteMarkovProcess, FiniteMarkovRewardProcess
from rl.distribution import Categorical, Constant
# Parameters of the inventory example. The first four are handed to
# SimpleInventoryMDPCap below; user_gamma is the discount factor that is
# later passed to value iteration.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
user_gamma = 0.9

# The finite MDP that every later cell in this notebook works with.
si_mdp: FiniteMarkovDecisionProcess[InventoryState, int] = SimpleInventoryMDPCap(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost,
)


from rl.dynamic_programming import value_iteration_result

# Print the header, then the value-iteration value function and policy for
# si_mdp; the Monte Carlo results further down are compared against these.
print(
    "MDP Value Iteration Optimal Value Function and Optimal Policy",
    "--------------",
    sep="\n",
)
opt_vf_vi, opt_policy_vi = value_iteration_result(si_mdp, gamma=user_gamma)
pprint(opt_vf_vi)
# end="\n\n" emits the policy followed by one blank line.
print(opt_policy_vi, end="\n\n")

MDP Value Iteration Optimal Value Function and Optimal Policy
--------------
{InventoryState(on_hand=0, on_order=1): -27.66095021630144,
 InventoryState(on_hand=0, on_order=0): -34.89484576629397,
 InventoryState(on_hand=1, on_order=0): -28.660950216301437,
 InventoryState(on_hand=0, on_order=2): -27.991890076067463,
 InventoryState(on_hand=1, on_order=1): -28.991890076067467,
 InventoryState(on_hand=2, on_order=0): -29.991890076067463}
For State InventoryState(on_hand=0, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=1):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
  Do Action 0 with Probability 1.000




Implement tabular Monte Carlo (MC) control with GLIE, using $\epsilon = 1/k$ for the $k$-th episode.

In [12]:
# Generic type variables used in the annotations of the functions below:
# S is the state type (outer key of a Q table), A is the action type (inner key).
S = TypeVar('S')
A = TypeVar('A')

# Define a helper function to get policy from Q
def greedy_epsilon_policy(
        q: Mapping[S, Mapping[A,float]],
        epsilon: float
)->FinitePolicy[S,A]:
    """Build the epsilon-greedy policy induced by a tabular Q function.

    For every state in ``q`` each available action receives probability
    ``epsilon / n_actions``; the action with the largest Q value additionally
    receives ``1 - epsilon``. Ties are resolved in favour of the first
    maximal action in the iteration order of ``q[state]``.

    Args:
        q: table mapping each state to a mapping ``action -> Q value``.
        epsilon: exploration probability spread uniformly over the actions.

    Returns:
        A ``FinitePolicy`` with one ``Categorical`` action distribution per
        state of ``q``.
    """
    # A plain dict: the map is filled by item assignment, which the read-only
    # ``Mapping`` annotation used previously does not permit.
    policy_map: Dict[S, Optional[FiniteDistribution[A]]] = {}

    for state, action_values in q.items():
        if not action_values:
            # NOTE(review): a state without actions is stored as None instead
            # of crashing in max(); presumably FinitePolicy treats None as
            # "terminal state", as the Optional annotation suggests -- confirm.
            policy_map[state] = None
            continue

        explore_prob = epsilon / len(action_values)
        greedy_action = max(action_values, key=action_values.get)

        action_probs = {action: explore_prob for action in action_values}
        action_probs[greedy_action] += 1 - epsilon
        policy_map[state] = Categorical(action_probs)

    return FinitePolicy(policy_map)



def mc_tabular_control(
    simulator: Callable[[Distribution[S],FinitePolicy[S,A]], Iterator[TransitionStep[S, A]]], # given distribution of initial state,policy,return a trace iterator
    state_distribution: Distribution[S],
    gamma: float,
    initial_q: Mapping[S, Mapping[A,float]],
    initial_p: FinitePolicy[S,A],
    tolerance: float = 1e-6

) -> Iterator[Tuple[Mapping[S, Mapping[A, float]], FinitePolicy[S, A]]]:
    """Tabular Monte Carlo control with an epsilon-greedy policy, epsilon = 1/k.

    This is an infinite generator. On iteration ``k`` (starting at 1) it:

    1. simulates one trace with the current policy (``initial_p`` for the
       first trace, afterwards the policy produced by the previous iteration);
    2. converts the trace to per-step returns via ``returns(trace, gamma,
       tolerance)``;
    3. for every step of that trace, moves ``Q(state, action)`` towards the
       observed return with step size ``1 / visits`` (a running average over
       all visits of the pair seen so far);
    4. rebuilds the policy as epsilon-greedy in Q with ``epsilon = 1 / k``
       and yields ``(q, policy)``.

    Args:
        simulator: callable taking the start-state distribution and a policy
            and returning an iterator of transition steps (one trace).
        state_distribution: distribution from which the simulator draws the
            start state.
        gamma: discount factor forwarded to ``returns``.
        initial_q: starting Q table, ``state -> action -> value``. It is
            copied; the caller's table is never modified.
        initial_p: policy used to generate the first trace.
        tolerance: forwarded to ``returns``.

    Yields:
        ``(q, policy)`` after each processed trace. ``q`` is the same dict
        object on every iteration and keeps being updated in place, so copy
        it if a snapshot of an intermediate iteration is needed.

    Raises:
        KeyError: if a trace contains a (state, action) pair that is missing
            from ``initial_q``.
    """
    # Work on a private copy: ``initial_q`` is typed as a read-only Mapping
    # and may be reused by the caller for another run.
    q: Dict[S, Dict[A, float]] = {
        state: dict(action_values) for state, action_values in initial_q.items()
    }
    p = initial_p
    count: Dict[Tuple[S, A], int] = {}   # visits per (state, action) pair
    trace_count = 0                      # number of traces processed so far

    while True:
        trace_count += 1
        epsilon = 1 / trace_count
        trace = simulator(state_distribution, p)

        # Each element exposes .state, .action and .return_.
        for step in returns(trace, gamma, tolerance):
            pair = (step.state, step.action)
            visits = count.get(pair, 0) + 1
            count[pair] = visits
            # Incremental mean of the returns observed for this pair.
            q[step.state][step.action] += 1 / visits * (step.return_ - q[step.state][step.action])

        # Policy improvement with the decayed exploration rate.
        p = greedy_epsilon_policy(q, epsilon)
        yield q, p

Run the MC control algorithm on the simple inventory example, using `si_mdp.simulate_actions` as the simulator.

In [10]:
import itertools

my_simulator = si_mdp.simulate_actions
states = si_mdp.states()
# Episodes start from a uniformly chosen state.
my_state_distribution = Categorical({state: 1/len(states) for state in states})
my_gamma = 0.9
num_episodes = 5000   # number of MC control iterations to run

# Initial Q value (-2.0 for every state-action pair) and a uniform initial policy.
my_initial_q = {state: {action: -2.0 for action in si_mdp.actions(state)} for state in states}
my_initial_p = FinitePolicy({
    state: Categorical({action: 1/len(si_mdp.actions(state)) for action in si_mdp.actions(state)})
    for state in states
})

# NOTE(review): no random seed is set in this notebook, so the numbers below
# presumably change from run to run.
x = mc_tabular_control(
    simulator = my_simulator,   # given distribution of initial state, policy, return a trace iterator
    state_distribution = my_state_distribution,
    gamma = my_gamma,
    initial_q = my_initial_q,
    initial_p = my_initial_p
)

# Advance the generator and keep only the result of the last iteration.
# islice skips the first num_episodes - 1 results without storing them, and
# nothing is assigned to `_`, so IPython's last-output variable stays intact.
q_final, p_final = next(itertools.islice(x, num_episodes - 1, num_episodes))

# State values under the final policy: V(s) = sum_a pi(a|s) * Q(s, a).
V = {}
for state, action_values in q_final.items():
    action_dist = p_final.act(state)
    V[state] = sum(
        (action_dist.probability(action) * value for action, value in action_values.items()),
        0.0
    )

pprint(V)
print(p_final)

{InventoryState(on_hand=0, on_order=1): -27.68414929855416,
 InventoryState(on_hand=0, on_order=0): -34.93079434279971,
 InventoryState(on_hand=1, on_order=0): -28.689134875877556,
 InventoryState(on_hand=0, on_order=2): -28.594010741244826,
 InventoryState(on_hand=1, on_order=1): -29.021871164257433,
 InventoryState(on_hand=2, on_order=0): -30.02724869732018}
For State InventoryState(on_hand=0, on_order=0):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 1.000
  Do Action 2 with Probability 0.000
For State InventoryState(on_hand=0, on_order=1):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
  Do Actio

Change the initialization of Q

In [13]:
import itertools

my_simulator = si_mdp.simulate_actions
states = si_mdp.states()
# Episodes start from a uniformly chosen state.
my_state_distribution = Categorical({state: 1/len(states) for state in states})
my_gamma = 0.9
num_episodes = 5000   # number of MC control iterations to run

# Initial Q value (-1.0 for every state-action pair) and a uniform initial policy.
my_initial_q = {state: {action: -1.0 for action in si_mdp.actions(state)} for state in states}
my_initial_p = FinitePolicy({
    state: Categorical({action: 1/len(si_mdp.actions(state)) for action in si_mdp.actions(state)})
    for state in states
})

# NOTE(review): no random seed is set in this notebook, so the numbers below
# presumably change from run to run.
x = mc_tabular_control(
    simulator = my_simulator,   # given distribution of initial state, policy, return a trace iterator
    state_distribution = my_state_distribution,
    gamma = my_gamma,
    initial_q = my_initial_q,
    initial_p = my_initial_p
)

# Advance the generator and keep only the result of the last iteration.
# islice skips the first num_episodes - 1 results without storing them, and
# nothing is assigned to `_`, so IPython's last-output variable stays intact.
q_final, p_final = next(itertools.islice(x, num_episodes - 1, num_episodes))

# State values under the final policy: V(s) = sum_a pi(a|s) * Q(s, a).
V = {}
for state, action_values in q_final.items():
    action_dist = p_final.act(state)
    V[state] = sum(
        (action_dist.probability(action) * value for action, value in action_values.items()),
        0.0
    )

pprint(V)
print(p_final)


{InventoryState(on_hand=0, on_order=1): -27.991239485068757,
 InventoryState(on_hand=0, on_order=0): -35.5316042017312,
 InventoryState(on_hand=1, on_order=0): -28.976309379523904,
 InventoryState(on_hand=0, on_order=2): -28.36273126802262,
 InventoryState(on_hand=1, on_order=1): -29.38442287203882,
 InventoryState(on_hand=2, on_order=0): -30.369751798800348}
For State InventoryState(on_hand=0, on_order=0):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 0.000
  Do Action 2 with Probability 1.000
For State InventoryState(on_hand=0, on_order=1):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=0, on_order=2):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=1, on_order=0):
  Do Action 0 with Probability 0.000
  Do Action 1 with Probability 1.000
For State InventoryState(on_hand=1, on_order=1):
  Do Action 0 with Probability 1.000
For State InventoryState(on_hand=2, on_order=0):
  Do Action

Changing the initialization value results in a different policy. Why?
