In [23]:
import itertools
from rl.returns import returns
import rl.markov_process as mp
from typing import Iterable, Iterator, Tuple, TypeVar, Dict, Callable
from rl.distribution import Choose
from IPython.display import clear_output
import matplotlib.pyplot as plt


## Write a function td_tabular for Tabular TD Prediction.

In [24]:
S = TypeVar('S')
A = TypeVar('A')


def td_tabular(experiences: "Iterable[mp.TransitionStep[S]]",
        gamma: float,
        learning_rate: Callable[[int], float],
        ) -> Dict[S, float]:
    """Tabular TD(0) prediction over a stream of atomic experiences.

    For every transition, the estimate for ``step.state`` is moved towards
    the one-step target ``step.reward + gamma * V(step.next_state)`` by the
    fraction ``learning_rate(n)``, where ``n`` is the number of updates made
    to that state so far, including the current one. Any state without an
    estimate yet (as ``state`` or as ``next_state``) is treated as having
    value 0.

    Args:
        experiences: Transitions to learn from; each item must expose
            ``state``, ``reward`` and ``next_state``. The iterable is
            consumed completely, so an endless stream must be truncated by
            the caller.
        gamma: Discount factor applied to the value of ``next_state``.
        learning_rate: Maps a state's update count (starting at 1) to the
            step size used for that update.

    Returns:
        A dict mapping every state that appeared as ``step.state`` to its
        current value estimate (a single float per state).

    Side effects:
        After every 10000 transitions the running count divided by 100 is
        printed and ``clear_output(wait=True)`` is called, so only the most
        recent progress line stays visible in the notebook.
    """
    value_function: Dict[S, float] = {}
    visit_counts: Dict[S, int] = {}

    for step_number, step in enumerate(experiences, start=1):
        if step_number % 10000 == 0:
            # NOTE(review): the divisor 100 is hard-coded; it equals the
            # number of episodes only when episodes are 100 steps long.
            print(step_number / 100)
            clear_output(wait=True)

        state = step.state
        visits = visit_counts.get(state, 0) + 1
        visit_counts[state] = visits
        alpha = learning_rate(visits)

        # Read both estimates before writing, so a self-transition
        # (next_state == state) bootstraps from the pre-update value.
        current = value_function.get(state, 0)
        td_target = step.reward + gamma * value_function.get(step.next_state, 0)
        value_function[state] = current + alpha * (td_target - current)

    return value_function


## Import the Simple Inventory MRP (code from the textbook).

In [25]:
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite
# Settings for the simple inventory MRP. The first four are forwarded as the
# keyword arguments of the same name (minus the "user_" prefix) to
# SimpleInventoryMRPFinite below; user_gamma is the discount factor handed to
# display_value_function.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0
user_gamma = 0.9

# Build the finite MRP; later cells draw their simulated episodes from this
# same object.
si_mrp = SimpleInventoryMRPFinite( capacity=user_capacity,
                                   poisson_lambda=user_poisson_lambda,
                                   holding_cost=user_holding_cost,
                                   stockout_cost=user_stockout_cost
)
# Show the reference value function for discount user_gamma (one value per
# InventoryState in the output below).
# NOTE(review): presumably this is the exact/DP solution that the TD estimate
# is compared against at the end of the notebook — confirm in the rl library.
si_mrp.display_value_function(gamma=user_gamma)

{InventoryState(on_hand=1, on_order=0): -28.932,
 InventoryState(on_hand=0, on_order=0): -35.511,
 InventoryState(on_hand=0, on_order=1): -27.932,
 InventoryState(on_hand=0, on_order=2): -28.345,
 InventoryState(on_hand=1, on_order=1): -29.345,
 InventoryState(on_hand=2, on_order=0): -30.345}


## Define learning rate and atomic experiences as in the textbook

In [26]:
from rl.function_approx import learning_rate_schedule
from rl.chapter10.prediction_utils import fmrp_episodes_stream
from rl.chapter10.prediction_utils import unit_experiences_from_episodes

# Size of the simulation: the run cell consumes
# num_episode * episode_length atomic experiences in total.
episode_length: int = 100
num_episode: int = 100000

# Arguments for learning_rate_schedule, which produces the step-size
# function handed to td_tabular.
initial_learning_rate: float = 0.03
half_life: float = 1000.0
exponent: float = 0.5

# Reuse the discount factor that was passed to
# si_mrp.display_value_function, so the TD estimate and the reference values
# are always computed for the same gamma.
gamma: float = user_gamma

# Stream of episodes generated from si_mrp, then flattened into single
# transitions with episode_length passed as the per-episode length.
# NOTE(review): assumes both streams are lazy and unbounded, which is why the
# run cell truncates them with itertools.islice — confirm in prediction_utils.
episodes: Iterable[Iterable[mp.TransitionStep[S]]] = fmrp_episodes_stream(si_mrp)
td_experiences: Iterable[mp.TransitionStep[S]] = unit_experiences_from_episodes(
    episodes,
    episode_length
)
learning_rate_func: Callable[[int], float] = learning_rate_schedule(
    initial_learning_rate=initial_learning_rate,
    half_life=half_life,
    exponent=exponent
)

## Use td_tabular to estimate the value function

In [27]:
# Truncate the experience stream to the configured number of transitions
# (episodes times steps per episode) so that td_tabular terminates.
experiences = itertools.islice(td_experiences, episode_length * num_episode)

# Run tabular TD over the truncated stream; the result maps each visited
# state to its estimated value.
value_functions = td_tabular(experiences, gamma, learning_rate_func)

100000.0


## Print the estimated value function to compare with DP

In [30]:
# One "state: value" line per state, in the dict's insertion order.
for state, estimate in value_functions.items():
    print(f"{state}: {estimate}")

InventoryState(on_hand=0, on_order=1): -27.975494777338298
InventoryState(on_hand=1, on_order=1): -29.365020514735743
InventoryState(on_hand=1, on_order=0): -28.94621131687509
InventoryState(on_hand=2, on_order=0): -30.388594059800837
InventoryState(on_hand=0, on_order=0): -35.42135562965067
InventoryState(on_hand=0, on_order=2): -28.22541310935279
