### 3)

In [1]:
from dataclasses import dataclass
import numpy as np
import os
import pprint
import sys
sys.path.append(os.path.abspath("/Users/justincramer/Documents/Coding/CME241/RL-book/"))

from typing import Iterable, Iterator, TypeVar, Callable, Mapping
from rl.distribution import Categorical, Choose
from rl.iterate import converge, converged, last
from rl.markov_decision_process import MarkovDecisionProcess, Policy, \
    TransitionStep, NonTerminal
import rl.markov_process as mp
from rl.returns import returns
import itertools

S = TypeVar('S')

#### a)

In [2]:
def tabular_mc_prediction(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    γ: float,
    episode_length_tolerance: float = 1e-6
) -> Iterator[Mapping[S, float]]:
    """Tabular every-visit Monte Carlo prediction with 1/n step sizes.

    Each trace is converted to return steps with
    ``returns(trace, γ, episode_length_tolerance)``; every return step then
    moves the estimate of its state towards the observed return using
    ``alpha = 1 / (number of updates of that state so far)``, i.e. the
    estimate is the running mean of the returns seen for that state.

    Args:
        traces: Iterable of episodes, each an iterable of transition steps.
        γ: Discount factor forwarded to ``returns``.
        episode_length_tolerance: Forwarded unchanged to ``returns``
            (presumably controls where non-terminating traces are
            truncated -- confirm against ``rl.returns``).

    Yields:
        A snapshot of the state -> value table: first the empty table, then
        one snapshot after every single state update (not once per episode).
        Each yielded dict is an independent copy, so earlier snapshots stay
        valid when the caller keeps them (e.g. to compare consecutive
        estimates for convergence).
    """
    counts = {}  # state -> number of updates applied to that state
    vf = {}      # state -> current value estimate

    yield dict(vf)
    for trace in traces:
        for step in returns(trace, γ, episode_length_tolerance):
            state = step.state
            visits = counts.get(state, 0) + 1
            counts[state] = visits
            alpha = 1 / visits
            # Incremental mean: unseen states start from an estimate of 0.
            vf[state] = (1 - alpha) * vf.get(state, 0) + alpha * step.return_
            # Copy so that consumers never observe later in-place updates.
            yield dict(vf)

#### b)

In [3]:
def tabular_td_prediction(
    traces: Iterable[Iterable[mp.TransitionStep[S]]],
    γ: float,
    α: float
) -> Iterator[Mapping[S, float]]:
    """Tabular TD(0) prediction with a constant learning rate.

    For every transition step the estimate of ``step.state`` is moved
    towards the one-step target
    ``step.reward + γ * V(step.next_state)`` by a fraction ``α``.
    States that have never been updated are treated as having value 0.

    Args:
        traces: Iterable of episodes, each an iterable of transition steps
            exposing ``state``, ``reward`` and ``next_state``.
        γ: Discount factor applied to the next state's estimate.
        α: Constant step size used for every update.

    Yields:
        A snapshot of the state -> value table: first the empty table, then
        one snapshot after every single update. Each yielded dict is an
        independent copy, so earlier snapshots stay valid when the caller
        keeps them (e.g. to compare consecutive estimates for convergence).
    """
    vf = {}  # state -> current value estimate

    yield dict(vf)
    for trace in traces:
        for step in trace:
            current = vf.get(step.state, 0)
            target = step.reward + γ * vf.get(step.next_state, 0)
            vf[step.state] = (1 - α) * current + α * target
            # Copy so that consumers never observe later in-place updates.
            yield dict(vf)

#### c)

In [4]:
from rl.chapter2.simple_inventory_mrp import SimpleInventoryMRPFinite

In [5]:
# Reference value function as displayed by SimpleInventoryMRPFinite itself;
# the tabular MC / TD estimates further down are meant to be compared to it.
# The user_* names are module-level settings that later cells read as well.
user_capacity = 2
user_poisson_lambda = 1.0
user_holding_cost = 1.0
user_stockout_cost = 10.0

# Discount factor: passed as `gamma` here and as `γ` to the predictors below.
user_gamma = 0.9

si_mrp = SimpleInventoryMRPFinite(
    capacity=user_capacity,
    poisson_lambda=user_poisson_lambda,
    holding_cost=user_holding_cost,
    stockout_cost=user_stockout_cost
)

# Displays the library's value function for this MRP at the chosen discount.
si_mrp.display_value_function(gamma=user_gamma)


{NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.932,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.345,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.345,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.932,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.511,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.345}


In [6]:
# Compare the two tabular predictors on the inventory MRP. Episodes start
# from a state drawn via Choose over the MRP's non-terminal states.
init_distribution: Choose[S] = Choose(si_mrp.non_terminal_states)

# Number of tables pulled from each predictor. Both predictors yield their
# (empty) table once before the first update, so this is one more than the
# number of updates actually applied.
num_yields = 10000
# Constant step size α used by the TD predictor.
td_learning_rate = 0.05

# Give each predictor its own trace stream. Handing the same iterator object
# to both means TD only sees whatever MC left unconsumed if the stream is a
# one-shot generator (presumably it is -- confirm against reward_traces).
traces: Iterable[Iterable[mp.TransitionStep[S]]] = si_mrp.reward_traces(init_distribution)
traces_td: Iterable[Iterable[mp.TransitionStep[S]]] = si_mrp.reward_traces(init_distribution)

predictions_mc = tabular_mc_prediction(traces, γ=user_gamma)
predictions_td = tabular_td_prediction(traces_td, γ=user_gamma, α=td_learning_rate)

# Show the last table produced by each predictor: MC first, then TD.
pprint.pprint(last(itertools.islice(predictions_mc, num_yields)))
print('\n')
pprint.pprint(last(itertools.islice(predictions_td, num_yields)))

{NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -28.43199798242442,
 NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -29.322410461400935,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -28.878713387063662,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -27.69415256092603,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -30.089227769958033,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -35.584176643914475}


{NonTerminal(state=InventoryState(on_hand=1, on_order=1)): -28.421054204591513,
 NonTerminal(state=InventoryState(on_hand=1, on_order=0)): -27.84323742506352,
 NonTerminal(state=InventoryState(on_hand=0, on_order=1)): -26.942646761659827,
 NonTerminal(state=InventoryState(on_hand=2, on_order=0)): -29.807018164280485,
 NonTerminal(state=InventoryState(on_hand=0, on_order=0)): -34.86890595644093,
 NonTerminal(state=InventoryState(on_hand=0, on_order=2)): -27.68952037567707}
