In [1]:
import numpy as np
from random import choices
from typing import Sequence, Tuple, Mapping

S = str
DataType = Sequence[Sequence[Tuple[S, float]]]
ProbFunc = Mapping[S, Mapping[S, float]]
RewardFunc = Mapping[S, float]
ValueFunc = Mapping[S, float]


def get_state_return_samples(data: DataType) -> Sequence[Tuple[S, float]]:
    """
    prepare sequence of (state, return) pairs.
    Note: (state, return) pairs is not same as (state, reward) pairs.
    """
    return [(s, sum(r for (_, r) in l[i:])) for l in data for i, (s, _) in enumerate(l)]


def get_mc_value_function(state_return_samples: Sequence[Tuple[S, float]]) -> ValueFunc:
    """
    Implement tabular MC Value Function compatible with the interface defined above.
    """
    sr_dict = {}

    for (state, ret) in state_return_samples:
        if state not in sr_dict.keys():
            sr_dict[state] = [ret]
        else:
            sr_dict[state].append(ret)
    return {state: np.mean(sr_dict[state]) for state in sr_dict.keys()}


def get_state_reward_next_state_samples(data: DataType) -> Sequence[Tuple[S, float, S]]:
    """
    prepare sequence of (state, reward, next_state) triples.
    """
    return [
        (s, r, l[i + 1][0] if i < len(l) - 1 else "T")
        for l in data
        for i, (s, r) in enumerate(l)
    ]


def get_probability_and_reward_functions(
    srs_samples: Sequence[Tuple[S, float, S]]
) -> Tuple[ProbFunc, RewardFunc]:
    """
    Implement code that produces the probability transitions and the
    reward function compatible with the interface defined above.
    """
    p_dict = {}
    r_dict = {}
    for (init_state, reward, next_state) in srs_samples:
        if init_state not in p_dict.keys():
            p_dict[init_state] = {next_state: 1}
        elif next_state not in p_dict[init_state].keys():
            p_dict[init_state][next_state] = 1
        else:
            p_dict[init_state][next_state] += 1

        if init_state not in r_dict.keys():
            r_dict[init_state] = [reward]
        else:
            r_dict[init_state].append(reward)

    for init_state in p_dict.keys():
        r_dict[init_state] = np.mean(r_dict[init_state])

    return (p_dict, r_dict)


def get_mrp_value_function(prob_func: ProbFunc, reward_func: RewardFunc) -> ValueFunc:
    """
    Implement code that calculates the MRP Value Function from the probability
    transitions and reward function, compatible with the interface defined above.
    Hint: Use the MRP Bellman Equation and simple linear algebra
    """
    nonterminal_states = [key for key in prob_func.keys()]
    tots = [
        np.sum(
            [prob_func[init_state][next_state] for next_state in prob_func[init_state]]
        )
        for init_state in nonterminal_states
    ]
    nnt = len(nonterminal_states)

    P_mat = np.zeros((nnt, nnt))
    R_vec = np.zeros(nnt)
    row_idx = 0

    for init_state in nonterminal_states:
        R_vec[row_idx] = reward_func[init_state]
        col_idx = 0
        for next_state in nonterminal_states:
            P_mat[row_idx][col_idx] = prob_func[init_state][next_state]
            col_idx += 1
        row_idx += 1
    P_mat = P_mat.T
    P_mat /= tots
    lhs = np.eye(nnt) - P_mat.T
    v_vec = np.linalg.solve(lhs, R_vec)
    idx = 0
    vf = {}
    for state in nonterminal_states:
        vf[state] = v_vec[idx]
        idx += 1
    return vf


def get_td_value_function(
    srs_samples: Sequence[Tuple[S, float, S]],
    num_updates: int = 300000,
    learning_rate: float = 0.03,
    learning_rate_decay: int = 30,
) -> ValueFunc:
    """
    Implement tabular TD(0) (with experience replay) Value Function compatible
    with the interface defined above. Let the step size (alpha) be:
    learning_rate * (updates / learning_rate_decay + 1) ** -0.5
    so that Robbins-Monro condition is satisfied for the sequence of step sizes.
    """
    vf = {}
    for srs in srs_samples:
        vf[srs[0]] = 0.0
        vf[srs[2]] = 0.0

    update_num = 1

    def evaluate_step_size(update_num):
        return learning_rate * ((update_num - 1) / learning_rate_decay + 1) ** -0.5

    for (init_state, reward, next_state) in choices(srs_samples, k=num_updates):
        vf[init_state] += evaluate_step_size(update_num) * (
            reward + vf[next_state] - vf[init_state]
        )
        update_num += 1

    return vf


def get_lstd_value_function(srs_samples: Sequence[Tuple[S, float, S]]) -> ValueFunc:
    """
    Implement LSTD Value Function compatible with the interface defined above.
    Hint: Tabular is a special case of linear function approx where each feature
    is an indicator variables for a corresponding state and each parameter is
    the value function for the corresponding state.
    """
    
    vf = {}
    for srs in srs_samples:
        vf[srs[0]] = 0.0
        vf[srs[2]] = 0.0

    nnt = len(vf.keys())

    A = 0.001 * np.eye(nnt)
    K = 100_000

    phi = {}
    for idx, state in enumerate(vf.keys()):
        phi[state] = np.eye(nnt)[:,idx]

    b = np.zeros(nnt)
    for srs in choices(srs_samples, k=K):
        init_state = srs[0]
        r = srs[1]
        next_state = srs[2]
        A += np.outer(phi[init_state], (phi[init_state] - phi[next_state]).T)
        b += phi[init_state] * r
    A /= K
    b /= K
    w = np.linalg.solve(A, b)
    for idx, state in enumerate(vf.keys()):
        vf[state] = w[idx]
    return vf


In [2]:
given_data: DataType = [
    [('A', 2.), ('A', 6.), ('B', 1.), ('B', 2.)],
    [('A', 3.), ('B', 2.), ('A', 4.), ('B', 2.), ('B', 0.)],
    [('B', 3.), ('B', 6.), ('A', 1.), ('B', 1.)],
    [('A', 0.), ('B', 2.), ('A', 4.), ('B', 4.), ('B', 2.), ('B', 3.)],
    [('B', 8.), ('B', 2.)]
]

sr_samps = get_state_return_samples(given_data)



srs_samps = get_state_reward_next_state_samples(given_data)

pfunc, rfunc = get_probability_and_reward_functions(srs_samps)

In [3]:
print("------------- MONTE CARLO VALUE FUNCTION --------------")
print(get_mc_value_function(sr_samps))

print("-------------- MRP VALUE FUNCTION ----------")
print(get_mrp_value_function(pfunc, rfunc))

print("------------- TD VALUE FUNCTION --------------")
print(get_td_value_function(srs_samps))

print("------------- LSTD VALUE FUNCTION --------------")
print(get_lstd_value_function(srs_samps))

------------- MONTE CARLO VALUE FUNCTION --------------
{'A': 9.571428571428571, 'B': 5.642857142857143}
-------------- MRP VALUE FUNCTION ----------
{'A': 12.933333333333332, 'B': 9.6}
------------- TD VALUE FUNCTION --------------
{'A': 12.848679979954118, 'B': 9.610724118089085, 'T': 0.0}
------------- LSTD VALUE FUNCTION --------------
{'A': 12.894341269082576, 'B': 9.566396824829738, 'T': 0.0}
