In [1]:
trajectories = [
    [('uni', 'take_exam', 1)],
    [('uni', 'take_exam', 1)],
    [('uni', 'take_exam', 0)],
    [('uni', 'take_exam', 1)],
    [('uni', 'take_exam', 1)],
    [('uni', 'take_exam', 1)],
    [('uni', 'take_exam', 1)],
    [('home', 'travel', 0), ('uni', 'take_exam', 0)]
]

1. What would be $v_\pi(uni)$ and $v_\pi(home)$ computed using MC?

In [2]:
def mc(trajectories):
    sums = {'uni': 0, 'home': 0}
    counts = {'uni': 0, 'home': 0}

    for trajectory in trajectories:
        for state, action, reward in trajectory:
            sums[state] += reward
            counts[state] += 1

    v = {state: sums[state] / counts[state] if counts[state] > 0 else 0 for state in sums}

    return v

In [3]:
mc(trajectories)

{'uni': 0.75, 'home': 0.0}

2. What would be $v_\pi(uni)$ and $v_\pi(home)$ computed using TD?

In [4]:
def td(trajectories, alpha=0.1, gamma=1):
    v = {'uni': 0, 'home': 0}

    for trajectory in trajectories:
        for i, (state, action, reward) in enumerate(trajectory):
            next_state = trajectory[i + 1][0] if i + 1 < len(trajectory) else None
            # apply TD(0) update
            td_target = reward + (gamma * v[next_state] if next_state else 0)
            # update value estimate
            v[state] += alpha * (td_target - v[state])

    return v

In [5]:
td(trajectories)

{'uni': 0.41048379, 'home': 0.04560931000000001}

11. Using the model $\hat{\mathcal{M}}$ that you have inferred, what would be $v_\pi(uni)$ and $v_\pi(home)$ computed using DP?

In [6]:
# assuming deterministic policy that chooses 'travel' at 'home' and 'take_exam' at 'uni',
# and that 'take_exam' ends the episode, we have the following model:
# For 'home': action 'travel' leads to 'uni' with reward 0 and transition probability 1
# For 'uni': action 'take_exam' has a reward of 0.75, and it's a terminal state
rewards = {'home': {'travel': 0}, 'uni': {'take_exam': 0.75}}
transitions = {'home': {'travel': 'uni'}, 'uni': {'take_exam': 'uni'}}
gamma = 1

v_dp = {'uni': 0, 'home': 0}

# we know that policy is deterministic as well as transition function,
# we can compute the value of each state directly without the need for iteration:
v_dp['uni'] = rewards['uni']['take_exam']  # terminal state after 'take_exam'
v_dp['home'] = rewards['home']['travel'] + gamma * v_dp[transitions['home']['travel']]

In [7]:
v_dp

{'uni': 0.75, 'home': 0.75}

In [8]:
new_trajectories = [
    [('uni', 'take_exam', 1)],
    [('uni', 'take_exam', 1)],
    [('home', 'travel', 0), ('uni', 'take_exam', 1)],
    [('uni', 'take_exam', 1)],
    [('uni', 'take_exam', 0)],
    [('uni', 'take_exam', 1)],
    [('home', 'travel', 0), ('uni', 'take_exam', 1)],
    [('home', 'travel', 0), ('uni', 'take_exam', 0)],
]

13. Compute again $v_\pi(uni)$ and $v_\pi(home)$ using MC from real and simulated data.

In [9]:
mc(trajectories + new_trajectories)

{'uni': 0.75, 'home': 0.0}

14. Compute again $v_\pi(uni)$ and $v_\pi(home)$ using TD from real and simulated data.

In [10]:
td(trajectories + new_trajectories)

{'uni': 0.5733326018315259, 'home': 0.19297827330335024}