# Temporal Difference Learning
- No necesita terminar el episodio
- Model-free (no necesita un modelo del entorno)
- Tiene bias pero menos varianza que Monte Carlo
- Depende fuertemente de los valores iniciales por el bias

In [1]:
import gridworld_mdp as gw   # defines the MDP for a 4x4 gridworld
import numpy as np

In [2]:
# Ask the gridworld module how many states it defines and display the count.
n_states = gw.get_state_count()
print(n_states)

15


In [21]:
def play_episode_lazy_programmer(gw, policy):
    """Roll out one episode and record every (state, reward) pair visited.

    The start state is drawn uniformly from ``range(gw.get_state_count())``.
    At each step an action index is sampled with ``policy`` as the probability
    vector, then one transition is sampled using the probabilities returned by
    ``gw.get_transitions``. The rollout stops as soon as state 0 is entered
    (state 0 is treated as the terminal state here). At least one step is
    always taken, even when the start state is 0.

    Args:
        gw: Environment exposing ``get_state_count()``,
            ``get_available_actions(state)`` and
            ``get_transitions(state=..., action=...)``. The latter must return
            a sequence of ``(next_state, reward, probability)`` triples.
        policy: Probabilities over the indices of the available actions; it is
            passed as ``p`` to ``np.random.choice``, so its length must match
            the number of available actions.

    Returns:
        A list of ``(state, reward)`` tuples. The first entry is
        ``(start_state, 0)``; every later entry holds the state that was
        entered and the reward received for that transition.
    """
    state = np.random.randint(gw.get_state_count())
    states_and_rewards = [(state, 0)]
    while True:
        actions = gw.get_available_actions(state)
        action = actions[np.random.choice(len(actions), p=policy)]
        transitions = gw.get_transitions(state=state, action=action)
        # Third field of each transition triple is its probability.
        trans_probs = [probability for _, _, probability in transitions]
        state, reward, _ = transitions[np.random.choice(len(trans_probs), p=trans_probs)]
        states_and_rewards.append((state, reward))
        if state == 0:
            break
    return states_and_rewards

In [26]:
def policy_evel_TD0_lazy_programmer(gw, policy, gamma, alpha=1, episodes=1000, seed=None):
    """Estimate state values with TD(0), applying updates after each episode.

    Each episode is first generated in full by
    ``play_episode_lazy_programmer``; the TD(0) update is then applied to every
    consecutive pair of recorded steps, in order.

    Args:
        gw: Environment passed through to ``play_episode_lazy_programmer``;
            must also expose ``get_state_count()``.
        policy: Action probabilities passed through to the episode generator.
        gamma: Discount factor applied to the value of the next state.
        alpha: Step size of the TD update.
        episodes: Number of episodes to sample.
        seed: Optional seed for NumPy's global RNG. When ``None`` (default)
            the RNG state is left untouched, so the caller controls
            reproducibility instead of the function reading a global ``seed``.

    Returns:
        NumPy array with one value estimate per state, initialised with
        uniform random numbers in [0, 1) and refined by the updates.
    """
    if seed is not None:
        np.random.seed(seed)
    # Size the table from the environment that was passed in, not from a global.
    V = np.random.random(gw.get_state_count())
    for _ in range(episodes):
        states_and_rewards = play_episode_lazy_programmer(gw, policy)
        # Pair each step with its successor: (s, .) -> (s2, r).
        for (s, _unused), (s2, r) in zip(states_and_rewards, states_and_rewards[1:]):
            V[s] = V[s] + alpha * (r + gamma * V[s2] - V[s])
    return V

In [27]:
# Run the episode-buffering TD(0) evaluation.
# NOTE(review): `seed` is not defined in the visible cells — confirm it is set earlier.
np.random.seed(seed)
policy = [0.25, 0.25, 0.25, 0.25]  # used as `p` when sampling an action index
gamma = 0.9  # multiplies the next state's value in the TD target
policy_evel_TD0_lazy_programmer(gw, policy, gamma, alpha = 0.05, episodes=10000)

array([ 0.00868931, -5.74312299, -7.03889983, -7.71758402, -5.05904968,
       -6.72712856, -7.22716084, -7.40129112, -7.25137599, -7.31717797,
       -6.87068725, -6.20788345, -7.7570374 , -7.21293701, -5.10723724])

In [28]:
def play_episode(V, gw, policy, gamma, alpha=1):
    """Play one episode, applying the TD(0) update to ``V`` after every step.

    The start state is drawn uniformly from ``range(gw.get_state_count())``.
    At each step an action index is sampled with ``policy`` as the probability
    vector and a transition is sampled using the probabilities returned by
    ``gw.get_transitions``. The episode stops as soon as state 0 is entered
    (state 0 is treated as the terminal state here). At least one step is
    always taken, even when the start state is 0.

    Args:
        V: Mutable sequence of value estimates indexed by state; updated
            in place.
        gw: Environment exposing ``get_state_count()``,
            ``get_available_actions(state)`` and
            ``get_transitions(state=..., action=...)``. The latter must return
            a sequence of ``(next_state, reward, probability)`` triples.
        policy: Probabilities over the indices of the available actions
            (passed as ``p`` to ``np.random.choice``).
        gamma: Discount factor applied to the value of the next state.
        alpha: Step size of the TD update.

    Returns:
        The same ``V`` object that was passed in, after the updates.
    """
    # Use the environment that was passed in rather than a module-level global.
    state = np.random.randint(gw.get_state_count())
    while True:
        actions = gw.get_available_actions(state)
        action = actions[np.random.choice(len(actions), p=policy)]
        transitions = gw.get_transitions(state=state, action=action)
        # Third field of each transition triple is its probability.
        trans_probs = [probability for _, _, probability in transitions]
        next_state, reward, _ = transitions[np.random.choice(len(trans_probs), p=trans_probs)]
        # TD error: reward + gamma * V[next_state] - V[state]
        V[state] = V[state] + alpha * (reward + gamma * V[next_state] - V[state])
        state = next_state
        if state == 0:
            break
    return V

In [29]:
def policy_evel_TD0(gw, policy, gamma, alpha=1, episodes=1000):
    """Estimate state values with TD(0), updating after every step.

    Args:
        gw: Environment passed through to ``play_episode``; must also expose
            ``get_state_count()``.
        policy: Action probabilities passed through to ``play_episode``.
        gamma: Discount factor passed through to ``play_episode``.
        alpha: Step size passed through to ``play_episode``.
        episodes: Number of episodes to play.

    Returns:
        NumPy array with one value estimate per state, initialised with
        uniform random numbers in [0, 1) and refined episode by episode.
    """
    # Size the table from the environment that was passed in, not from a global.
    V = np.random.random(gw.get_state_count())
    for _ in range(episodes):
        V = play_episode(V, gw, policy, gamma, alpha)
    return V

In [30]:
# Run the step-by-step TD(0) evaluation with the same settings as the buffered run.
# NOTE(review): `seed` is not defined in the visible cells — confirm it is set earlier.
np.random.seed(seed)
policy = [0.25, 0.25, 0.25, 0.25]  # used as `p` when sampling an action index
gamma = 0.9  # multiplies the next state's value in the TD target
policy_evel_TD0(gw, policy, gamma, alpha = 0.05, episodes=10000)

array([ 0.00868931, -5.74312299, -7.03889983, -7.71758402, -5.05904968,
       -6.72712856, -7.22716084, -7.40129112, -7.25137599, -7.31717797,
       -6.87068725, -6.20788345, -7.7570374 , -7.21293701, -5.10723724])

array([ 0.00868931, -5.74312299, -7.03889983, -7.71758402, -5.05904968,
       -6.72712856, -7.22716084, -7.40129112, -7.25137599, -7.31717797,
       -6.87068725, -6.20788345, -7.7570374 , -7.21293701, -5.10723724])