In [3]:
import numpy as np

# --- 1. ENVIRONMENT ---
class GridWorld:
    """Square grid world whose top-left and bottom-right cells are terminal.

    States are integers numbered row by row, so ``state = row * grid_size + col``.
    Every transition out of a non-terminal state yields a reward of -1.

    Args:
        grid_size: Side length of the grid. Must be at least 2 so that at
            least one non-terminal state exists. Defaults to 4.

    Raises:
        ValueError: If ``grid_size`` is smaller than 2.
    """

    def __init__(self, grid_size=4):
        if grid_size < 2:
            # With a 1x1 grid every state is terminal and reset() could
            # never find a valid start state.
            raise ValueError("grid_size must be at least 2")
        self.grid_size = grid_size
        # First and last state indices (opposite corners) end the episode.
        self.terminal_states = [0, grid_size * grid_size - 1]
        self.actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']

    def step(self, state, action):
        """Apply ``action`` in ``state``.

        Moves that would leave the grid keep the agent in place. An action
        that is not one of ``self.actions`` also leaves the position
        unchanged (and still costs -1).

        Returns:
            A ``(next_state, reward, done)`` tuple. Stepping from a terminal
            state returns ``(state, 0, True)``.
        """
        if state in self.terminal_states:
            return state, 0, True

        row, col = divmod(state, self.grid_size)
        last = self.grid_size - 1

        # Clamp to the grid so walking into a wall is a no-op move.
        if action == 'UP':
            row = max(row - 1, 0)
        elif action == 'DOWN':
            row = min(row + 1, last)
        elif action == 'LEFT':
            col = max(col - 1, 0)
        elif action == 'RIGHT':
            col = min(col + 1, last)

        next_state = row * self.grid_size + col
        reward = -1
        done = next_state in self.terminal_states
        return next_state, reward, done

    def reset(self):
        """Return a random non-terminal start state (rejection sampling)."""
        # Derive the bound from grid_size so sampled states always lie on
        # the grid, whatever size was configured.
        n_states = self.grid_size * self.grid_size
        start_state = np.random.randint(0, n_states)
        while start_state in self.terminal_states:
            start_state = np.random.randint(0, n_states)
        return start_state

# --- 2. THE ALGORITHM: TD(Lambda) ---
def td_lambda_learning(env, lam=0.5, num_episodes=5000, alpha=0.1, gamma=1.0):
    """
    Estimate V(s) for a uniformly random policy with backward-view TD(lambda).

    Eligibility traces (accumulating) spread each one-step TD error over the
    states visited earlier in the same episode.

    env:          object exposing grid_size, actions, reset() and
                  step(state, action) -> (next_state, reward, done).
    lam:          trace-decay parameter; with 0 only the state just visited
                  is updated (TD(0)), with 1 traces fade only through gamma
                  (Monte Carlo-like).
    num_episodes: number of episodes to simulate.
    alpha:        step size.
    gamma:        discount factor.

    Returns a 1-D array of grid_size * grid_size value estimates.
    """
    n_states = env.grid_size * env.grid_size
    V = np.zeros(n_states)

    print(f"Running TD({lam}) Learning...")

    # Per-step shrink factor applied to every trace.
    trace_decay = gamma * lam

    for _ in range(num_episodes):
        # Traces never carry over between episodes.
        E = np.zeros(n_states)
        state = env.reset()

        while True:
            # Behaviour policy: pick any action with equal probability.
            action = np.random.choice(env.actions)
            next_state, reward, done = env.step(state, action)

            # One-step TD error; no bootstrapping past the end of an episode.
            bootstrap = 0 if done else gamma * V[next_state]
            delta = reward + bootstrap - V[state]

            # Mark the state we just left as eligible for this error.
            E[state] += 1

            # Every state moves in proportion to its current trace.
            V += alpha * delta * E

            # Fade all traces before the next step.
            E *= trace_decay

            state = next_state
            if done:
                break

    return V

# --- 3. EXECUTION ---
if __name__ == "__main__":
    env = GridWorld()

    # Keep lambda in one place so the call and the report header cannot drift.
    lam = 0.5
    v_values = td_lambda_learning(env, lam=lam)

    print(f"\nState-Value Function V(s) via TD({lam}):")
    print("-" * 30)
    # Lay the flat value vector out in the same shape as the grid itself.
    print(np.round(v_values.reshape(env.grid_size, env.grid_size), 2))

Running TD(0.5) Learning...

State-Value Function V(s) via TD(0.5):
------------------------------
[[  0.   -12.85 -18.76 -20.62]
 [-12.41 -15.71 -19.13 -19.47]
 [-17.95 -19.31 -17.2  -13.9 ]
 [-20.94 -19.75 -13.09   0.  ]]


### Explanation of the TD(Lambda) Code

This code implements the TD(Lambda) reinforcement learning algorithm to estimate the state-value function for a simple 4x4 grid world environment. Let's break it down into its three main parts:

1.  **GridWorld Environment:** This `GridWorld` class defines a 4x4 grid. States are numbered 0 to 15. It has two terminal states (0 and 15) and four possible actions (UP, DOWN, LEFT, RIGHT). The `step` method takes a state and an action, returning the next state, a reward of -1 for each step (encouraging shorter paths), and whether the episode is done. The `reset` method places the agent in a random non-terminal starting state.

2.  **TD(Lambda) Algorithm:** The `td_lambda_learning` function is the core of the reinforcement learning agent. It aims to learn the value `V(s)` for each state `s`, representing the expected cumulative future reward from that state when the agent follows the random-walk policy. It initializes a value function `V` (all zeros) and then iterates over many episodes:
    *   **Eligibility Traces (`E`):** At the beginning of each episode, eligibility traces are reset. These traces keep a memory of recently visited states, giving them credit or blame for rewards received later.
    *   **Random Walk Policy:** The agent explores the environment by choosing actions randomly.
    *   **TD Error Calculation:** For each step, it calculates the TD error (`delta`): the TD target (the reward plus the discounted value estimate of the next state, or just the reward when the step ends the episode) minus the current value estimate of the state the agent was in.
    *   **Updating Traces:** The trace for the current state is incremented.
    *   **Value Function Update:** The value function `V` for all states is updated based on their eligibility trace and the TD error. States with higher eligibility traces (meaning they were visited more recently or frequently) are updated more significantly.
    *   **Decaying Traces:** After each update, the eligibility traces for all states are multiplied by `gamma * lam`, reflecting that older memories fade. The `lam` (lambda) parameter controls this decay, effectively blending between TD(0) (`lam=0`: only the state just visited is updated, using a one-step bootstrapped target) and Monte Carlo (`lam=1`: credit for each reward is propagated back over the whole episode so far).

3.  **Execution:** This section creates an instance of the `GridWorld` environment and then calls the `td_lambda_learning` function with a lambda value of 0.5. Finally, it prints the learned state-value function `V(s)` for all states, reshaped into a 4x4 grid for easy visualization.