In [1]:
import numpy as np

# --- 1. ENVIRONMENT (4x4 Grid World) ---
class GridWorld:
    """Deterministic square grid world with two terminal corner states.

    States are integers numbered in row-major order, from 0 (top-left) to
    ``grid_size * grid_size - 1`` (bottom-right). Both of those corners are
    terminal. Every move made from a non-terminal state yields a reward of -1.

    Args:
        grid_size: Side length of the square grid. Defaults to 4, which
            gives the 16-state world. Must be at least 2 so that at least
            one non-terminal state exists.

    Raises:
        ValueError: If ``grid_size`` is smaller than 2.
    """

    def __init__(self, grid_size=4):
        if grid_size < 2:
            # With a 1x1 grid every state is terminal and reset() could
            # never find a valid start state.
            raise ValueError("grid_size must be at least 2")
        self.grid_size = grid_size
        # Top-Left, Bottom-Right (derived from grid_size, not hard-coded)
        self.terminal_states = [0, grid_size * grid_size - 1]
        self.actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']

    def step(self, state, action):
        """Apply ``action`` in ``state`` and return the transition.

        Moves that would leave the grid are clamped, so the agent stays on
        the edge cell. An action string that is not one of ``self.actions``
        leaves the position unchanged but still costs -1.

        Returns:
            A ``(next_state, reward, done)`` tuple. From a terminal state the
            result is ``(state, 0, True)``; otherwise the reward is -1 and
            ``done`` tells whether ``next_state`` is terminal.
        """
        if state in self.terminal_states:
            return state, 0, True

        row, col = divmod(state, self.grid_size)

        if action == 'UP':
            row = max(row - 1, 0)
        elif action == 'DOWN':
            row = min(row + 1, self.grid_size - 1)
        elif action == 'LEFT':
            col = max(col - 1, 0)
        elif action == 'RIGHT':
            col = min(col + 1, self.grid_size - 1)

        next_state = row * self.grid_size + col
        reward = -1
        done = next_state in self.terminal_states
        return next_state, reward, done

    def reset(self):
        """Return a random non-terminal start state.

        Uses rejection sampling over all states of the grid, redrawing
        whenever a terminal state comes up.
        """
        n_states = self.grid_size * self.grid_size
        start_state = np.random.randint(0, n_states)
        while start_state in self.terminal_states:
            start_state = np.random.randint(0, n_states)
        return start_state

# --- 2. THE ALGORITHM: TD(0) Prediction ---
def td_zero_learning(env, num_episodes=5000, alpha=0.1, gamma=1.0):
    """
    Estimate the state-value function V(s) of a random policy with TD(0).

    ``env`` must expose ``grid_size``, ``actions``, ``reset()`` and
    ``step(state, action)`` returning ``(next_state, reward, done)``.
    After every single transition the value of the state just left is
    moved a fraction ``alpha`` of the way towards the one-step target
    ``reward + gamma * V(next_state)``.

    Returns a 1-D numpy array holding one value per state.
    """
    # Every state starts with an estimate of 0
    n_states = env.grid_size * env.grid_size
    V = np.zeros(n_states)

    print(f"Running TD(0) Learning for {num_episodes} episodes...")

    for _ in range(num_episodes):
        s = env.reset()

        while True:
            # Behaviour being evaluated: pick any action at random
            a = np.random.choice(env.actions)
            s_next, r, finished = env.step(s, a)

            # Bootstrap from the successor's estimate, except that a
            # terminal successor contributes nothing.
            if finished:
                bootstrap = 0
            else:
                bootstrap = V[s_next]
            target = r + gamma * bootstrap

            # V(s) <- V(s) + alpha * (target - V(s))
            V[s] += alpha * (target - V[s])

            if finished:
                break
            s = s_next

    return V

# --- 3. EXECUTION ---
if __name__ == "__main__":
    # Build the environment and evaluate the random policy on it.
    env = GridWorld()

    # Run TD(0)
    v_values = td_zero_learning(env)

    print("\nState-Value Function V(s) learned via TD(0):")
    print("-" * 30)

    # Lay the flat value vector out as the grid itself. The side length is
    # taken from the environment so the display always matches its size.
    side = env.grid_size
    print(np.round(v_values.reshape(side, side), 2))

Running TD(0) Learning for 5000 episodes...

State-Value Function V(s) learned via TD(0):
------------------------------
[[  0.   -15.46 -20.66 -23.22]
 [-14.83 -19.04 -20.04 -19.77]
 [-21.01 -19.24 -15.65  -9.54]
 [-21.42 -18.69 -14.07   0.  ]]


### Explanation of the TD(0) Learning Code

This code implements a basic reinforcement learning example using the TD(0) (one-step temporal-difference) prediction algorithm in a 4x4 GridWorld environment. Here's a breakdown:

1.  **GridWorld Class**: This class defines the environment. It's a 4x4 grid where states are represented by integers from 0 to 15. States 0 (top-left) and 15 (bottom-right) are terminal states. The `step` method takes a current state and an action (UP, DOWN, LEFT, RIGHT) and returns the next state, a reward (which is -1 for every non-terminal step), and whether the next state is terminal. The `reset` method initializes the agent to a random non-terminal starting state.

2.  **`td_zero_learning` Function**: This function implements the TD(0) algorithm to *evaluate* the value of each state (`V(s)`) under a given policy. In this case, the policy is a random walk, meaning that at each step the agent chooses one of the four actions uniformly at random. The core of TD(0) is its update rule: `V(s) = V(s) + alpha * [ R + gamma*V(s') - V(s) ]`. It updates the estimated value of the current state (`V[state]`) based on the immediate reward (`R`), the discounted value of the next state (`gamma*V[next_state]`, taken to be 0 when the next state is terminal), and its current prediction (`V[state]`).

3.  **Execution Block**: The `if __name__ == "__main__":` block creates an instance of the `GridWorld` environment, runs the `td_zero_learning` function for 5000 episodes, and then prints the learned state-value function `V(s)` for all states, reshaped into a 4x4 grid for better visualization.