In [5]:
import numpy as np

# Grid dimensions. States are (row, col) tuples, matching V's (GRID_HEIGHT, GRID_WIDTH) layout.
GRID_WIDTH = 5
GRID_HEIGHT = 4

# Rewards received on each transition.
# NOTE: there are no obstacle cells on this grid; the 'obstacle' reward is
# applied when a move is clipped at the grid boundary and the agent stays put.
REWARDS = {
    'goal': 10,
    'obstacle': -10,
    'step': -1
}

# Action space as (d_row, d_col) offsets. Because states are (row, col),
# the order is: left, right, up, down.
ACTIONS = [(0, -1), (0, 1), (-1, 0), (1, 0)]

# Learning parameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epochs = 1000  # Number of episodes
SEED = 42  # Seed for the action sampler so a fresh kernel reproduces the same table


def run_td0(n_episodes, alpha, gamma, seed=None):
    """Estimate state values with tabular TD(0) under a uniform-random policy.

    Every episode starts in the top-left cell (0, 0) and ends when the agent
    reaches the bottom-right cell (GRID_HEIGHT - 1, GRID_WIDTH - 1). Moves that
    would leave the grid are clipped, so the agent stays where it is.

    Args:
        n_episodes: Number of episodes to simulate.
        alpha: Step size of the TD(0) update.
        gamma: Discount factor applied to the next state's value.
        seed: Seed passed to ``np.random.default_rng``; ``None`` draws fresh
            entropy and makes the run non-reproducible.

    Returns:
        Array of shape (GRID_HEIGHT, GRID_WIDTH) with the estimated values.
        The goal cell is never updated and therefore stays at 0.
    """
    rng = np.random.default_rng(seed)
    values = np.zeros((GRID_HEIGHT, GRID_WIDTH))
    start = (0, 0)
    goal = (GRID_HEIGHT - 1, GRID_WIDTH - 1)

    for _ in range(n_episodes):
        state = start

        while state != goal:
            # Uniform-random behaviour policy.
            d_row, d_col = ACTIONS[rng.integers(len(ACTIONS))]

            # Clip the move so the next state stays inside the grid.
            next_state = (
                min(max(state[0] + d_row, 0), GRID_HEIGHT - 1),
                min(max(state[1] + d_col, 0), GRID_WIDTH - 1),
            )

            if next_state == goal:
                reward = REWARDS['goal']
            elif next_state == state:
                # The move was clipped at a wall: the agent did not move.
                reward = REWARDS['obstacle']
            else:
                reward = REWARDS['step']

            # TD(0) update: V(s) += alpha * (r + gamma * V(s') - V(s))
            values[state] += alpha * (reward + gamma * values[next_state] - values[state])

            state = next_state

    return values


# Run TD(0) with the parameters above
V = run_td0(epochs, alpha, gamma, seed=SEED)

# Print the estimated value function.
# NOTE: actions are sampled uniformly at random, so V estimates the value of
# the random policy rather than an optimal value function. The label text is
# kept unchanged so it matches the recorded cell output.
print("Optimal Value Function:")
print(V)


Optimal Value Function:
[[-29.25897645 -25.26209284 -24.64218742 -26.67726014 -31.52834723]
 [-28.65016382 -22.40854823 -20.58982039 -19.71595839 -25.64160392]
 [-26.82229635 -21.29126284 -18.98481763 -14.89171007 -12.52016425]
 [-32.56750966 -24.6544248  -19.2109392   -6.57347466   0.        ]]
