In [4]:
import numpy as np

# GridWorld configuration
N = 4             # side length of the square grid (N x N cells)
gamma = 1.0       # discount factor; 1.0 means future rewards are not discounted
threshold = 1e-4  # sweeps stop once the largest value change falls below this

# (row delta, column delta) of every available move
actions = [
    (-1, 0),  # Up
    (1, 0),   # Down
    (0, -1),  # Left
    (0, 1),   # Right
]

def is_terminal(state):
    """Return True when `state` is the bottom-right corner cell (N-1, N-1)."""
    goal = (N - 1, N - 1)
    return state == goal

def get_next_states(state):
    """Return the successor cell for each move in `actions`, in the same order.

    A move that would leave the N x N grid keeps the agent where it is, so the
    result always has one entry per action. Only states are returned; no
    transition probabilities are included.
    """
    successors = []
    for d_row, d_col in actions:
        row, col = state[0] + d_row, state[1] + d_col
        inside_grid = (0 <= row < N) and (0 <= col < N)
        # Off-grid moves bounce back to the current cell
        successors.append((row, col) if inside_grid else state)
    return successors

def value_iteration():
    """Sweep the grid until the state values stop changing and return them.

    Values start at 0. On every sweep each non-terminal cell is set to the
    mean, over its four successor cells, of ``-1 + gamma * V[successor]``,
    where V holds the values from the previous sweep (synchronous update).
    The terminal cell is never updated and therefore stays at 0. Sweeping
    stops once the largest absolute change in a sweep is below `threshold`.

    NOTE: despite the name, the backup averages over the actions instead of
    taking the maximum, so the result is the value of the uniform random
    policy, not the optimal value function.

    Returns:
        An (N, N) numpy array of state values.
    """
    values = np.zeros((N, N))

    converged = False
    while not converged:
        updated = values.copy()  # written this sweep; `values` stays read-only
        largest_change = 0

        # np.ndindex walks the cells in row-major order as (row, col) tuples
        for cell in np.ndindex(N, N):
            if is_terminal(cell):
                continue

            # One-step backup for each action: step reward -1 plus successor value
            backups = [-1 + gamma * values[nxt] for nxt in get_next_states(cell)]

            updated[cell] = np.mean(backups)  # all four actions equally likely
            largest_change = max(largest_change, abs(updated[cell] - values[cell]))

        values = updated
        converged = largest_change < threshold

    return values

# Run the sweeps to convergence and print the resulting N x N table of state values
final_values = value_iteration()
print(final_values)

[[-59.42367735 -57.42387125 -54.2813141  -51.71012579]
 [-57.42387125 -54.56699476 -49.71029394 -45.13926711]
 [-54.2813141  -49.71029394 -40.85391609 -29.99766609]
 [-51.71012579 -45.13926711 -29.99766609   0.        ]]
