In [None]:
import numpy as np
import random

# Define a 4x4 Gridworld
class Gridworld:
    """Deterministic rectangular gridworld with one absorbing terminal cell.

    States are ``(row, col)`` tuples with ``0 <= row < rows`` and
    ``0 <= col < cols``.  Four actions move the agent one cell; a move that
    would leave the grid keeps the agent on the edge cell it started from.

    Attributes:
        rows, cols: Grid dimensions.
        n_states: ``rows * cols``.
        actions: Action names, in a fixed order.
        n_actions: ``len(actions)``.
        state2idx: Maps each ``(row, col)`` to its row-major flat index.
        idx2state: Inverse of ``state2idx``.
        terminal_state: The absorbing cell; always one of the grid cells.
        rewards: Maps each cell to a scalar: -1 for every non-terminal cell
            and 0 for the terminal cell.
        transition_probs: Maps ``(state, action)`` to a list of
            ``(probability, next_state)`` pairs.

    Args:
        rows: Number of grid rows (default 4).
        cols: Number of grid columns (default 4).
        terminal_state: ``(row, col)`` of the absorbing cell.  Defaults to
            the bottom-right corner ``(rows - 1, cols - 1)``.

    Raises:
        ValueError: If ``terminal_state`` is not a cell of the grid.
    """

    def __init__(self, rows=4, cols=4, terminal_state=None):
        self.rows = rows
        self.cols = cols
        self.n_states = self.rows * self.cols
        self.actions = ['up', 'down', 'left', 'right']
        self.n_actions = len(self.actions)
        self.state2idx = {(r, c): r * self.cols + c for r in range(self.rows) for c in range(self.cols)}
        self.idx2state = {idx: state for state, idx in self.state2idx.items()}

        # Cells are 0-indexed, so the bottom-right corner is (rows - 1, cols - 1).
        # A terminal outside the grid can never be entered, which would leave
        # the environment without any absorbing state.
        if terminal_state is None:
            terminal_state = (self.rows - 1, self.cols - 1)
        else:
            terminal_state = tuple(terminal_state)
        if terminal_state not in self.state2idx:
            raise ValueError(
                "terminal_state %r is not a cell of the %dx%d grid"
                % (terminal_state, self.rows, self.cols)
            )
        self.terminal_state = terminal_state

        self.rewards = {state: -1 for state in self.state2idx if state != self.terminal_state}
        self.rewards[self.terminal_state] = 0
        self.transition_probs = self._compute_transition_probs()

    def _compute_transition_probs(self):
        """Return the full ``(state, action) -> [(prob, next_state)]`` table."""
        transition_probs = {}
        for state in self.state2idx:
            for action in self.actions:
                transition_probs[(state, action)] = self.get_transition_prob(state, action)
        return transition_probs

    def get_transition_prob(self, state, action):
        """Return the ``[(probability, next_state)]`` list for one state-action pair.

        Transitions are deterministic, so the list always has a single entry
        with probability 1.0.  The terminal state is absorbing: every action
        taken there leads back to it.
        """
        if state == self.terminal_state:
            return [(1.0, state)]

        next_state = self._get_next_state(state, action)
        return [(1.0, next_state)]

    def _get_next_state(self, state, action):
        """Return the cell reached from ``state`` by ``action``, clamped to the grid.

        An action name that is not one of the four known moves leaves the
        state unchanged.
        """
        row, col = state
        if action == 'up':
            row = max(0, row - 1)
        elif action == 'down':
            row = min(self.rows - 1, row + 1)
        elif action == 'left':
            col = max(0, col - 1)
        elif action == 'right':
            col = min(self.cols - 1, col + 1)
        return (row, col)


def random_policy(env):
    """Build a deterministic policy by sampling one action per state.

    Each non-terminal state in ``env.state2idx`` is assigned a single action
    drawn uniformly from ``env.actions`` with ``random.choice``.  The result
    is a fixed ``state -> action`` mapping, not a per-step stochastic policy;
    the terminal state gets no entry.
    """
    chosen = {}
    for cell in env.state2idx:
        if cell == env.terminal_state:
            continue
        chosen[cell] = random.choice(env.actions)
    return chosen

def policy_evaluation(env, policy, gamma=1, theta=1e-6, max_iterations=100):
    """Iteratively evaluate a deterministic policy and return its state values.

    Values start at zero and are updated in place, so states visited later in
    a sweep already see the values written earlier in the same sweep.  The
    reward for a transition is looked up by the successor state in
    ``env.rewards``.  The terminal state is never updated and keeps value 0.

    Args:
        env: Environment exposing ``n_states``, ``state2idx``,
            ``terminal_state``, ``rewards`` and ``transition_probs``.
        policy: Mapping from each non-terminal state to the action taken there.
        gamma: Discount factor.
        theta: Sweeping stops once the largest absolute value change in a
            sweep falls below this threshold.
        max_iterations: Upper bound on the number of sweeps.

    Returns:
        A 1-D NumPy array of length ``env.n_states`` indexed by
        ``env.state2idx``.
    """
    values = np.zeros(env.n_states)
    terminal = env.terminal_state

    for _sweep in range(max_iterations):
        snapshot = values.copy()
        for cell, flat_idx in env.state2idx.items():
            if cell == terminal:
                continue
            chosen_action = policy[cell]
            # Expected one-step return under the policy's action.
            backup = 0
            for prob, successor in env.transition_probs[(cell, chosen_action)]:
                backup += prob * (env.rewards[successor] + gamma * values[env.state2idx[successor]])
            values[flat_idx] = backup
        # Converged when no state moved by theta or more during this sweep.
        if np.abs(snapshot - values).max() < theta:
            break
    return values

# Build the environment with its default 4x4 layout.
env = Gridworld()

# Pick one action at random for every non-terminal state (a fixed mapping).
policy = random_policy(env)

# Estimate the value of each state under that policy, using the defaults
# gamma=1, theta=1e-6 and at most 100 sweeps.
V = policy_evaluation(env, policy)

# Arrange the flat value vector as a rows x cols grid and display it.
V_grid = V.reshape(env.rows, env.cols)
print(V_grid)

[[-100. -101. -102. -103.]
 [-101. -198. -199. -100.]
 [-100. -199. -200. -100.]
 [-199. -200. -201. -100.]]
