In [1]:
import numpy as np

class MDPGridWorld:
    """Deterministic 4x4 grid world solved with value iteration.

    States are the integers ``0 .. grid_size**2 - 1`` in row-major order, so
    state ``s`` sits at ``(row, col) == divmod(s, grid_size)``.  Every action
    taken from a non-terminal state costs ``-1``; terminal states are
    absorbing and yield a reward of ``0``.
    """

    def __init__(self):
        self.grid_size = 4
        # For "Rollout from (0,0)", we treat (0,0) as Start
        # and (3,3) [index 15] as the ONLY Goal/Terminal state.
        self.terminal_states = [15]
        # Order matters: ties between equally good actions are broken in
        # favour of the action listed first.
        self.actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']
        self.gamma = 1.0  # Discount factor

    def get_next_state_reward(self, state, action):
        """Return ``(next_state, reward)`` for taking ``action`` in ``state``.

        Moves that would leave the grid are clamped, so the agent stays in
        place.  A terminal state is absorbing: it returns itself with reward
        0.  An action name that is not one of ``self.actions`` moves nowhere
        but still costs -1.
        """
        if state in self.terminal_states:
            return state, 0

        row, col = divmod(state, self.grid_size)
        last = self.grid_size - 1

        # Move logic, clamped to the grid borders.
        if action == 'UP':
            row = max(row - 1, 0)
        elif action == 'DOWN':
            row = min(row + 1, last)
        elif action == 'LEFT':
            col = max(col - 1, 0)
        elif action == 'RIGHT':
            col = min(col + 1, last)

        next_state = row * self.grid_size + col
        reward = -1  # Cost per step
        return next_state, reward

    def value_iteration(self):
        """1. PLAN: Calculate V* to find the best policy.

        Performs synchronous Bellman optimality backups (each sweep reads
        only the previous sweep's values) until the largest change in any
        state value drops below ``theta``.

        Returns:
            A 1-D NumPy array holding one value per state.
        """
        n_states = self.grid_size * self.grid_size
        V = np.zeros(n_states)
        theta = 1e-4  # Convergence threshold on the largest update

        while True:
            delta = 0
            V_new = np.copy(V)
            for s in range(n_states):
                if s in self.terminal_states:
                    continue  # Terminal values stay at 0

                outcomes = (self.get_next_state_reward(s, a) for a in self.actions)
                new_val = max(r + self.gamma * V[ns] for ns, r in outcomes)
                V_new[s] = new_val
                delta = max(delta, abs(new_val - V[s]))
            V = V_new
            if delta < theta:
                break
        return V

    def _greedy_action(self, state, V):
        """Return the action maximising ``r + gamma * V[next_state]``."""
        def action_value(action):
            ns, r = self.get_next_state_reward(state, action)
            return r + self.gamma * V[ns]

        # max() returns the first maximal item, so ties go to the action
        # that appears earliest in self.actions.
        return max(self.actions, key=action_value)

    def rollout(self, start_state, V, max_steps=20):
        """2. ACT: Execute the greedy policy from the start state.

        Args:
            start_state: Index of the state to start from.
            V: State values, indexable by state (e.g. from value_iteration).
            max_steps: Safety limit; the rollout is abandoned once more than
                this many steps have been taken without reaching a terminal
                state.

        Returns:
            The list of visited states, starting with ``start_state``.

        Raises:
            ValueError: If ``start_state`` is not a valid state index.
        """
        n_states = self.grid_size * self.grid_size
        if not 0 <= start_state < n_states:
            raise ValueError(
                f"start_state must be in [0, {n_states - 1}], got {start_state}"
            )

        # Report the real coordinates of the start state, not a fixed (0,0).
        row, col = divmod(start_state, self.grid_size)
        print(f"\n--- Rolling out Optimal Policy from State {start_state} ({row},{col}) ---")

        curr_state = start_state
        steps = 0
        path = [curr_state]

        while curr_state not in self.terminal_states:
            if steps > max_steps:  # Safety break
                print("Stuck in loop!")
                break

            row, col = divmod(curr_state, self.grid_size)

            # Find the best action using V, then execute the move.
            best_action = self._greedy_action(curr_state, V)
            next_state, _ = self.get_next_state_reward(curr_state, best_action)

            print(f"Step {steps+1}: At {curr_state} ({row},{col}) -> Action: {best_action} -> New State: {next_state}")

            curr_state = next_state
            path.append(curr_state)
            steps += 1

        # Only claim success when the rollout actually ended in a terminal
        # state; an aborted rollout must not be reported as reaching the goal.
        if curr_state in self.terminal_states:
            print(f"\nGoal Reached at State {curr_state}!")
        else:
            print(f"\nGoal NOT reached: stopped at State {curr_state} after {steps} steps.")
        print(f"Total Path: {path}")
        return path

# --- Main Execution ---
if __name__ == "__main__":
    # Planning phase: announce the work, build the environment, and run
    # value iteration to obtain one value per state.
    print("Computing Optimal Policy...")
    world = MDPGridWorld()
    optimal_values = world.value_iteration()

    # Acting phase: simulate the greedy policy starting from state 0,
    # the top-left cell (0,0).
    world.rollout(0, V=optimal_values)

Computing Optimal Policy...

--- Rolling out Optimal Policy from State 0 (0,0) ---
Step 1: At 0 (0,0) -> Action: DOWN -> New State: 4
Step 2: At 4 (1,0) -> Action: DOWN -> New State: 8
Step 3: At 8 (2,0) -> Action: DOWN -> New State: 12
Step 4: At 12 (3,0) -> Action: RIGHT -> New State: 13
Step 5: At 13 (3,1) -> Action: RIGHT -> New State: 14
Step 6: At 14 (3,2) -> Action: RIGHT -> New State: 15

Goal Reached at State 15!
Total Path: [0, 4, 8, 12, 13, 14, 15]


This code defines an `MDPGridWorld` class that models a simple, deterministic 4x4 grid world environment. It uses Value Iteration, a classic dynamic-programming (planning) algorithm from reinforcement learning, to find an optimal path from the starting point (0,0) to the goal state (3,3).

1.  **`MDPGridWorld` Class**: This class sets up the grid environment. It defines the grid size (4x4), the terminal state (index 15, which is (3,3) in a 0-indexed grid), possible actions ('UP', 'DOWN', 'LEFT', 'RIGHT'), and a discount factor (`gamma`).

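2.  **`get_next_state_reward` Method**: This method simulates moving within the grid. Given a current `state` and an `action`, it calculates the `next_state` and the `reward`. Moves that would leave the grid are clamped, so the agent simply stays in place. Every action taken from a non-terminal state incurs a reward of -1 (a cost per step), including the move that enters the goal; the terminal state is absorbing, so any action taken there returns the same state with a reward of 0.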

3.  **`value_iteration` Method**: This is the 'planning' phase. It implements the Value Iteration algorithm to compute the optimal value function (V*). This function `V*` tells us the maximum expected future reward from each state, assuming we act optimally. It iteratively updates the value of each state until the values converge, meaning they don't change significantly anymore.

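4.  **`rollout` Method**: This is the 'acting' phase. Once the optimal value function `V` is computed, this method simulates an agent navigating the grid from a `start_state`. At each step, the agent chooses the action that maximizes the immediate reward plus the discounted value of the resulting state (`r + gamma * V[next_state]`), with ties going to the action listed first; this is the greedy, and therefore optimal, policy derived from `V`. It prints each step and the full path, stopping when the terminal state is reached or once more than 20 steps have been taken (a safety limit against loops).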

5.  **Main Execution Block**: The code first creates an instance of `MDPGridWorld`, then calls `value_iteration()` to calculate the `optimal_values`. Finally, it calls `rollout(start_state=0, V=optimal_values)` to demonstrate the optimal path an agent would take starting from state 0 (which is (0,0)).

In essence, the code first 'learns' the best way to navigate the grid by calculating the value of each state, and then 'shows' how an agent would use that knowledge to reach the goal efficiently.