In [1]:
import numpy as np
import random

class GridEnvironment:
    def __init__(self, gamma=0.9):
        self.grid_size = (3, 3)  # 3x3 grid
        self.states = [(i, j) for i in range(self.grid_size[0]) for j in range(self.grid_size[1])]
        self.actions = ['UP', 'DOWN', 'LEFT', 'RIGHT']

        # Initialize rewards
        self.rewards = {(0, 2): 10, (2, 0): -10}  # Specific rewards for terminal states
        for x in range(self.grid_size[0]):
            for y in range(self.grid_size[1]):
                if (x, y) not in [(0, 2), (2, 0)]:
                    self.rewards[(x, y)] = -0.01  # Life reward

        self.gamma = gamma
        self.value_table = np.zeros(len(self.states))
        self.policy = np.zeros(len(self.states), dtype=int)

    def get_state_index(self, state):
        return state[0] * self.grid_size[1] + state[1]

    def get_next_state(self, state, action):
        prob_success = 0.8
        prob_left = 0.1
        prob_right = 0.1

        if action == 'UP':
            next_states = [(max(state[0] - 1, 0), state[1]), (state[0], state[1]), (min(state[0] + 1, 2), state[1])]
        elif action == 'DOWN':
            next_states = [(min(state[0] + 1, 2), state[1]), (state[0], state[1]), (max(state[0] - 1, 0), state[1])]
        elif action == 'LEFT':
            next_states = [(state[0], max(state[1] - 1, 0)), (state[0], state[1]), (state[0], min(state[1] + 1, 2))]
        elif action == 'RIGHT':
            next_states = [(state[0], min(state[1] + 1, 2)), (state[0], state[1]), (state[0], max(state[1] - 1, 0))]

        probabilities = [prob_success, prob_left / 2, prob_right / 2]
        next_state_probs = {state: 0 for state in self.states}

        for next_state, prob in zip(next_states, probabilities):
            next_state_probs[next_state] += prob

        return next_state_probs

    def value_iteration(self, theta=1e-6):
        while True:
            delta = 0
            for state in self.states:
                v = self.value_table[self.get_state_index(state)]
                value_action = []
                for action in self.actions:
                    next_state_probs = self.get_next_state(state, action)
                    value = sum(prob * (self.rewards.get(next_state, 0) + self.gamma * self.value_table[self.get_state_index(next_state)])
                                for next_state, prob in next_state_probs.items())
                    value_action.append(value)
                self.value_table[self.get_state_index(state)] = max(value_action)
                delta = max(delta, abs(v - self.value_table[self.get_state_index(state)]))
            if delta < theta:
                break

    def extract_policy(self):
        for state in self.states:
            q_values = []
            for action in self.actions:
                next_state_probs = self.get_next_state(state, action)
                q_value = sum(prob * (self.rewards.get(next_state, 0) + self.gamma * self.value_table[self.get_state_index(next_state)])
                              for next_state, prob in next_state_probs.items())
                q_values.append(q_value)
            self.policy[self.get_state_index(state)] = np.argmax(q_values)

    def simulate(self, episodes=100):
        total_rewards = []
        for _ in range(episodes):
            state = random.choice(self.states)  # Random starting state
            total_reward = 0
            while True:
                action = self.actions[self.policy[self.get_state_index(state)]]
                next_state_probs = self.get_next_state(state, action)
                next_states = list(next_state_probs.keys())
                probabilities = list(next_state_probs.values())
                state = random.choices(next_states, probabilities)[0]
                total_reward += self.rewards.get(state, 0)
                if state in [(0, 2), (2, 0)]:  # Terminal states
                    break
            total_rewards.append(total_reward)
        return np.mean(total_rewards)

if __name__ == "__main__":
    gamma_value = 0.9
    environment = GridEnvironment(gamma=gamma_value)

    # Perform Value Iteration
    environment.value_iteration()

    # Extract the optimal policy
    environment.extract_policy()

    # Simulate episodes and evaluate average reward
    average_reward = environment.simulate(episodes=100)

    print("Optimal Policy:", environment.policy)
    print("Average Reward over 100 episodes:", average_reward)

Optimal Policy: [3 3 0 3 3 0 3 0 0]
Average Reward over 100 episodes: 9.787799999999999
