In [16]:
# This code is written by Hemanth Chebiyam.
# Email: hc3746@rit.edu
import numpy as np

In [17]:
class Gridworld:
    def __init__(self):
        self.grid_size = 5
        self.obstacles = [(2, 2), (3, 2)]
        self.water_state = (4, 2)
        self.goal_state = (4, 4)
        self.actions = ['AU', 'AD', 'AL', 'AR']
        self.transition_probs = {'AU': {'AU': 0.8, 'AD': 0.05, 'AL': 0.05, 'AR': 0.1},
                                 'AD': {'AU': 0.05, 'AD': 0.8, 'AL': 0.1, 'AR': 0.05},
                                 'AL': {'AU': 0.05, 'AD': 0.1, 'AL': 0.8, 'AR': 0.05},
                                 'AR': {'AU': 0.1, 'AD': 0.05, 'AL': 0.05, 'AR': 0.8}}
        self.rewards = np.zeros((self.grid_size, self.grid_size))
        self.rewards[self.water_state] = -10
        self.rewards[self.goal_state] = 10

    def is_valid_state(self, state):
        x, y = state
        return 0 <= x < self.grid_size and 0 <= y < self.grid_size and state not in self.obstacles

    def transition(self, state, action):
        if not self.is_valid_state(state):
            return state

        if np.random.rand() < 0.1:  # Temporary break
            return state

        if np.random.rand() < 0.05:  # Confusion - veer right
            action = self._veer_right(action)
        elif np.random.rand() < 0.05:  # Confusion - veer left
            action = self._veer_left(action)

        next_state = self._move(state, action)
        return next_state if self.is_valid_state(next_state) else state

    def _move(self, state, action):
        x, y = state
        if action == 'AU':
            return x - 1, y
        elif action == 'AD':
            return x + 1, y
        elif action == 'AL':
            return x, y - 1
        elif action == 'AR':
            return x, y + 1

    def _veer_right(self, action):
        if action == 'AU':
            return 'AR'
        elif action == 'AD':
            return 'AL'
        elif action == 'AL':
            return 'AD'
        elif action == 'AR':
            return 'AU'


    def _veer_left(self, action):
        if action == 'AU':
            return 'AL'
        elif action == 'AD':
            return 'AR'
        elif action == 'AL':
            return 'AU'
        elif action == 'AR':
            return 'AD'



In [18]:
class Agent:
    def __init__(self, gridworld):
        self.gridworld = gridworld

    def uniform_random_policy(self, state):
        return np.random.choice(self.gridworld.actions)

    def discounted_return(self, episode, gamma=0.9):
        discounted_rewards = [step[2] * (gamma ** i) for i, step in enumerate(episode)]
        return sum(discounted_rewards)

    def play_episode(self, policy):
        episode = []
        state = (0, 0)
        while state != self.gridworld.goal_state:
            action = policy(state)
            next_state = self.gridworld.transition(state, action)
            reward = self.gridworld.rewards[next_state]
            episode.append((state, action, reward))
            state = next_state
        return episode



In [19]:
def evaluate_policy(policy, num_episodes):
    agent = Agent(gridworld)
    returns = []
    for _ in range(num_episodes):
        episode = agent.play_episode(policy)
        returns.append(agent.discounted_return(episode))
    return returns



In [20]:
def value_iteration(gridworld, gamma=0.9, epsilon=1e-6):
    V = np.zeros((gridworld.grid_size, gridworld.grid_size))
    while True:
        delta = 0
        for i in range(gridworld.grid_size):
            for j in range(gridworld.grid_size):
                if (i, j) == gridworld.goal_state:
                    continue
                max_val = -np.inf
                for action in gridworld.actions:
                    next_state = gridworld.transition((i, j), action)
                    reward = gridworld.rewards[next_state]
                    val = reward + gamma * V[next_state[0], next_state[1]]
                    max_val = max(max_val, val)
                delta = max(delta, np.abs(max_val - V[i, j]))
                V[i, j] = max_val
        if delta < epsilon:
            break

    optimal_policy = {}
    for i in range(gridworld.grid_size):
        for j in range(gridworld.grid_size):
            if (i, j) == gridworld.goal_state:
                continue
            max_val = -np.inf
            best_action = None
            for action in gridworld.actions:
                next_state = gridworld.transition((i, j), action)
                reward = gridworld.rewards[next_state]
                val = reward + gamma * V[next_state[0], next_state[1]]
                if val > max_val:
                    max_val = val
                    best_action = action
            optimal_policy[(i, j)] = best_action

    return optimal_policy



In [21]:
if __name__ == "__main__":
    gridworld = Gridworld()

    # Task 1: Uniform random policy
    agent = Agent(gridworld)
    random_returns = evaluate_policy(agent.uniform_random_policy, 10000)
    print("Task 1 Results:")
    print("Mean:", np.mean(random_returns))
    print("Standard Deviation:", np.std(random_returns))
    print("Maximum:", np.max(random_returns))
    print("Minimum:", np.min(random_returns))

    

Task 1 Results:
Mean: -0.6063058120673711
Standard Deviation: 2.295495381218213
Maximum: 4.304672100000001
Minimum: -47.01162157647506


In [22]:
if __name__ == "__main__":
    gridworld = Gridworld()
    # Task 2: Optimal policy using value iteration
    optimal_policy = value_iteration(gridworld)
    print("\nOptimal Policy:")
    for i in range(gridworld.grid_size):
        for j in range(gridworld.grid_size):
            if (i, j) in optimal_policy:
                print(optimal_policy[(i, j)], end="\t")
            else:
                print("-", end="\t")
        print()

    


Optimal Policy:
AD	AD	AD	AD	AU	
AR	AR	AR	AD	AD	
AU	AU	AU	AD	AD	
AU	AR	AU	AD	AD	
AU	AU	AR	AR	-	


In [23]:
if __name__ == "__main__":
    gridworld = Gridworld()
    # Task 3: Run optimal policy and compare results
    optimal_returns = evaluate_policy(lambda s: optimal_policy[s], 10000)
    print("\nTask 3 Results:")
    print("Mean:", np.mean(optimal_returns))
    print("Standard Deviation:", np.std(optimal_returns))
    print("Maximum:", np.max(optimal_returns))
    print("Minimum:", np.min(optimal_returns))


Task 3 Results:
Mean: 3.921796074201619
Standard Deviation: 0.712729491061633
Maximum: 4.7829690000000005
Minimum: 0.033813919135227306


In [24]:
def visualize_path(gridworld, policy):
    state = (0, 0)
    total_reward = 0
    print("Starting at:", state)
    while state != gridworld.goal_state:
        action = policy[state]
        next_state = gridworld.transition(state, action)
        reward = gridworld.rewards[next_state]
        total_reward += reward
        print("Action:", action, "| Next state:", next_state, "| Reward:", reward, "| Total Reward:", total_reward)
        state = next_state
    print("Reached the goal state with a total reward of:", total_reward)

In [25]:
if __name__ == "__main__":
    gridworld = Gridworld()
    # Visualize path of the agent following the optimal policy
    print("\nFollowing optimal policy to reach the goal state:")
    visualize_path(gridworld, optimal_policy)


Following optimal policy to reach the goal state:
Starting at: (0, 0)
Action: AD | Next state: (0, 0) | Reward: 0.0 | Total Reward: 0.0
Action: AD | Next state: (1, 0) | Reward: 0.0 | Total Reward: 0.0
Action: AR | Next state: (1, 1) | Reward: 0.0 | Total Reward: 0.0
Action: AR | Next state: (1, 2) | Reward: 0.0 | Total Reward: 0.0
Action: AR | Next state: (1, 3) | Reward: 0.0 | Total Reward: 0.0
Action: AD | Next state: (2, 3) | Reward: 0.0 | Total Reward: 0.0
Action: AD | Next state: (3, 3) | Reward: 0.0 | Total Reward: 0.0
Action: AD | Next state: (3, 4) | Reward: 0.0 | Total Reward: 0.0
Action: AD | Next state: (4, 4) | Reward: 10.0 | Total Reward: 10.0
Reached the goal state with a total reward of: 10.0
