In [None]:
import numpy as np

class GridWorld:
    """Deterministic rectangular grid MDP with a single rewarding goal cell.

    States are ``(row, col)`` tuples. Every action moves the agent one cell
    in the named direction; moves that would leave the grid are clamped to
    the border, and moves into an obstacle cell leave the agent where it is.

    Attributes:
        rows, cols: Grid dimensions.
        start: Start cell as given by the caller (stored only; ``transitions``
            does not read it).
        goal: Cell whose entry yields a reward of 1.
        obstacles: Iterable of impassable ``(row, col)`` cells (may be empty).
        discount_factor: Discount applied to future rewards, fixed at 0.9.
        actions: The four supported action names.
    """

    def __init__(self, rows, cols, start, goal, obstacles):
        """Store the grid layout and the fixed discount factor / action set."""
        self.rows = rows
        self.cols = cols
        self.start = start
        self.goal = goal
        self.obstacles = obstacles
        self.discount_factor = 0.9
        self.actions = ["up", "down", "left", "right"]

    def transitions(self, state, action):
        """Return the outcomes of taking ``action`` in ``state``.

        Args:
            state: Current ``(row, col)`` cell.
            action: One of ``"up"``, ``"down"``, ``"left"``, ``"right"``.

        Returns:
            A one-element list ``[(next_state, reward, probability)]``. The
            probability is always ``1.0`` and the reward is 1 when
            ``next_state`` equals the goal, otherwise 0.

        Raises:
            ValueError: If ``action`` is not one of the four supported names.
        """
        row, col = state

        if action == "up":
            next_state = (max(row - 1, 0), col)
        elif action == "down":
            next_state = (min(row + 1, self.rows - 1), col)
        elif action == "left":
            next_state = (row, max(col - 1, 0))
        elif action == "right":
            next_state = (row, min(col + 1, self.cols - 1))
        else:
            # Without this branch next_state would be unbound and the failure
            # would surface as an opaque UnboundLocalError further down.
            raise ValueError(f"Unknown action: {action!r}")

        # Obstacles are impassable: a blocked move keeps the agent in place.
        # tuple() lets obstacles given as lists (e.g. [1, 1]) match too.
        if any(tuple(cell) == next_state for cell in (self.obstacles or ())):
            next_state = (row, col)

        reward = 1 if next_state == self.goal else 0
        return [(next_state, reward, 1.0)]

def value_iteration(grid_world, epsilon=1e-6):
    """Estimate the optimal state values of ``grid_world``.

    The value table is swept cell by cell in row-major order. Each update is
    written into the table immediately, so cells visited later in the same
    sweep already see the new numbers. The goal cell is never updated and
    therefore keeps its initial value of 0.

    Args:
        grid_world: Object exposing ``rows``, ``cols``, ``goal``, ``actions``,
            ``discount_factor`` and ``transitions(state, action)`` yielding
            ``(next_state, reward, probability)`` triples.
        epsilon: Sweeping stops once the largest change made during a full
            sweep is strictly below this threshold.

    Returns:
        A ``(rows, cols)`` NumPy array of state values.
    """
    gamma = grid_world.discount_factor
    values = np.zeros((grid_world.rows, grid_world.cols))

    def backup(cell, action):
        # Expected one-step return of taking `action` from `cell`.
        return sum(
            p * (r + gamma * values[nxt_row, nxt_col])
            for (nxt_row, nxt_col), r, p in grid_world.transitions(cell, action)
        )

    converged = False
    while not converged:
        largest_change = 0
        # np.ndindex walks the indices in the same row-major order as nested
        # range() loops over rows and then columns.
        for cell in np.ndindex(*values.shape):
            if cell == grid_world.goal:
                continue

            previous = values[cell]
            values[cell] = max(backup(cell, act) for act in grid_world.actions)
            largest_change = max(largest_change, abs(previous - values[cell]))

        converged = largest_change < epsilon

    return values

def main():
    """Solve the 3x4 demo grid (goal in the top-right corner) and print the result."""
    demo_world = GridWorld(rows=3, cols=4, start=(2, 0), goal=(0, 3), obstacles=[])
    values = value_iteration(demo_world)

    # One print call with a newline separator emits the heading and the
    # table on separate lines, exactly as two consecutive prints would.
    print("Optimal Value Function:", values, sep="\n")

# Entry point: run the demo only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


In [9]:
import numpy as np

class GridWorld:
    """Deterministic rectangular grid MDP with a single rewarding goal cell.

    States are ``(row, col)`` tuples. Every action moves the agent one cell
    in the named direction; moves that would leave the grid are clamped to
    the border, and moves into an obstacle cell leave the agent where it is.

    Attributes:
        rows, cols: Grid dimensions.
        start: Start cell as given by the caller (stored only; ``transitions``
            does not read it).
        goal: Cell whose entry yields a reward of 1.
        obstacles: Iterable of impassable ``(row, col)`` cells (may be empty).
        discount_factor: Discount applied to future rewards, fixed at 0.9.
        actions: The four supported action names.
    """

    def __init__(self, rows, cols, start, goal, obstacles):
        """Store the grid layout and the fixed discount factor / action set."""
        self.rows = rows
        self.cols = cols
        self.start = start
        self.goal = goal
        self.obstacles = obstacles
        self.discount_factor = 0.9
        self.actions = ["up", "down", "left", "right"]

    def transitions(self, state, action):
        """Return the outcomes of taking ``action`` in ``state``.

        Args:
            state: Current ``(row, col)`` cell.
            action: One of ``"up"``, ``"down"``, ``"left"``, ``"right"``.

        Returns:
            A one-element list ``[(next_state, reward, probability)]``. The
            probability is always ``1.0`` and the reward is 1 when
            ``next_state`` equals the goal, otherwise 0.

        Raises:
            ValueError: If ``action`` is not one of the four supported names.
        """
        row, col = state

        if action == "up":
            next_state = (max(row - 1, 0), col)
        elif action == "down":
            next_state = (min(row + 1, self.rows - 1), col)
        elif action == "left":
            next_state = (row, max(col - 1, 0))
        elif action == "right":
            next_state = (row, min(col + 1, self.cols - 1))
        else:
            # Without this branch next_state would be unbound and the failure
            # would surface as an opaque UnboundLocalError further down.
            raise ValueError(f"Unknown action: {action!r}")

        # Obstacles are impassable: a blocked move keeps the agent in place.
        # tuple() lets obstacles given as lists (e.g. [1, 1]) match too.
        if any(tuple(cell) == next_state for cell in (self.obstacles or ())):
            next_state = (row, col)

        reward = 1 if next_state == self.goal else 0
        return [(next_state, reward, 1.0)]

def value_iteration(grid_world, epsilon=1e-6, verbose=True):
    """Estimate the optimal state values of ``grid_world`` by value iteration.

    Cells are swept in row-major order and updated in place, so a cell
    visited later in a sweep already sees the values written earlier in that
    same sweep. The goal cell is skipped and keeps its initial value of 0.

    Args:
        grid_world: Object exposing ``rows``, ``cols``, ``goal``, ``actions``,
            ``discount_factor`` and ``transitions(state, action)`` yielding
            ``(next_state, reward, probability)`` triples.
        epsilon: Iteration stops once the largest change made during a full
            sweep is strictly below this threshold.
        verbose: When true (the default, matching the previous behaviour),
            print the value table after every sweep. Pass ``False`` to run
            silently.

    Returns:
        A ``(rows, cols)`` NumPy array of state values.
    """
    V = np.zeros((grid_world.rows, grid_world.cols))

    # Zero-based sweep counter, used only to label the progress output.
    iteration = 0

    while True:
        delta = 0
        for row in range(grid_world.rows):
            for col in range(grid_world.cols):
                if (row, col) == grid_world.goal:
                    continue

                v = V[row, col]
                # Best expected one-step return over all available actions.
                max_value = max(
                    sum(prob * (reward + grid_world.discount_factor * V[next_row, next_col])
                        for (next_row, next_col), reward, prob in grid_world.transitions((row, col), action))
                    for action in grid_world.actions
                )

                V[row, col] = max_value
                delta = max(delta, abs(v - V[row, col]))

        if verbose:
            # The table printed under "Iteration k" is the state after sweep k + 1.
            print(f"Iteration {iteration} - Value Function:")
            print(V)

        iteration += 1

        if delta < epsilon:
            break

    return V

def main():
    """Build the 3x4 demo grid, run value iteration on it and print the result."""
    layout = {"rows": 3, "cols": 4, "start": (2, 0), "goal": (0, 3), "obstacles": []}
    result = value_iteration(GridWorld(**layout))

    # Heading first, then the value table, each on its own print call.
    for item in ("Optimal Value Function:", result):
        print(item)

# Entry point: run the demo only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Iteration 0 - Value Function:
[[0.   0.   1.   0.  ]
 [0.   0.   0.9  1.  ]
 [0.   0.   0.81 0.9 ]]
Iteration 1 - Value Function:
[[0.    0.9   1.    0.   ]
 [0.    0.81  0.9   1.   ]
 [0.    0.729 0.81  0.9  ]]
Iteration 2 - Value Function:
[[0.81   0.9    1.     0.    ]
 [0.729  0.81   0.9    1.    ]
 [0.6561 0.729  0.81   0.9   ]]
Iteration 3 - Value Function:
[[0.81   0.9    1.     0.    ]
 [0.729  0.81   0.9    1.    ]
 [0.6561 0.729  0.81   0.9   ]]
Optimal Value Function:
[[0.81   0.9    1.     0.    ]
 [0.729  0.81   0.9    1.    ]
 [0.6561 0.729  0.81   0.9   ]]
