<a href="https://colab.research.google.com/github/jahnviasthana/Reinforcement-learning-codes/blob/main/value_iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np


# Grid-world dimensions, size of the action space, and discount factor.
num_rows = 4
num_cols = 4
num_states = num_rows * num_cols
num_actions = 4
gamma = 0.9

# Every cell (row, col) offers the same four action codes; get_next_state
# interprets them as 0 = up, 1 = down, 2 = left, 3 = right.
grid = {
    (row, col): [0, 1, 2, 3]
    for row in range(num_rows)
    for col in range(num_cols)
}

# Two opposite corners are terminal.
terminal_states = [(0, 0), (3, 3)]

# Reward is -1 in every cell, then overridden to 0 in the terminal cells.
rewards = dict.fromkeys(grid, -1)
for state in terminal_states:
    rewards[state] = 0

# Start from a uniformly random action in every cell (terminal cells included).
policy = {cell: np.random.choice(num_actions) for cell in grid}


def get_next_state(state, action, num_rows, num_cols):
    """Return the cell reached by taking ``action`` from ``state``.

    ``state`` is a ``(row, col)`` pair. Action codes are 0 = up, 1 = down,
    2 = left, 3 = right. A move that would cross the edge of the
    ``num_rows`` x ``num_cols`` grid is clamped, so the agent stays on the
    border cell. Any other action code leaves the position unchanged.
    """
    row, col = state
    if action == 0:  # Up: never above the first row
        return (max(0, row - 1), col)
    if action == 1:  # Down: never below the last row
        return (min(num_rows - 1, row + 1), col)
    if action == 2:  # Left: never before the first column
        return (row, max(0, col - 1))
    if action == 3:  # Right: never past the last column
        return (row, min(num_cols - 1, col + 1))
    return (row, col)


def value_iteration(num_states, num_actions, grid, rewards, gamma, epsilon=1e-6):
    """Estimate state values with in-place value iteration.

    Repeatedly sweeps over every state in ``grid`` and replaces its value with
    the best one-step backup ``rewards[state] + gamma * V[next_state]`` over
    the actions available in that state. Stops once the largest change seen
    in a full sweep is below ``epsilon``.

    Args:
        num_states: Unused; kept so existing callers keep working.
        num_actions: Size of the action space; action codes index an array of
            this length.
        grid: Mapping from state to the list of action codes available there.
        rewards: Mapping from state to the reward used in that state's backup.
        gamma: Discount factor applied to the successor state's value.
        epsilon: Stopping threshold on the largest per-sweep value change.

    Returns:
        Dict mapping every state in ``grid`` to its estimated value.

    Note:
        Successor states come from the module-level ``get_next_state`` called
        with the module-level ``num_rows`` and ``num_cols``.
    """
    value_function = {state: 0.0 for state in grid}
    while True:
        delta = 0
        for state in grid:
            actions = grid[state]
            if len(actions) == 0:
                # Nothing can be done here: the value stays at its initial 0
                # and contributes no change to this sweep.
                continue
            v = value_function[state]
            # Seed with -inf so that action codes not offered in this state
            # can never win the max; a zero seed would beat every real
            # (negative) backup.
            action_values = np.full(num_actions, -np.inf)
            for action in actions:
                next_state = get_next_state(state, action, num_rows, num_cols)
                action_values[action] = rewards[state] + gamma * value_function[next_state]
            # Updated in place, so later states in the same sweep already see it.
            value_function[state] = np.max(action_values)
            delta = max(delta, abs(v - value_function[state]))
        if delta < epsilon:
            break
    return value_function


def policy_improvement(value_function, num_states, num_actions, grid, rewards, gamma,
                       current_policy=None):
    """Build the greedy policy for ``value_function`` and report whether it changed.

    For every state the action with the highest one-step backup
    ``rewards[state] + gamma * value_function[next_state]`` is selected; ties
    go to the lowest action code.

    Args:
        value_function: Mapping from state to its estimated value.
        num_states: Unused; kept so existing callers keep working.
        num_actions: Size of the action space; action codes index an array of
            this length.
        grid: Mapping from state to the list of action codes available there.
        rewards: Mapping from state to the reward used in that state's backup.
        gamma: Discount factor applied to the successor state's value.
        current_policy: Policy to compare the greedy policy against. When
            omitted, the module-level ``policy`` is used, which is what the
            original implementation always did.

    Returns:
        ``(new_policy, policy_stable)`` where ``new_policy`` maps each state to
        its greedy action and ``policy_stable`` is True only if that action
        equals the compared policy's action in every state.

    Note:
        Successor states come from the module-level ``get_next_state`` called
        with the module-level ``num_rows`` and ``num_cols``.
    """
    baseline = policy if current_policy is None else current_policy
    new_policy = {state: 0 for state in grid}
    policy_stable = True

    for state in grid:
        # Seed with -inf so that action codes not offered in this state are
        # never picked; a zero seed would beat every real (negative) backup.
        # A state with no actions at all still resolves to action 0.
        action_values = np.full(num_actions, -np.inf)

        for action in grid[state]:
            next_state = get_next_state(state, action, num_rows, num_cols)
            action_values[action] = rewards[state] + gamma * value_function[next_state]

        new_policy[state] = np.argmax(action_values)

        if new_policy[state] != baseline[state]:
            policy_stable = False

    return new_policy, policy_stable


def _print_policy(policy_to_show):
    """Print a policy as a tab-separated grid, showing "T" for terminal states."""
    for i in range(num_rows):
        for j in range(num_cols):
            state = (i, j)
            if state in terminal_states:
                print("T", end="\t")
            else:
                print(policy_to_show[state], end="\t")
        print()


# value_iteration reads only the grid, rewards and gamma, never the current
# policy, so its result is the same on every pass; compute it once up front.
value_function = value_iteration(num_states, num_actions, grid, rewards, gamma)

# Replace the policy with the greedy one until a pass changes nothing.
iteration = 0
while True:
    print(f"Iteration {iteration} Policy:")
    _print_policy(policy)

    new_policy, policy_stable = policy_improvement(value_function, num_states, num_actions, grid, rewards, gamma)
    if policy_stable:
        # The greedy policy equals the current one, so `policy` is final.
        break
    policy = new_policy
    iteration += 1
    print("\n")

print("Optimal Policy:")
_print_policy(policy)


Iteration 0 Policy:
T	2	3	2	
2	1	1	2	
0	3	1	1	
0	3	0	T	


Iteration 1 Policy:
T	2	2	1	
0	0	0	1	
0	0	1	1	
0	3	3	T	
Optimal Policy:
T	2	2	1	
0	0	0	1	
0	0	1	1	
0	3	3	T	
