<a href="https://colab.research.google.com/github/jahnviasthana/Reinforcement-learning-codes/blob/main/Policy_iteration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np


# Grid-world dimensions, action count and discount factor
num_rows, num_cols = 4, 4
num_states = num_rows * num_cols
num_actions = 4
gamma = 0.9


# Every (row, col) cell is a state; each state maps to its list of actions 0-3
grid = {
    (row, col): list(range(4))
    for row in range(num_rows)
    for col in range(num_cols)
}
terminal_states = [(0, 0), (3, 3)]

# Every state carries a reward of -1, except the terminal states which carry 0
rewards = dict.fromkeys(grid, -1)
rewards.update(dict.fromkeys(terminal_states, 0))

# Initialize a random policy: one action drawn per state, in grid order
policy = {}
for state in grid:
    policy[state] = np.random.choice(num_actions)

# Define a function to perform policy evaluation
def policy_evaluation(policy, num_states, num_actions, grid, rewards, gamma, epsilon=1e-6):
    """Evaluate a deterministic policy with iterative in-place sweeps.

    Each sweep applies ``V(s) = rewards[s] + gamma * V(s')`` to every state,
    where ``s'`` is the state reached by taking ``policy[s]`` from ``s``.
    Sweeps repeat until the largest change within one sweep is below
    ``epsilon``.

    Args:
        policy: Mapping from state ``(row, col)`` to the action taken there.
        num_states: Unused; kept for interface compatibility.
        num_actions: Unused; kept for interface compatibility.
        grid: Mapping whose keys are the ``(row, col)`` states to evaluate.
        rewards: Mapping from state to the reward used in that state's update.
        gamma: Discount factor applied to the successor state's value.
        epsilon: Convergence threshold on the largest per-sweep value change.

    Returns:
        A dict mapping every state in ``grid`` to its estimated value.
    """
    # Derive the grid extents from the states themselves rather than reading
    # module-level ``num_rows`` / ``num_cols``, so the result depends only on
    # the arguments that were passed in.
    n_rows = max((row for row, _ in grid), default=-1) + 1
    n_cols = max((col for _, col in grid), default=-1) + 1

    value_function = {state: 0 for state in grid}
    while True:
        delta = 0
        for state in grid:
            previous = value_function[state]
            successor = get_next_state(state, policy[state], n_rows, n_cols)
            # Updated in place: states visited later in this sweep already
            # see the new value.
            value_function[state] = rewards[state] + gamma * value_function[successor]
            delta = max(delta, abs(previous - value_function[state]))
        if delta < epsilon:
            break
    return value_function

# Define a function to get the next state given the current state and action
def get_next_state(state, action, num_rows, num_cols):
    """Return the cell reached from ``state`` by taking ``action``.

    Actions: 0 = up, 1 = down, 2 = left, 3 = right. A move that would leave
    the ``num_rows`` x ``num_cols`` grid is clamped to the edge, so the agent
    stays on the border cell. Any other action value leaves the state
    unchanged.

    Args:
        state: Current ``(row, col)`` position.
        action: Action code as described above.
        num_rows: Number of rows in the grid.
        num_cols: Number of columns in the grid.

    Returns:
        The resulting ``(row, col)`` tuple.
    """
    row, col = state
    if action == 0:  # Up
        return (max(0, row - 1), col)
    if action == 1:  # Down
        return (min(num_rows - 1, row + 1), col)
    if action == 2:  # Left
        return (row, max(0, col - 1))
    if action == 3:  # Right
        return (row, min(num_cols - 1, col + 1))
    # Unrecognised action: stay put
    return (row, col)

# Define a function to perform policy improvement
def policy_improvement(value_function, num_states, num_actions, grid, rewards, gamma):
    """Make the policy greedy with respect to ``value_function``.

    For every state, each action listed in ``grid[state]`` is scored as
    ``rewards[state] + gamma * value_function[next_state]`` and the
    highest-scoring action becomes the new policy entry.

    Note:
        This function reads and updates the module-level ``policy`` dict in
        place; the returned policy is that same object.

    Args:
        value_function: Mapping from state to its current value estimate.
        num_states: Unused; kept for interface compatibility.
        num_actions: Size of the action-index space (length of the score array).
        grid: Mapping from state ``(row, col)`` to its list of action indices.
        rewards: Mapping from state to the reward used when scoring actions.
        gamma: Discount factor applied to the successor state's value.

    Returns:
        A ``(policy, policy_stable)`` tuple, where ``policy_stable`` is True
        only if no state's action changed during this pass.
    """
    # Derive the grid extents from the states themselves rather than reading
    # module-level ``num_rows`` / ``num_cols``.
    n_rows = max((row for row, _ in grid), default=-1) + 1
    n_cols = max((col for _, col in grid), default=-1) + 1

    policy_stable = True
    for state in grid:
        old_action = policy[state]
        # Start from -inf (not 0) so an action index that is absent from
        # ``grid[state]`` can never outrank a real, possibly negative, score.
        action_values = np.full(num_actions, -np.inf)
        for action in grid[state]:
            next_state = get_next_state(state, action, n_rows, n_cols)
            action_values[action] = rewards[state] + gamma * value_function[next_state]
        policy[state] = np.argmax(action_values)
        if old_action != policy[state]:
            policy_stable = False
    return policy, policy_stable

# Main policy iteration loop
def _show_policy(current_policy):
    """Print the policy row by row, tab-separated, with "T" for terminal states."""
    for row in range(num_rows):
        cells = []
        for col in range(num_cols):
            cell = (row, col)
            chosen = current_policy[cell]
            cells.append("T" if cell in terminal_states else chosen)
        # Every cell is followed by a tab, then the row ends with a newline
        print("".join(f"{cell}\t" for cell in cells))


iteration = 0
policy_stable = False
while not policy_stable:
    print(f"Iteration {iteration} Policy:")
    _show_policy(policy)

    # Evaluate the current policy, then make it greedy w.r.t. those values
    value_function = policy_evaluation(policy, num_states, num_actions, grid, rewards, gamma)
    new_policy, policy_stable = policy_improvement(value_function, num_states, num_actions, grid, rewards, gamma)
    if not policy_stable:
        policy = new_policy
        iteration += 1
        print("\n")

print("Optimal Policy:")
_show_policy(policy)


Iteration 0 Policy:
T	0	0	2	
2	3	3	0	
3	2	0	1	
2	3	1	T	


Iteration 1 Policy:
T	2	0	2	
0	0	0	1	
0	1	3	1	
1	1	3	T	


Iteration 2 Policy:
T	2	2	1	
0	0	1	1	
0	0	1	1	
0	3	3	T	


Iteration 3 Policy:
T	2	2	1	
0	0	0	1	
0	0	1	1	
0	3	3	T	
Optimal Policy:
T	2	2	1	
0	0	0	1	
0	0	1	1	
0	3	3	T	
