<a href="https://colab.research.google.com/github/jahnviasthana/Reinforcement-learning-codes/blob/main/mdp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np


# --- Toy MDP definition -----------------------------------------------------
# Discount applied to future rewards.
discount_factor = 0.9

# transition_probs[state, action, next_state]: probability of landing in
# `next_state` after taking `action` in `state`.
transition_probs = np.array(
    [
        [[0.7, 0.3, 0.0], [0.1, 0.8, 0.1]],
        [[0.0, 0.2, 0.8], [0.4, 0.4, 0.2]],
        [[0.4, 0.6, 0.0], [0.0, 0.5, 0.5]],
    ]
)

# rewards[state, action]: reward for taking `action` in `state`.
rewards = np.array(
    [
        [-1, 1],
        [2, -1],
        [0, 1],
    ]
)

# Problem size, read off the transition tensor (3 states, 2 actions).
num_states, num_actions = transition_probs.shape[:2]

# Starting policy: every action equally likely in every state.
policy = np.full((num_states, num_actions), 1.0 / num_actions)


def policy_evaluation(policy, transition_probs, rewards, discount_factor, num_iterations=100):
    """Estimate the state values of ``policy`` with repeated in-place sweeps.

    Each sweep visits the states in index order and overwrites the value of
    the current state with the policy-weighted expected return of its
    actions. Updates are in place, so states later in a sweep already see
    the values refreshed earlier in that same sweep.

    Args:
        policy: Array-like indexed ``[state, action]`` giving the probability
            of choosing ``action`` in ``state``.
        transition_probs: Array-like indexed ``[state, action, next_state]``
            giving transition probabilities.
        rewards: Array-like indexed ``[state, action]``.
        discount_factor: Multiplier applied to the expected next-state value.
        num_iterations: Number of full sweeps to run; there is no early
            stopping.

    Returns:
        A 1-D float array with one value per state.
    """
    policy = np.asarray(policy, dtype=float)
    transition_probs = np.asarray(transition_probs, dtype=float)
    rewards = np.asarray(rewards, dtype=float)

    # Size the problem from the arguments, not from module-level globals, so
    # the function works for any MDP it is handed.
    n_states = transition_probs.shape[0]
    value_function = np.zeros(n_states)

    for _ in range(num_iterations):
        for state in range(n_states):
            # Expected return of every action from `state`; shape (actions,).
            action_values = rewards[state] + discount_factor * (
                transition_probs[state] @ value_function
            )
            # Written back immediately so later states in this sweep use it.
            value_function[state] = policy[state] @ action_values

    return value_function


def policy_improvement(value_function, transition_probs, rewards, discount_factor):
    """Return the deterministic policy that is greedy w.r.t. ``value_function``.

    For every state the action value is
    ``rewards[s, a] + discount_factor * sum_s' P[s, a, s'] * V[s']``
    and the action with the largest value gets probability 1. Ties go to the
    lowest action index, which is how ``np.argmax`` resolves them.

    Args:
        value_function: Array-like with one value per state.
        transition_probs: Array-like indexed ``[state, action, next_state]``.
        rewards: Array-like indexed ``[state, action]``.
        discount_factor: Multiplier applied to the expected next-state value.

    Returns:
        A float array of shape ``(num_states, num_actions)`` holding a single
        1.0 per row (the chosen action) and 0.0 elsewhere.
    """
    value_function = np.asarray(value_function, dtype=float)
    transition_probs = np.asarray(transition_probs, dtype=float)
    rewards = np.asarray(rewards, dtype=float)

    # (states, actions, next_states) @ (next_states,) -> (states, actions)
    q_values = rewards + discount_factor * (transition_probs @ value_function)
    best_actions = np.argmax(q_values, axis=1)

    # The shape comes from the arguments; the function no longer reaches for
    # a module-level `policy` array to size its result.
    new_policy = np.zeros(q_values.shape)
    new_policy[np.arange(q_values.shape[0]), best_actions] = 1.0

    return new_policy


# Policy iteration: alternate evaluation and greedy improvement until the
# improved policy equals the current one, or the iteration cap is reached.
max_policy_iterations = 10

for i in range(max_policy_iterations):
    value_function = policy_evaluation(policy, transition_probs, rewards, discount_factor)
    new_policy = policy_improvement(value_function, transition_probs, rewards, discount_factor)

    if np.array_equal(policy, new_policy):
        print(f"Converged after {i} iterations.")
        break

    policy = new_policy
else:
    # The loop ended without `break`: `policy` was just replaced, so the
    # `value_function` computed above belongs to the previous policy.
    # Re-evaluate so the two printed results describe the same policy, and
    # tell the reader that convergence was not reached.
    value_function = policy_evaluation(policy, transition_probs, rewards, discount_factor)
    print(f"Warning: policy did not converge within {max_policy_iterations} iterations.")

print("Optimal Policy:")
print(policy)
print("Optimal Value Function:")
print(value_function)


Converged after 1 iterations.
Optimal Policy:
[[0. 1.]
 [1. 0.]
 [0. 1.]]
Optimal Value Function:
[13.77692761 14.33070472 13.54330336]
