In [6]:
!pip install numpy


Defaulting to user installation because normal site-packages is not writeable


In [7]:
import numpy as np
class MDP:
    """Randomly generated finite Markov decision process.

    Attributes:
        num_states: Number of states.
        num_actions: Number of actions (the same set is offered in every state).
        gamma: Discount factor.
        epsilon: Convergence threshold stored for solvers to read.
        P: Array of shape (num_states, num_actions, num_states) where
            P[s, a, s_next] is the probability of moving from s to s_next
            under action a. Each P[s, a, :] sums to 1.
        R: Array of shape (num_states, num_actions) holding the reward for
            taking action a in state s.
    """

    def __init__(self, num_states, num_actions, gamma=0.9, epsilon=1e-6, seed=None):
        """Build a random MDP.

        Args:
            num_states: Number of states.
            num_actions: Number of actions.
            gamma: Discount factor.
            epsilon: Convergence threshold.
            seed: Optional seed for the random tables. When None, values are
                drawn from NumPy's global generator (so a prior
                np.random.seed(...) call still controls the result). When
                given, a private generator is used and the global state is
                left untouched.
        """
        self.num_states = num_states
        self.num_actions = num_actions
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Convergence threshold

        # Both np.random and RandomState expose rand/randn, and RandomState(seed)
        # yields the same stream as np.random.seed(seed) followed by global draws.
        rng = np.random if seed is None else np.random.RandomState(seed)

        # Uniform draws, normalized over the next-state axis to form distributions.
        self.P = rng.rand(num_states, num_actions, num_states)
        self.P /= np.sum(self.P, axis=2, keepdims=True)

        # Rewards are standard-normal draws, one per (state, action) pair.
        self.R = rng.randn(num_states, num_actions)

    def get_reward(self, s, a):
        """Return the reward for taking action a in state s."""
        return self.R[s, a]

    def get_transition_prob(self, s, a, s_next):
        """Return the probability of reaching s_next from s under action a."""
        return self.P[s, a, s_next]

    def get_all_possible_actions(self, s):
        """Return the action indices available in state s (identical for all states)."""
        return range(self.num_actions)
        
# Value Iteration
def value_iteration(mdp, max_iterations=None):
    """Estimate the optimal state-value function by value iteration.

    Repeatedly applies the Bellman optimality backup
    V(s) <- max_a sum_s' P(s, a, s') * (R(s, a) + gamma * V(s'))
    starting from V = 0, until the largest per-state change in one sweep
    falls below mdp.epsilon.

    Args:
        mdp: Object exposing num_states, num_actions, gamma, epsilon,
            get_reward(s, a) and get_transition_prob(s, a, s_next).
        max_iterations: Optional cap on the number of sweeps. None (default)
            iterates until convergence, which may never happen when the
            backup is not a contraction (e.g. gamma >= 1).

    Returns:
        NumPy array of length mdp.num_states with the most recent value
        estimate (the result of the last completed sweep).
    """
    # Initialize value function with zeros
    V = np.zeros(mdp.num_states)
    sweeps = 0

    while max_iterations is None or sweeps < max_iterations:
        V_new = np.empty_like(V)  # every entry is overwritten below

        for s in range(mdp.num_states):
            action_values = []
            for a in range(mdp.num_actions):
                # The reward depends only on (s, a), so look it up once per action.
                reward = mdp.get_reward(s, a)
                action_values.append(sum(
                    mdp.get_transition_prob(s, a, s_next)
                    * (reward + mdp.gamma * V[s_next])
                    for s_next in range(mdp.num_states)
                ))

            V_new[s] = max(action_values)

        delta = np.max(np.abs(V_new - V))

        # Keep the freshest estimate before testing convergence so the value
        # returned is the latest sweep, not the one before it.
        V = V_new
        sweeps += 1

        if delta < mdp.epsilon:
            break

    return V

# MDP draws its transition and reward tables from NumPy's global random
# generator, so fix the seed to make the printed result repeatable across runs.
SEED = 42
np.random.seed(SEED)

num_states = 5  # Number of states
num_actions = 3  # Number of actions
mdp = MDP(num_states, num_actions)

# Solve the randomly generated MDP for its optimal state values.
optimal_value_function = value_iteration(mdp)

print("Optimal Value Function:")
print(optimal_value_function)


Optimal Value Function:
[ 9.70011846 10.59293426 12.48985383 10.90933997 11.82325486]
