In [40]:
import numpy as np
import numpy.typing as npt
class mdpprog:
    """Evaluate a fixed policy on a small, hard-coded Markov decision process.

    The model holds a reward per state (``rewards``), one transition matrix per
    action (``transition_matrices``; row ``i`` is the distribution over next
    states when the action is taken in state ``S{i}``) and a deterministic
    policy mapping each state to an action (``policyfunc``).

    States are named ``S0``, ``S1``, ... and row/column ``i`` of every
    transition matrix refers to state ``S{i}``.
    """

    def __init__(self):
        """Populate the rewards, transition matrices and policy of the MDP."""
        self.rewards = {'S0': 4, 'S1': 7, 'S2': 1}
        self.transition_matrices = {
            # Values for a0 are assumed (they were not given in the question).
            # NOTE(review): the rows of a0 sum to 1.25, 1.15 and 1.05, so a0 is
            # not a valid stochastic matrix. The policy uses row 1 of a0 for
            # state S1, so this affects the computed values. The numbers are
            # kept as-is because they are placeholders; replace them with
            # proper probabilities once the real a0 is known.
            'a0': np.array([
                [0.25, 0.6, 0.4],
                [0.45, 0.4, 0.30],
                [0.25, 0.3, 0.5]]),
            'a1': np.array([
                [0.3, 0.4, 0.3],
                [0.6, 0.2, 0.2],
                [0.2, 0.2, 0.6]]),
            'a2': np.array([
                [0.5, 0.4, 0.1],
                [0.3, 0.2, 0.5],
                [0.4, 0.2, 0.4]]),
            'a3': np.array([
                [0.25, 0.25, 0.5],
                [0.4, 0.3, 0.3],
                [0.2, 0.5, 0.3]])}
        self.policyfunc = {
            'S0': 'a1', 'S1': 'a0', 'S2': 'a2'}

    def _policy_model(self) -> tuple[npt.NDArray, npt.NDArray]:
        """Return the reward vector and the transition matrix induced by the policy.

        Row ``i`` of the returned matrix is row ``i`` of the transition matrix
        of the action the policy selects in state ``S{i}``.
        """
        state_names = [f'S{i}' for i in range(len(self.rewards))]
        reward_vector = np.array(
            [self.rewards[name] for name in state_names], dtype=float)
        policy_matrix = np.array(
            [self.transition_matrices[self.policyfunc[name]][i]
             for i, name in enumerate(state_names)],
            dtype=float)
        return reward_vector, policy_matrix

    def calclongtermreward(
            self,
            discountfactor: float,
            maxiter: int = 5000,
            tolerance: float = 1e-8,
    ) -> tuple[npt.NDArray, int, list[float]]:
        """Compute the discounted long-term reward of every state under the policy.

        Repeatedly applies ``V <- R + discountfactor * P_pi @ V`` starting from
        ``V = 0``; every sweep updates all states from the previous sweep's
        values.

        Args:
            discountfactor: Discount applied to the expected next-state value.
            maxiter: Maximum number of sweeps to perform.
            tolerance: Stop as soon as the largest absolute change of any
                state value in a sweep is below this threshold.

        Returns:
            A tuple ``(values, iterations, history)`` where ``values[i]`` is
            the value of state ``S{i}``, ``iterations`` is the number of sweeps
            performed (0 when ``maxiter`` is not positive) and ``history``
            holds the largest absolute change observed in each sweep.
        """
        reward_vector, policy_matrix = self._policy_model()
        values = np.zeros(len(reward_vector))
        convergencehistory: list[float] = []
        # Pre-bound so that a non-positive maxiter returns cleanly instead of
        # hitting an unbound loop variable.
        iterations = 0
        for iterations in range(1, maxiter + 1):
            updated = reward_vector + discountfactor * (policy_matrix @ values)
            diff = float(np.max(np.abs(updated - values)))
            values = updated
            convergencehistory.append(diff)
            if diff < tolerance:
                break
        return values, iterations, convergencehistory

    def mdpsol(self, discountfactors: list[float]) -> None:
        """Print the long-term rewards and sweep count for each discount factor."""
        print("The MDP Problem Solution:\n")
        for gamma in discountfactors:
            print(f"Discount Factor (γ): {gamma}")
            longtermrewards, iterations, _ = self.calclongtermreward(gamma)
            print("Long-Term Discounted Rewards:")
            for i, reward in enumerate(longtermrewards):
                print(f"State S{i}: {reward:.4f}")
            print(f"Convergence Iterations: {iterations}\n")
# Evaluate the fixed policy for several discount factors and print, for each,
# the long-term reward of every state and the number of sweeps performed.
discountfactors = [0.1, 0.01, 0.001, 0.3]
solver = mdpprog()
solver.mdpsol(discountfactors)

The MDP Problem Solution:

Discount Factor (γ): 0.1
Long-Term Discounted Rewards:
State S0: 4.4777
State S1: 7.5449
State S2: 1.3854
Convergence Iterations: 10

Discount Factor (γ): 0.01
Long-Term Discounted Rewards:
State S0: 4.0434
State S1: 7.0495
State S2: 1.0344
Convergence Iterations: 6

Discount Factor (γ): 0.001
Long-Term Discounted Rewards:
State S0: 4.0043
State S1: 7.0049
State S2: 1.0034
Convergence Iterations: 4

Discount Factor (γ): 0.3
Long-Term Discounted Rewards:
State S0: 5.8501
State S1: 9.1134
State S2: 2.5555
Convergence Iterations: 19

