In [5]:
import numpy as np

def load_mdp(filename):
    """Parse a whitespace-delimited MDP description file.

    Each non-blank line is split on whitespace and dispatched on its first
    token:

    * ``numStates N``            -- number of states
    * ``numActions N``           -- number of actions
    * ``end s1 s2 ...``          -- integer ids stored as the terminal list
    * ``transition s a s' r p``  -- one outcome of taking action ``a`` in ``s``
    * ``mdptype T``              -- stored verbatim as a string
    * ``discount g``             -- discount factor

    Blank lines and lines starting with any other token are ignored. A later
    occurrence of a scalar keyword (or of ``end``) overwrites the earlier one.

    Args:
        filename: Path of the text file to read.

    Returns:
        Tuple ``(S, A, terms, trans, mdptype, gamma)`` where ``trans[s][a]``
        is a list of ``(s_prime, reward, prob)`` tuples in file order. Fields
        that never appear in the file keep their defaults: ``0``, ``0``,
        ``[]``, ``{}``, ``None`` and ``0``.

    Raises:
        IndexError: A recognised line has too few fields.
        ValueError: A numeric field cannot be converted.
    """
    num_states = 0
    num_actions = 0
    terminal_states = []
    transitions = {}
    kind = None
    discount = 0

    with open(filename, 'r') as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if not tokens:
                continue

            keyword, args = tokens[0], tokens[1:]

            if keyword == 'transition':
                # Convert every field before touching the table so a malformed
                # line cannot leave a half-created entry behind.
                src = int(args[0])
                act = int(args[1])
                dst = int(args[2])
                rew = float(args[3])
                p = float(args[4])
                transitions.setdefault(src, {}).setdefault(act, []).append((dst, rew, p))
            elif keyword == 'numStates':
                num_states = int(args[0])
            elif keyword == 'numActions':
                num_actions = int(args[0])
            elif keyword == 'end':
                terminal_states = [int(tok) for tok in args]
            elif keyword == 'mdptype':
                kind = args[0]
            elif keyword == 'discount':
                discount = float(args[0])

    return num_states, num_actions, terminal_states, transitions, kind, discount

def value_iteration(S, A, terms, trans, gamma, epsilon=1e-6, max_iterations=None):
    """Compute optimal state values and a greedy policy by value iteration.

    Every sweep applies the Bellman optimality backup to all states using the
    values of the previous sweep (synchronous update) and stops once the
    largest absolute change drops below ``epsilon``.

    Args:
        S: Number of states; states are the integers ``0 .. S-1``.
        A: Number of actions; actions are the integers ``0 .. A-1``.
        terms: Iterable of terminal state ids. Their value is fixed at 0 and
            their policy entry is -1. Ids outside ``0 .. S-1`` have no effect.
        trans: Nested mapping ``trans[s][a] -> [(s_prime, reward, prob), ...]``.
            An action with no listed outcomes in a state is scored as 0.
        gamma: Discount factor.
        epsilon: Convergence threshold on the max-norm change between sweeps.
        max_iterations: Optional upper bound on the number of sweeps. ``None``
            (the default) iterates until convergence, however long it takes.

    Returns:
        Tuple ``(V, pi)``: ``V`` is a float array of length ``S`` with the
        state values, ``pi`` an int array of length ``S`` with the greedy
        action per state (``-1`` where no action was selected). Ties go to
        the lowest action index.

    Raises:
        ValueError: If ``A <= 0`` while some state is non-terminal (the backup
            would otherwise produce ``-inf``/``NaN`` and never terminate), or
            if ``max_iterations`` is given but smaller than 1.

    Side effects:
        Prints a one-line summary with the sweep count and the final delta.
    """
    # Set membership is O(1); the list scan ran once per state per sweep.
    terminal = set(terms)

    if A <= 0 and any(s not in terminal for s in range(S)):
        raise ValueError(
            "value_iteration requires at least one action when non-terminal states exist"
        )
    if max_iterations is not None and max_iterations < 1:
        raise ValueError("max_iterations must be at least 1 when given")

    V = np.zeros(S)
    pi = np.full(S, -1, dtype=int)
    iterations = 0
    converged = False

    while max_iterations is None or iterations < max_iterations:
        iterations += 1
        # Terminal entries stay at 0; every other entry is overwritten below.
        V_new = np.zeros(S)

        for s in range(S):
            if s in terminal:
                continue  # value 0 and policy -1 are already in place

            outcomes_by_action = trans.get(s, {})
            max_q = -np.inf
            best_a = -1

            for a in range(A):
                q_sa = 0
                for s_prime, reward, prob in outcomes_by_action.get(a, ()):
                    q_sa += prob * (reward + gamma * V[s_prime])

                # Strict comparison keeps the first (lowest-index) maximiser.
                if q_sa > max_q:
                    max_q = q_sa
                    best_a = a

            V_new[s] = max_q
            pi[s] = best_a

        delta = np.max(np.abs(V_new - V))
        V = V_new  # freshly allocated each sweep, so no defensive copy needed

        if delta < epsilon:
            converged = True
            break

    if converged:
        print(f"Converged in {iterations} iterations (δ={delta:.6f})")
    else:
        print(f"Stopped after {iterations} iterations without converging (δ={delta:.6f})")
    return V, pi

In [7]:
import glob
# Solve the MDP described in continuing-mdp-2-2.txt and print the value and
# greedy action of every state. glob yields an empty list when the file is
# absent, in which case the loop below is skipped without any output.
mdp_files = glob.glob('/content/sample_data/continuing-mdp-2-2.txt')
mdp_files.sort()

for path in mdp_files:
    S, A, terms, trans, mdptype, gamma = load_mdp(path)
    print(f"\n--- Solving {path} ({mdptype}, γ={gamma}) ---")
    V, pi = value_iteration(S, A, terms, trans, gamma)
    for s in range(S):
        # A negative policy entry means no action was selected (e.g. a
        # terminal state); show a dash instead of the sentinel.
        if pi[s] < 0:
            action = '–'
        else:
            action = pi[s]
        print(f"State {s}: V* = {V[s]:.6f}\tπ* = {action}")



--- Solving /content/sample_data/continuing-mdp-2-2.txt (continuing, γ=0.96) ---
Converged in 305 iterations (δ=0.000001)
State 0: V* = 5.999276	π* = 0
State 1: V* = 5.918427	π* = 0


In [9]:
# Solve the MDP described in continuing-mdp-10-5.txt and print the value and
# greedy action of every state. glob yields an empty list when the file is
# absent, in which case the loop below is skipped without any output.
mdp_files = glob.glob('/content/sample_data/continuing-mdp-10-5.txt')
mdp_files.sort()

for path in mdp_files:
    S, A, terms, trans, mdptype, gamma = load_mdp(path)
    print(f"\n--- Solving {path} ({mdptype}, γ={gamma}) ---")
    V, pi = value_iteration(S, A, terms, trans, gamma)
    for s in range(S):
        # A negative policy entry means no action was selected (e.g. a
        # terminal state); show a dash instead of the sentinel.
        if pi[s] < 0:
            action = '–'
        else:
            action = pi[s]
        print(f"State {s}: V* = {V[s]:.6f}\tπ* = {action}")



--- Solving /content/sample_data/continuing-mdp-10-5.txt (continuing, γ=0.8) ---
Converged in 60 iterations (δ=0.000001)
State 0: V* = 2.234954	π* = 3
State 1: V* = 2.373608	π* = 3
State 2: V* = 2.604042	π* = 3
State 3: V* = 2.647780	π* = 1
State 4: V* = 2.522227	π* = 4
State 5: V* = 2.375248	π* = 0
State 6: V* = 2.684802	π* = 2
State 7: V* = 2.688306	π* = 0
State 8: V* = 2.640805	π* = 3
State 9: V* = 2.572423	π* = 1


In [10]:
# Solve the MDP described in continuing-mdp-50-20.txt and print the value and
# greedy action of every state. glob yields an empty list when the file is
# absent, in which case the loop below is skipped without any output.
mdp_files = glob.glob('/content/sample_data/continuing-mdp-50-20.txt')
mdp_files.sort()

for path in mdp_files:
    S, A, terms, trans, mdptype, gamma = load_mdp(path)
    print(f"\n--- Solving {path} ({mdptype}, γ={gamma}) ---")
    V, pi = value_iteration(S, A, terms, trans, gamma)
    for s in range(S):
        # A negative policy entry means no action was selected (e.g. a
        # terminal state); show a dash instead of the sentinel.
        if pi[s] < 0:
            action = '–'
        else:
            action = pi[s]
        print(f"State {s}: V* = {V[s]:.6f}\tπ* = {action}")



--- Solving /content/sample_data/continuing-mdp-50-20.txt (continuing, γ=0.2) ---
Converged in 10 iterations (δ=0.000000)
State 0: V* = 1.065078	π* = 7
State 1: V* = 1.051695	π* = 2
State 2: V* = 0.824259	π* = 7
State 3: V* = 0.601320	π* = 14
State 4: V* = 1.057797	π* = 4
State 5: V* = 0.980877	π* = 19
State 6: V* = 0.983041	π* = 18
State 7: V* = 1.002595	π* = 5
State 8: V* = 0.886921	π* = 15
State 9: V* = 0.837798	π* = 8
State 10: V* = 1.109280	π* = 8
State 11: V* = 0.910305	π* = 19
State 12: V* = 1.155357	π* = 7
State 13: V* = 0.958098	π* = 8
State 14: V* = 0.772395	π* = 18
State 15: V* = 1.218693	π* = 16
State 16: V* = 0.939597	π* = 11
State 17: V* = 0.840961	π* = 19
State 18: V* = 0.934034	π* = 2
State 19: V* = 0.899851	π* = 12
State 20: V* = 1.168103	π* = 14
State 21: V* = 0.985183	π* = 19
State 22: V* = 1.032489	π* = 14
State 23: V* = 1.110618	π* = 15
State 24: V* = 0.779151	π* = 0
State 25: V* = 0.945382	π* = 1
State 26: V* = 1.185461	π* = 3
State 27: V* = 1.083733	π* = 18
Stat

In [11]:
# Solve the MDP described in episodic-mdp-2-2.txt and print the value and
# greedy action of every state. glob yields an empty list when the file is
# absent, in which case the loop below is skipped without any output.
mdp_files = glob.glob('/content/sample_data/episodic-mdp-2-2.txt')
mdp_files.sort()

for path in mdp_files:
    S, A, terms, trans, mdptype, gamma = load_mdp(path)
    print(f"\n--- Solving {path} ({mdptype}, γ={gamma}) ---")
    V, pi = value_iteration(S, A, terms, trans, gamma)
    for s in range(S):
        # A negative policy entry means no action was selected (e.g. a
        # terminal state); show a dash instead of the sentinel.
        if pi[s] < 0:
            action = '–'
        else:
            action = pi[s]
        print(f"State {s}: V* = {V[s]:.6f}\tπ* = {action}")



--- Solving /content/sample_data/episodic-mdp-2-2.txt (episodic, γ=0.9) ---
Converged in 27 iterations (δ=0.000001)
State 0: V* = 0.000000	π* = –
State 1: V* = 1.455816	π* = 0


In [12]:
# Solve the MDP described in episodic-mdp-10-5.txt and print the value and
# greedy action of every state. glob yields an empty list when the file is
# absent, in which case the loop below is skipped without any output.
mdp_files = glob.glob('/content/sample_data/episodic-mdp-10-5.txt')
mdp_files.sort()

for path in mdp_files:
    S, A, terms, trans, mdptype, gamma = load_mdp(path)
    print(f"\n--- Solving {path} ({mdptype}, γ={gamma}) ---")
    V, pi = value_iteration(S, A, terms, trans, gamma)
    for s in range(S):
        # A negative policy entry means no action was selected (e.g. a
        # terminal state); show a dash instead of the sentinel.
        if pi[s] < 0:
            action = '–'
        else:
            action = pi[s]
        print(f"State {s}: V* = {V[s]:.6f}\tπ* = {action}")



--- Solving /content/sample_data/episodic-mdp-10-5.txt (episodic, γ=1.0) ---
Converged in 27073 iterations (δ=0.000001)
State 0: V* = 0.000000	π* = –
State 1: V* = 530.217612	π* = 3
State 2: V* = 530.511462	π* = 4
State 3: V* = 504.796539	π* = 2
State 4: V* = 472.946092	π* = 1
State 5: V* = 0.000000	π* = –
State 6: V* = 526.950797	π* = 2
State 7: V* = 518.462150	π* = 2
State 8: V* = 354.456197	π* = 4
State 9: V* = 529.289936	π* = 0


In [13]:
# Solve the MDP described in episodic-mdp-50-20.txt and print the value and
# greedy action of every state. glob yields an empty list when the file is
# absent, in which case the loop below is skipped without any output.
mdp_files = glob.glob('/content/sample_data/episodic-mdp-50-20.txt')
mdp_files.sort()

for path in mdp_files:
    S, A, terms, trans, mdptype, gamma = load_mdp(path)
    print(f"\n--- Solving {path} ({mdptype}, γ={gamma}) ---")
    V, pi = value_iteration(S, A, terms, trans, gamma)
    for s in range(S):
        # A negative policy entry means no action was selected (e.g. a
        # terminal state); show a dash instead of the sentinel.
        if pi[s] < 0:
            action = '–'
        else:
            action = pi[s]
        print(f"State {s}: V* = {V[s]:.6f}\tπ* = {action}")



--- Solving /content/sample_data/episodic-mdp-50-20.txt (episodic, γ=0.9) ---
Converged in 130 iterations (δ=0.000001)
State 0: V* = 7.985534	π* = 16
State 1: V* = 7.837288	π* = 9
State 2: V* = 0.000000	π* = –
State 3: V* = 7.664208	π* = 18
State 4: V* = 7.830733	π* = 15
State 5: V* = 7.826870	π* = 12
State 6: V* = 7.943419	π* = 10
State 7: V* = 8.261761	π* = 4
State 8: V* = 7.869683	π* = 14
State 9: V* = 8.348362	π* = 5
State 10: V* = 7.711347	π* = 11
State 11: V* = 7.775422	π* = 0
State 12: V* = 7.914732	π* = 17
State 13: V* = 8.006124	π* = 16
State 14: V* = 8.101699	π* = 0
State 15: V* = 8.089329	π* = 15
State 16: V* = 0.000000	π* = –
State 17: V* = 7.652549	π* = 9
State 18: V* = 8.124849	π* = 4
State 19: V* = 7.843153	π* = 15
State 20: V* = 8.415752	π* = 12
State 21: V* = 7.321332	π* = 9
State 22: V* = 7.627947	π* = 2
State 23: V* = 7.984519	π* = 7
State 24: V* = 7.708902	π* = 13
State 25: V* = 7.777007	π* = 10
State 26: V* = 8.089608	π* = 15
State 27: V* = 5.340496	π* = 18
State 