<a href="https://colab.research.google.com/github/hariniiy/RL_1796/blob/main/Ass_1_2303A51796.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np

class MDP:
    """Plain container bundling the pieces of a finite Markov decision process.

    Attributes:
        states: collection of state labels.
        actions: collection of action labels.
        P: transition model, ``P[s][a]`` is a list of ``(prob, next_state)``.
        R: reward model, ``R[s][a][s']`` is the reward for that transition.
        gamma: discount factor (defaults to 0.9).
    """

    def __init__(self, states, actions, transition_probs, rewards, gamma=0.9):
        # The arguments are stored as given; no copying or validation is done.
        self.states, self.actions = states, actions
        self.P, self.R = transition_probs, rewards
        self.gamma = gamma

def value_iteration(mdp, theta=1e-6, max_iterations=1000):
    """Solve an MDP by value iteration and extract a greedy policy.

    Args:
        mdp: object exposing ``states``, ``actions``, ``P``, ``R`` and
            ``gamma``. ``P[s][a]`` is a list of ``(prob, next_state)`` pairs
            and ``R[s][a][s_next]`` is the reward for that transition.
            States are used directly as indices into the value array, and
            action labels are stored in an integer array, so both are
            expected to be integers.
        theta: a sweep whose largest value change is below this stops the
            iteration.
        max_iterations: upper bound on the number of sweeps.

    Returns:
        ``(policy, V)`` where ``policy[s]`` is the greedy action label for
        state ``s`` and ``V[s]`` is the estimated value of ``s``.
    """
    V = np.zeros(len(mdp.states))

    def q_value(s, a):
        # Expected one-step return of taking `a` in `s` under the current V.
        return sum(prob * (mdp.R[s][a][s_next] + mdp.gamma * V[s_next])
                   for prob, s_next in mdp.P[s][a])

    for _ in range(max_iterations):
        delta = 0
        for s in mdp.states:
            v = V[s]
            # In-place update: later states in this sweep see the new V[s].
            V[s] = max(q_value(s, a) for a in mdp.actions)
            delta = max(delta, abs(v - V[s]))
        if delta < theta:
            break

    # Derive policy from value function
    policy = np.zeros(len(mdp.states), dtype=int)
    for s in mdp.states:
        q_values = [q_value(s, a) for a in mdp.actions]
        # argmax yields a position in mdp.actions; store the action label
        # itself so policy[s] can be used as a key into P[s] / R[s].
        policy[s] = mdp.actions[int(np.argmax(q_values))]

    return policy, V

def policy_evaluation(policy, mdp, theta=1e-6, max_iterations=1000):
    """Iteratively estimate the state values of a fixed policy.

    Args:
        policy: indexable by state; ``policy[s]`` is the action taken in ``s``.
        mdp: object exposing ``states``, ``P``, ``R`` and ``gamma``, where
            ``P[s][a]`` is a list of ``(prob, next_state)`` pairs and
            ``R[s][a][s_next]`` is the transition reward.
        theta: a sweep whose largest value change is below this ends the loop.
        max_iterations: upper bound on the number of sweeps.

    Returns:
        Array of value estimates indexed by state.
    """
    values = np.zeros(len(mdp.states))
    sweeps_done = 0
    while sweeps_done < max_iterations:
        sweeps_done += 1
        largest_change = 0
        for state in mdp.states:
            previous = values[state]
            chosen = policy[state]
            # Accumulate the expected one-step return of the chosen action.
            backup = 0
            for p, nxt in mdp.P[state][chosen]:
                backup += p * (mdp.R[state][chosen][nxt] + mdp.gamma * values[nxt])
            # Written in place, so states later in the sweep see this update.
            values[state] = backup
            change = abs(previous - values[state])
            if change > largest_change:
                largest_change = change
        if largest_change < theta:
            break
    return values

def policy_iteration(mdp, max_iterations=1000):
    """Solve an MDP by alternating policy evaluation and greedy improvement.

    Starts from a random policy drawn with ``np.random.choice`` and repeats
    evaluate/improve rounds until no state changes its action or
    ``max_iterations`` rounds have run.

    Args:
        mdp: object exposing ``states``, ``actions``, ``P``, ``R`` and
            ``gamma``. ``P[s][a]`` is a list of ``(prob, next_state)`` pairs
            and ``R[s][a][s_next]`` is the reward for that transition.
            States are used directly as indices into the policy and value
            arrays.
        max_iterations: upper bound on the number of evaluate/improve rounds.

    Returns:
        ``(policy, V)`` where ``policy[s]`` is the action label chosen for
        state ``s`` and ``V`` is the value estimate of the returned policy.
    """
    policy = np.random.choice(mdp.actions, size=len(mdp.states))
    policy_stable = False
    for _ in range(max_iterations):
        V = policy_evaluation(policy, mdp)
        policy_stable = True
        for s in mdp.states:
            old_action = policy[s]
            q_values = [
                sum(prob * (mdp.R[s][a][s_next] + mdp.gamma * V[s_next])
                    for prob, s_next in mdp.P[s][a])
                for a in mdp.actions
            ]
            # argmax yields a position in mdp.actions; store the action label
            # itself, because policy_evaluation uses policy[s] as a key into
            # P[s] / R[s].
            best_action = mdp.actions[int(np.argmax(q_values))]
            policy[s] = best_action
            if old_action != best_action:
                policy_stable = False
        if policy_stable:
            break

    if not policy_stable:
        # The loop never ran, or stopped right after changing the policy:
        # evaluate once more so the returned V belongs to the returned policy.
        V = policy_evaluation(policy, mdp)
    return policy, V

# Example MDP: three states in a row, two actions
states = [0, 1, 2]
actions = [0, 1]  # 0 = left, 1 = right

# Deterministic transition model: P[s][a] = [(prob, next_state)]
P = {
    0: {0: [(1.0, 0)], 1: [(1.0, 1)]},  # left stays in 0, right moves to 1
    1: {0: [(1.0, 0)], 1: [(1.0, 2)]},
    2: {0: [(1.0, 1)], 1: [(1.0, 2)]},
}

# Reward model: R[s][a][s']; the only non-zero reward is for 1 -> 2
R = {
    0: {0: {0: 0}, 1: {1: 0}},
    1: {0: {0: 0}, 1: {2: 1}},
    2: {0: {1: 0}, 1: {2: 0}},
}

mdp = MDP(states, actions, transition_probs=P, rewards=R, gamma=0.9)

# Solve with value iteration and report the result
policy_vi, V_vi = value_iteration(mdp)
print("Value Iteration Results")
print("Optimal Policy:", policy_vi)
print("Value Function:", V_vi)

# Solve with policy iteration and report the result
policy_pi, V_pi = policy_iteration(mdp)
print("\nPolicy Iteration Results")
print("Optimal Policy:", policy_pi)
print("Value Function:", V_pi)


Value Iteration Results
Optimal Policy: [1 1 0]
Value Function: [4.73683861 5.26315475 4.73683927]

Policy Iteration Results
Optimal Policy: [1 1 0]
Value Function: [4.73683861 5.26315475 4.73683927]
