<a href="https://colab.research.google.com/github/harsh21CSU182/Harsh-Kaushik-RL/blob/main/RL_EXP_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

states = [0, 1, 2, 3, 4]
actions = [0, 1]
N_STATES = len(states)
N_ACTIONS = len(actions)
P = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # transition probability
R = np.zeros((N_STATES, N_ACTIONS, N_STATES))  # rewards
P[0, 0, 1] = 1.0  # STATE ACTION and Reward
P[1, 1, 2] = 1.0
P[2, 0, 3] = 1.0
P[3, 1, 4] = 1.0
P[4, 0, 4] = 1.0

R[0, 0, 1] = 1
R[1, 1, 2] = 10
R[2, 0, 3] = 100
R[3, 1, 4] = 1000
R[4, 0, 4] = 1.0
gamma = 0.75

# initialize policy and value arbitrarily

policy = [0 for s in range(N_STATES)]
V = np.zeros(N_STATES)

print("Initial policy", policy)
# print V
# Initial policy [0, 0, 0, 0, 0]
is_value_changed = True
iterations = 0

while is_value_changed:
    is_value_changed = False
    iterations += 1
    # run value iteration for each state
    for s in range(N_STATES):
        q_best = V[s]
        for a in range(N_ACTIONS):
            q_sa = sum([P[s, a, s1] * (R[s, a, s1] + gamma * V[s1]) for s1 in range(N_STATES)])
            if q_sa > q_best:
                policy[s] = a
                q_best = q_sa
                is_value_changed = True
        V[s] = q_best

print("Iterations", iterations)
print("Final Policy")
print(policy)
print(V)


Initial policy [0, 0, 0, 0, 0]
Iterations 126
Final Policy
[0, 1, 0, 1, 0]
[ 487.890625  649.1875    852.25     1003.          4.      ]


In [7]:
import numpy as np
import gym

def policy_evaluation(env, policy, gamma=0.9, epsilon=1e-6):
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Initialize value function arbitrarily
    V = np.zeros(num_states)

    while True:
        delta = 0
        for state in range(num_states):
            v = V[state]
            action = policy[state]

            # Calculate the expected value for the current state and action
            expected_value = sum(p * (r + gamma * V[next_state])
                                 for p, next_state, r, _ in env.P[state][action])

            # Update the value function for the current state
            V[state] = expected_value

            # Update the maximum change in value
            delta = max(delta, abs(v - V[state]))

        # Check for convergence
        if delta < epsilon:
            break

    return V

def policy_improvement(env, V, gamma=0.9):
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    policy_stable = True
    new_policy = np.zeros(num_states, dtype=int)

    for state in range(num_states):
        old_action = new_policy[state]

        # Find the action that maximizes the expected value
        new_policy[state] = np.argmax([sum(p * (r + gamma * V[next_state])
                                           for p, next_state, r, _ in env.P[state][action])
                                       for action in range(num_actions)])

        # Check if the policy for the current state has changed
        if old_action != new_policy[state]:
            policy_stable = False

    return new_policy, policy_stable

def policy_iteration(env, gamma=0.9, max_iterations=1000):
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Initialize a random policy
    initial_policy = np.random.randint(0, num_actions, size=num_states)
    policy = initial_policy.copy()

    for i in range(max_iterations):
        # Policy Evaluation
        V = policy_evaluation(env, policy, gamma)

        # Policy Improvement
        new_policy, policy_stable = policy_improvement(env, V, gamma)

        # Update the policy
        policy = new_policy

        # Check for policy stability
        if policy_stable:
            print(f"Policy Iteration converged in {i+1} iterations.")
            break

    return initial_policy, policy, V

# Create a simple grid world environment using gym
env = gym.make("FrozenLake-v1")

# Run policy iteration
initial_policy, optimal_policy, optimal_value_function = policy_iteration(env)

# Display results
print("\nInitial Policy:")
print(initial_policy.reshape((4, 4)))  # Assuming a 4x4 grid for FrozenLake

print("\nOptimal Policy:")
print(optimal_policy.reshape((4, 4)))  # Assuming a 4x4 grid for FrozenLake

print("\nOptimal Value Function:")
print(optimal_value_function.reshape((4, 4)))  # Assuming a 4x4 grid for FrozenLake



Initial Policy:
[[0 0 1 0]
 [0 1 3 3]
 [0 1 1 0]
 [3 2 3 3]]

Optimal Policy:
[[0 3 0 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]

Optimal Value Function:
[[0.06888673 0.06141154 0.07440786 0.05580526]
 [0.09185135 0.         0.11220737 0.        ]
 [0.14543417 0.24749575 0.29961685 0.        ]
 [0.         0.37993513 0.63901979 0.        ]]
