<a href="https://colab.research.google.com/github/harsh21CSU182/Harsh-Kaushik-RL/blob/main/RL_PRACTICAL_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import gym

def value_iteration(env, gamma=0.9, epsilon=1e-6):
    """Compute an optimal policy for a tabular environment via value iteration.

    Args:
        env: Environment exposing discrete spaces (``observation_space.n`` and
            ``action_space.n``) and a transition table ``P`` such that
            ``P[state][action]`` yields ``(prob, next_state, reward, done)``
            tuples. The table is read from ``env.unwrapped`` when that
            attribute exists, otherwise from ``env`` itself.
        gamma: Discount factor applied to the value of successor states.
        epsilon: Sweeps stop once the largest per-state change in a sweep
            falls below this threshold.

    Returns:
        Tuple ``(optimal_policy, V)``: an integer array holding the greedy
        action index for every state, and the array of state values.
    """
    num_states = env.observation_space.n
    num_actions = env.action_space.n

    # Environments built with gym.make() are wrapped; reaching through the
    # wrapper for P emits deprecation warnings (and is not forwarded at all
    # in newer releases), so read the model from the underlying environment.
    model = getattr(env, "unwrapped", env).P

    # Initialize value function arbitrarily
    V = np.zeros(num_states)

    def action_values(state):
        # One-step lookahead: expected return of each action from `state`.
        # A transition flagged done ends the episode, so nothing is
        # bootstrapped from the state it lands in.
        return [
            sum(p * (r + gamma * V[next_state] * (not done))
                for p, next_state, r, done in model[state][action])
            for action in range(num_actions)
        ]

    while True:
        delta = 0

        # V is updated in place, so later states in the same sweep already
        # see the refreshed values of earlier ones.
        for state in range(num_states):
            old_value = V[state]
            V[state] = max(action_values(state))
            delta = max(delta, abs(old_value - V[state]))

        # Check for convergence
        if delta < epsilon:
            break

    # Extract the greedy policy with respect to the converged values
    optimal_policy = np.zeros(num_states, dtype=int)
    for state in range(num_states):
        optimal_policy[state] = np.argmax(action_values(state))

    return optimal_policy, V

# Create a simple grid world environment using gym
env = gym.make("FrozenLake-v1")

# Run value iteration
optimal_policy, optimal_value_function = value_iteration(env)

# Take the grid layout from the environment's map description rather than
# assuming 4x4, so the results still print as a grid if another map is used.
grid_shape = env.unwrapped.desc.shape

# Display results
print("\nOptimal Policy:")
print(optimal_policy.reshape(grid_shape))

print("\nOptimal Value Function:")
print(optimal_value_function.reshape(grid_shape))



Optimal Policy:
[[0 3 0 3]
 [0 0 0 0]
 [3 1 0 0]
 [0 2 1 0]]

Optimal Value Function:
[[0.06888624 0.06141117 0.07440763 0.05580502]
 [0.09185097 0.         0.11220727 0.        ]
 [0.14543392 0.24749561 0.29961676 0.        ]
 [0.         0.37993504 0.63901974 0.        ]]


  deprecation(
  deprecation(


In [None]:
import numpy as np

def value_iteration(states, actions, transitions, rewards, gamma=0.9, epsilon=1e-6, max_iterations=1000):
    """Solve a tabular MDP, given as explicit arrays, with value iteration.

    Args:
        states: Sequence of states; only its length is used.
        actions: Sequence of actions; only its length is used.
        transitions: Array-like indexed as ``[s, a, s_prime]`` holding the
            weight of moving from ``s`` to ``s_prime`` under action ``a``.
        rewards: Array-like indexed the same way, holding the reward for
            that transition.
        gamma: Discount factor applied to successor values.
        epsilon: Iteration stops once the largest per-state change in a
            sweep is below this threshold.
        max_iterations: Upper bound on the number of sweeps; the loop ends
            after this many sweeps even if the threshold was not reached.

    Returns:
        Tuple ``(V, policy)``: the array of state values and a list with the
        greedy action index (a plain ``int``) for every state.
    """
    num_states = len(states)
    num_actions = len(actions)

    # Accept nested lists as well as arrays, and restrict the model to the
    # declared numbers of states and actions.
    transitions = np.asarray(transitions, dtype=float)[:num_states, :num_actions, :num_states]
    rewards = np.asarray(rewards, dtype=float)[:num_states, :num_actions, :num_states]

    V = np.zeros(num_states)

    def action_values(s):
        # Expected one-step return of every action in state s, computed for
        # all successors at once; result has one entry per action.
        return (transitions[s] * (rewards[s] + gamma * V)).sum(axis=1)

    for _ in range(max_iterations):
        prev_V = V.copy()

        # In-place sweep: later states already see this sweep's updates.
        for s in range(num_states):
            V[s] = action_values(s).max()

        if np.max(np.abs(V - prev_V)) < epsilon:
            break

    # Plain ints keep the printed policy independent of the NumPy version.
    policy = [int(np.argmax(action_values(s))) for s in range(num_states)]

    return V, policy

# Example: a four-state, two-action MDP.
states = list(range(4))
actions = list(range(2))

# transitions[s, a, s_prime]: one outer entry per state, one row per action.
transitions = np.array([
    [[0.5, 0.5, 0, 0],
     [0.7, 0.3, 0, 0]],
    [[0, 0.8, 0.2, 0],
     [0, 0, 1, 0]],
    [[0, 0, 0.4, 0.6],
     [0, 0, 0, 1]],
    [[0, 0, 0, 1],
     [0, 0, 0, 1]],
])

# rewards[s, a, s_prime], laid out exactly like `transitions`.
rewards = np.array([
    [[1, -1, 0, 0],
     [2, 0, 0, 0]],
    [[0, -1, 0, 0],
     [0, 0, 0, 0]],
    [[0, 0, 5, -1],
     [0, 0, 0, -1]],
    [[0, 0, 0, 10],
     [0, 0, 0, 10]],
])

# Solve the MDP and report the state values with the greedy policy.
optimal_value_function, optimal_policy = value_iteration(
    states, actions, transitions, rewards
)
print(f"Optimal Value Function: {optimal_value_function}")
print(f"Optimal Policy: {optimal_policy}")

Optimal Value Function: [65.53635465 80.09999102 88.99999102 99.99999102]
Optimal Policy: [0, 1, 1, 0]
