<a href="https://colab.research.google.com/github/hariniiy/RL_1796/blob/main/ASGN_4_1796.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import gymnasium as gym
import numpy as np
from collections import defaultdict

# ---------------------------
# Q-Learning
# ---------------------------
def q_learning(env, num_episodes=50000, alpha=0.1, gamma=0.99, epsilon=0.1):
    """
    Tabular Q-Learning for environments with a discrete action space.

    Runs ``num_episodes`` episodes, picking actions epsilon-greedily with
    respect to the current Q-table and applying the off-policy update
    ``Q[s][a] += alpha * (target - Q[s][a])`` after every step.

    Args:
        env: Environment exposing ``action_space.n``, ``reset()`` returning
            ``(state, info)`` and ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``. States
            are used as dictionary keys, so they must be hashable.
        num_episodes: Number of training episodes to run.
        alpha: Learning rate (step size of each update).
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Probability of taking a uniformly random action.

    Returns:
        A ``(policy, Q)`` tuple. ``Q`` is a ``defaultdict`` mapping each
        state to an array of action values; ``policy`` is a plain dict
        mapping every state stored in ``Q`` to its greedy action.
    """
    n_actions = env.action_space.n
    Q = defaultdict(lambda: np.zeros(n_actions))

    def epsilon_greedy(state):
        # Explore with probability epsilon, otherwise exploit current values.
        if np.random.rand() < epsilon:
            return np.random.choice(n_actions)
        return np.argmax(Q[state])

    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False

        while not done:
            action = epsilon_greedy(state)
            next_state, reward, terminated, truncated, _ = env.step(action)

            # Q-learning update (off-policy).
            # A terminated transition has no future return, so the target is
            # the reward alone; this also keeps terminal states out of Q.
            # A truncated (e.g. time-limited) episode did not really end, so
            # it still bootstraps from the next state's best value.
            if terminated:
                target = reward
            else:
                target = reward + gamma * np.max(Q[next_state])
            Q[state][action] += alpha * (target - Q[state][action])

            state = next_state
            done = terminated or truncated

    # Derive final greedy policy
    policy = {state: np.argmax(values) for state, values in Q.items()}

    return policy, Q

# ---------------------------
# Run Example with FrozenLake
# ---------------------------
# Build the FrozenLake environment with the slippery option switched on.
env = gym.make("FrozenLake-v1", is_slippery=True)

# Learn a Q-table and its greedy policy with tabular Q-learning.
optimal_policy, Q = q_learning(
    env,
    num_episodes=100000,
    alpha=0.1,
    gamma=0.99,
    epsilon=0.1,
)

# Show at most the first ten (state, action) pairs, in insertion order.
print("Q-Learning Optimal Policy (sample states):")
for k, v in list(optimal_policy.items())[0:10]:
    print(f"State: {k}, Best Action: {v}")

# Optional: Evaluate learned policy
def evaluate_policy(policy, env, num_episodes=1000):
    """
    Estimate the average undiscounted episode reward of ``policy`` on ``env``.

    Args:
        policy: Mapping from state to action. A state missing from the
            mapping falls back to a uniformly random action.
        env: Environment exposing ``action_space.n``, ``reset()`` returning
            ``(state, info)`` and ``step(action)`` returning
            ``(next_state, reward, terminated, truncated, info)``.
        num_episodes: Number of evaluation episodes to run.

    Returns:
        The sum of all rewards collected divided by ``num_episodes``.

    Raises:
        ZeroDivisionError: If ``num_episodes`` is 0.
    """
    n_actions = env.action_space.n
    total_rewards = 0.0
    for _ in range(num_episodes):
        state, _ = env.reset()
        done = False
        while not done:
            # Draw a random action only when the policy has no entry for this
            # state; dict.get(key, default) would evaluate the random draw on
            # every step and needlessly advance the global NumPy RNG.
            if state in policy:
                action = policy[state]
            else:
                action = np.random.choice(n_actions)
            state, reward, terminated, truncated, _ = env.step(action)
            total_rewards += reward
            done = terminated or truncated
    return total_rewards / num_episodes

# Score the learned greedy policy over 1000 evaluation episodes.
avg_reward = evaluate_policy(policy=optimal_policy, env=env, num_episodes=1000)
print("\nAverage Reward over 1000 episodes:", avg_reward)
