<a href="https://colab.research.google.com/github/hariniiy/RL_1796/blob/main/Untitled34.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import numpy as np
import random
import gymnasium as gym

def q_learning(env, name="Env", episodes=5000, max_steps=100,
               alpha=0.8, gamma=0.95,
               epsilon=1.0, min_epsilon=0.01, decay_rate=0.005,
               render=False):
    """
    Q-learning for discrete-action Gymnasium environments.
    Handles updated API (reset returns tuple, step returns terminated/truncated).
    """
    state_size = env.observation_space.n
    action_size = env.action_space.n
    Q = np.zeros((state_size, action_size))

    def get_epsilon(ep):
        return min_epsilon + (epsilon - min_epsilon) * np.exp(-decay_rate * ep)

    for ep in range(episodes):
        state, _ = env.reset()
        done = False
        total_reward = 0
        eps = get_epsilon(ep)

        for _ in range(max_steps):
            if random.random() < eps:
                action = env.action_space.sample()
            else:
                action = np.argmax(Q[state])

            new_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated

            # Q-learning update
            Q[state, action] += alpha * (
                reward + gamma * np.max(Q[new_state]) - Q[state, action]
            )
            total_reward += reward
            state = new_state

            if done:
                break

        # Optionally, print episode-by-episode reward
        # if (ep + 1) % (episodes // 5) == 0:
        #     print(f"{name}: Episode {ep+1}/{episodes}, reward: {total_reward:.2f}")

    if render:
        print(f"\n{name}: Testing learned policy")
        for _ in range(3):
            state, _ = env.reset()
            done = False
            env.render()
            while not done:
                action = np.argmax(Q[state])
                state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                env.render()
            print("Episode reward:", reward)
    return Q

if __name__ == "__main__":
    # FrozenLake (deterministic)
    print("=== Training on FrozenLake-v1 (deterministic) ===")
    env1 = gym.make("FrozenLake-v1", is_slippery=False, render_mode="ansi")
    Q1 = q_learning(env1, name="FrozenLake", episodes=10000, alpha=0.7, gamma=0.95,
                    epsilon=1.0, min_epsilon=0.01, decay_rate=0.001, render=True)
    env1.close()

    # Taxi-v3
    print("\n=== Training on Taxi-v3 ===")
    env2 = gym.make("Taxi-v3", render_mode="ansi")
    Q2 = q_learning(env2, name="Taxi-v3", episodes=20000, alpha=0.9, gamma=0.95,
                    epsilon=1.0, min_epsilon=0.01, decay_rate=0.0005, render=True)
    env2.close()



=== Training on FrozenLake-v1 (deterministic) ===


  return datetime.utcnow().replace(tzinfo=utc)



FrozenLake: Testing learned policy
Episode reward: 1.0
Episode reward: 1.0
Episode reward: 1.0

=== Training on Taxi-v3 ===

Taxi-v3: Testing learned policy
Episode reward: 20
Episode reward: 20
Episode reward: 20
