**Q-LEARNING ON FROZEN LAKE (Deterministic Environment)**

In [1]:
!pip install gymnasium numpy matplotlib --quiet


Import Required Libraries

In [3]:
import numpy as np
import gym
import random

Create Environment

In [4]:
# FrozenLake-v1 has a 4x4 grid world:
# S - Start, F - Frozen (safe), H - Hole (danger), G - Goal
# Setting is_slippery=False makes it deterministic (no random slips)
env = gym.make("FrozenLake-v1", is_slippery=False)

Initialize State & Action Space

In [5]:
action_space_size = env.action_space.n       # Number of possible actions (4)
state_space_size = env.observation_space.n   # Number of possible states (16)

Initialize Q-Table

In [6]:
# A Q-table stores the value of each state‚Äìaction pair
# Rows = states, Columns = actions
qtable = np.zeros((state_space_size, action_space_size))

Define Hyperparameters

In [14]:
total_episodes = 5000     # Total number of training episodes
max_steps = 100           # Max steps per episode

learning_rate = 0.8       # Alpha ‚Äì learning rate
gamma = 0.95              # Gamma ‚Äì discount factor

Exploration Parameters

In [15]:
epsilon = 1.0             # Exploration rate
max_epsilon = 1.0         # Max exploration rate
min_epsilon = 0.01        # Min exploration rate
decay_rate = 0.0005       # Exponential decay rate for epsilon

Training Preparation

In [16]:
rewards = []              # Track rewards per episode

Q-LEARNING TRAINING LOOP

In [18]:
if not hasattr(np, "bool8"):  # Compatibility for NumPy 2.0+
    np.bool8 = np.bool_
for episode in range(total_episodes):
    # Reset the environment at the start of each episode
    state_return = env.reset()
    state = state_return[0] if isinstance(state_return, tuple) else state_return

    done = False
    total_rewards = 0

    for step in range(max_steps):
        # -----------------------------
        # üîπ Exploration‚ÄìExploitation Trade-off
        # -----------------------------
        if random.uniform(0, 1) > epsilon:
            # Exploitation: choose best action from Q-table
            action = np.argmax(qtable[state, :])
        else:
            # Exploration: choose a random action
            action = env.action_space.sample()

        # -----------------------------
        # üîπ Perform Action & Observe Result
        # -----------------------------
        step_result = env.step(action)
        if len(step_result) == 5:
            new_state, reward, done, truncated, info = step_result
        else:
            new_state, reward, done, info = step_result
            truncated = False

        # -----------------------------
        # üîπ Reward Shaping
        # -----------------------------
        # Give higher reward for reaching goal to encourage success
        if reward == 1:
            reward = 10

        # -----------------------------
        # üîπ Update Q-Table (Bellman Equation)
        # -----------------------------
        qtable[state, action] = qtable[state, action] + learning_rate * (
            reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action]
        )

        total_rewards += reward
        state = new_state

        # Stop episode if goal or hole reached
        if done or truncated:
            break

    # -----------------------------
    # üîπ Epsilon Decay (reduces exploration over time)
    # -----------------------------
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

    # Store total rewards for analysis
    rewards.append(total_rewards)

TRAINING RESULTS

In [17]:

average_score = sum(rewards) / total_episodes
print("===============================================================")
print(f"Training Complete! Average Score over time: {average_score:.5f}")
print("===============================================================")
print("Final Q-Table (State-Action Values):")
print(np.array2string(qtable, formatter={'float_kind': lambda x: f'{x:.8f}'}))
print("===============================================================")


Training Complete! Average Score over time: 0.00000
Final Q-Table (State-Action Values):
[[7.35091891 7.73780937 7.73780937 7.35091891]
 [7.35091891 0.00000000 8.14506250 7.73780937]
 [7.73780937 8.57375000 7.73780937 8.14506250]
 [8.14506250 0.00000000 7.73780937 7.73780937]
 [7.73780937 8.14506250 0.00000000 7.35091891]
 [0.00000000 0.00000000 0.00000000 0.00000000]
 [0.00000000 9.02500000 0.00000000 8.14506250]
 [0.00000000 0.00000000 0.00000000 0.00000000]
 [8.14506250 0.00000000 8.57375000 7.73780937]
 [8.14506250 9.02500000 9.02500000 0.00000000]
 [8.57375000 9.50000000 0.00000000 8.57375000]
 [0.00000000 0.00000000 0.00000000 0.00000000]
 [0.00000000 0.00000000 0.00000000 0.00000000]
 [0.00000000 9.02500000 9.50000000 8.57375000]
 [9.02500000 9.50000000 10.00000000 9.02500000]
 [0.00000000 0.00000000 0.00000000 0.00000000]]


In [30]:

print("\nüéÆ Testing the trained agent...\n")

for episode in range(5):
    state_return = env.reset()
    if isinstance(state_return, tuple):
        state = state_return[0]
    else:
        state = state_return

    print("****************************************************")
    print(f"EPISODE {episode + 1}")
    step = 0
    done = False

    for step in range(max_steps):
        # Choose best action from Q-table
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)

        # Uncomment below line to render (optional visualization)
        # env.render()

        if done:
            if new_state == 15:
                print("üèÜ We reached our Goal!")
            else:
                print("‚ò†Ô∏è We fell into a hole!")
            print("Number of steps:", step)
            break

        state = new_state

env.close()



üéÆ Testing the trained agent...

****************************************************
EPISODE 1
üèÜ We reached our Goal!
Number of steps: 5
****************************************************
EPISODE 2
üèÜ We reached our Goal!
Number of steps: 5
****************************************************
EPISODE 3
üèÜ We reached our Goal!
Number of steps: 5
****************************************************
EPISODE 4
üèÜ We reached our Goal!
Number of steps: 5
****************************************************
EPISODE 5
üèÜ We reached our Goal!
Number of steps: 5
