In [1]:
import gym
import numpy as np
import random
from collections import defaultdict

env = gym.make("FrozenLake-v1", is_slippery=True)  # stochastic version

# Hyperparameters
alpha = 0.1
gamma = 0.99
epsilon = 0.1
episodes = 10000
max_steps = 100

def epsilon_greedy(Q, state, n_actions):
    if random.random() < epsilon:
        return random.randint(0, n_actions - 1)
    else:
        return np.argmax(Q[state])

# Create Q-table
def create_Q():
    return defaultdict(lambda: np.zeros(env.action_space.n))

# --- SARSA ---
def train_sarsa():
    Q = create_Q()
    for ep in range(episodes):
        state = env.reset()[0]
        action = epsilon_greedy(Q, state, env.action_space.n)

        for _ in range(max_steps):
            next_state, reward, done, _, _ = env.step(action)
            next_action = epsilon_greedy(Q, next_state, env.action_space.n)

            td_target = reward + gamma * Q[next_state][next_action]
            td_error = td_target - Q[state][action]
            Q[state][action] += alpha * td_error

            state, action = next_state, next_action
            if done:
                break
    return Q

# --- Q-learning ---
def train_q_learning():
    Q = create_Q()
    for ep in range(episodes):
        state = env.reset()[0]

        for _ in range(max_steps):
            action = epsilon_greedy(Q, state, env.action_space.n)
            next_state, reward, done, _, _ = env.step(action)

            best_next = np.max(Q[next_state])
            td_target = reward + gamma * best_next
            td_error = td_target - Q[state][action]
            Q[state][action] += alpha * td_error

            state = next_state
            if done:
                break
    return Q

# Evaluate policy
def evaluate(Q):
    total_reward = 0
    episodes_eval = 100
    for _ in range(episodes_eval):
        state = env.reset()[0]
        done = False
        while not done:
            action = np.argmax(Q[state])
            state, reward, done, _, _ = env.step(action)
            total_reward += reward
    return total_reward / episodes_eval

# Run both
Q_sarsa = train_sarsa()
Q_qlearn = train_q_learning()

print("SARSA average success rate:", evaluate(Q_sarsa))
print("Q-learning average success rate:", evaluate(Q_qlearn))


  if not isinstance(terminated, (bool, np.bool8)):


SARSA average success rate: 0.0
Q-learning average success rate: 0.83
