# Q-Learning
- Max(state,action)

```
max_new_state = np.max(q_table[new_state,:])

q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * (reward + gamma * new_state_max - q_table[state, action])
```

# SARSA -> (State,Action,Reward,State2,Action2)
- Next Action

```
sarsa_new_state = q_table(new_state,action2)

q_table[state, action] = (1 - alpha) * q_table[state, action] + alpha * (reward + gamma * sarsa_new_state - q_table[state, action])
```

# Q-Learning
### OFF Policy: In this, the Learning agent learns the value function according to the action derived from another Policy

# SARSA -> State-Action-Reward-State2-Action2
### ON Policy: In this, the Learning agent learns the value function according to the current action derived from the  Policy currently being used

# SARSA VS Q-LEARNING

| SARSA (On Policy)                    | Q-Learning (Off Policy)                |
| ------------------------------------ | -------------------------------------- |
| Slow                                 | Fast                                   |
| Relatively less risk of falling      | Relatively high risk of falling        |
| Use if cost of error is quite high   | Use if cost of error is not verry high |
| EX: Self-driving (the error is high) | EX: Chess (the error is low)           |
| Increase in Episodes will increase the accuracy of sarsa as its On Policy Learning | Increase in Episodes will also increase the accuracy of Q-Learning but after a point it will saturate |


In [27]:
import numpy as np
import gymnasium as gym
import random

In [28]:
env = gym.make("FrozenLake-v1", is_slippery=False,render_mode="ansi")

In [29]:
action_space_size =  env.action_space.n
state_space_size = env.observation_space.n 

In [30]:
q_table = np.zeros((state_space_size, action_space_size))

In [31]:
total_episodes = 20000
learning_rate = 0.2
max_steps = 100
gamma = 0.99

epsilon = 1
max_epsilon = 1
min_epsilon = 0.01
decay_rate = 0.001

In [32]:
rewards = []

for episode in range(total_episodes):
    state, _ = env.reset()
    step = 0
    done = False
    total_rewards = 0

    for step in range(max_steps):
        if random.uniform(0, 1) > epsilon:
            action = np.argmax(q_table[state, :])  # Exploit
        else:
            action = env.action_space.sample()  # Explore

        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        if random.uniform(0, 1) > epsilon:
            new_action = np.argmax(q_table[new_state, :])  # Exploit
        else:
            new_action = env.action_space.sample()  # Explore

        sarsa_new_state = np.max(q_table[new_state, new_action])

        q_table[state, action] = q_table[state, action] + learning_rate * (
            reward + gamma * sarsa_new_state - q_table[state, action]
        )
        total_rewards += reward

        state = new_state
        if done:
            break

    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

print("Score:", str(sum(rewards) / total_episodes))
print(q_table)

Score: 0.9193
[[7.34887414e-01 9.46759887e-01 6.83664039e-01 6.84816585e-01]
 [6.83598065e-01 0.00000000e+00 5.42117648e-01 5.45481798e-01]
 [6.70929601e-01 8.11847235e-02 5.28655322e-02 8.09794681e-02]
 [1.49778143e-01 0.00000000e+00 3.77530996e-05 9.12362724e-06]
 [7.73715437e-01 9.59741089e-01 0.00000000e+00 7.73314991e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 9.80073426e-01 0.00000000e+00 2.74639858e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [7.75284392e-01 0.00000000e+00 9.69944408e-01 8.06340670e-01]
 [9.00493251e-01 7.84080000e-01 9.79992679e-01 0.00000000e+00]
 [9.70076887e-01 9.89983129e-01 0.00000000e+00 9.70131067e-01]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [0.00000000e+00 9.79866798e-01 9.90000000e-01 9.64623434e-01]
 [9.80085699e-01 9.89930335e-01 1.00000000e+00 9.80048462e-01]
 [0.00000000e+00 0.00000000e+00 0.0000000

# AI GENERATED

In [33]:
rewards = []

for episode in range(total_episodes):
    state, _ = env.reset()
    step = 0
    done = False
    total_rewards = 0

    # Choose action from state using epsilon-greedy policy
    if random.uniform(0, 1) > epsilon:
        action = np.argmax(q_table[state, :])  # Exploit
    else:
        action = env.action_space.sample()  # Explore

    for step in range(max_steps):
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Choose next action from new_state using epsilon-greedy policy
        if random.uniform(0, 1) > epsilon:
            new_action = np.argmax(q_table[new_state, :])  # Exploit
        else:
            new_action = env.action_space.sample()  # Explore

        # SARSA update rule
        q_table[state, action] = q_table[state, action] + learning_rate * (
            reward + gamma * q_table[new_state, new_action] - q_table[state, action]
        )
        total_rewards += reward

        state = new_state
        action = new_action  # Update action to the new action
        if done:
            break

    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    rewards.append(total_rewards)

print("Score:", str(sum(rewards) / total_episodes))
print(q_table)

Score: 0.9228
[[0.73661021 0.89355217 0.75394609 0.73725602]
 [0.87716641 0.         0.74896271 0.74873778]
 [0.74967779 0.79421415 0.74985245 0.74809438]
 [0.74964372 0.         0.49552629 0.57573266]
 [0.82658133 0.90257795 0.         0.77851939]
 [0.         0.         0.         0.        ]
 [0.         0.98004468 0.         0.83413197]
 [0.         0.         0.         0.        ]
 [0.72456904 0.         0.94425273 0.72263016]
 [0.69852575 0.9801     0.78243454 0.        ]
 [0.96668349 0.98900533 0.         0.96655416]
 [0.         0.         0.         0.        ]
 [0.         0.         0.         0.        ]
 [0.         0.97157932 0.99       0.9419982 ]
 [0.97949867 0.99       1.         0.97510874]
 [0.         0.         0.         0.        ]]
