### **Reinforcement Learning (Grid World)**

In [1]:
import pandas as pd
import numpy as np

### **Environment Setup**

In [2]:
# Side length of the square grid world; states are (x, y) pairs in [0, grid_size).
grid_size = 5
# Action labels; an action's position in this list is the index used in the Q-tables.
actions = ['up', 'down', 'left', 'right']
action_size = len(actions)

# Seed NumPy's global RNG so the random draws made during training repeat
# exactly on every Restart Kernel -> Run All.
SEED = 42
np.random.seed(SEED)

# One zero-initialised action-value table per algorithm, indexed [x, y, action].
Q_qlearning = np.zeros((grid_size, grid_size, action_size))
Q_sarsa = np.zeros((grid_size, grid_size, action_size))


### **Helper Functions**

In [3]:
def get_next_state(state, action, size=None):
    """Return the grid cell reached by taking `action` from `state`.

    A move that would step outside the grid is ignored, so the returned
    position equals the current one.

    Parameters
    ----------
    state : tuple of int
        Current position as ``(x, y)``.
    action : int
        Action index: 0 decrements x, 1 increments x, 2 decrements y,
        3 increments y (by position these match 'up', 'down', 'left',
        'right' in the notebook's ``actions`` list). Any other value
        leaves the position unchanged.
    size : int, optional
        Side length of the square grid. When omitted, the notebook-level
        ``grid_size`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    tuple of int
        The resulting ``(x, y)`` position.
    """
    if size is None:
        size = grid_size
    last_index = size - 1

    x, y = state
    # `action` is a single value, so at most one branch can apply.
    if action == 0 and x > 0:
        x -= 1
    elif action == 1 and x < last_index:
        x += 1
    elif action == 2 and y > 0:
        y -= 1
    elif action == 3 and y < last_index:
        y += 1
    return x, y

def choose_action(Q, state, epsilon):
    """Select an action index for `state` using an epsilon-greedy rule.

    Parameters
    ----------
    Q : numpy.ndarray
        Action-value table indexed as ``Q[x, y, action]``.
    state : tuple of int
        Current position as ``(x, y)``.
    epsilon : float
        Probability of picking a uniformly random action instead of the
        greedy one.

    Returns
    -------
    int
        The chosen action index. On the greedy path, ties resolve to the
        lowest index because ``np.argmax`` returns the first maximum.
    """
    action_values = Q[state[0], state[1]]
    if np.random.rand() < epsilon:
        # Size the draw from the table row itself so the sampled index is
        # always valid for the Q-table that was passed in.
        return np.random.randint(len(action_values))
    # Cast so both branches return a plain Python int.
    return int(np.argmax(action_values))


### **Q-Learning**

In [4]:
# Hyperparameters: learning rate, discount factor, exploration probability
# and number of training episodes. The SARSA cell below reads these same names.
alpha = 0.1
gamma = 0.9
epsilon = 0.1
episodes = 500

# Derive the goal from grid_size instead of hard-coding (4, 4): with any other
# grid size a literal goal is either unreachable (the loop never ends) or is
# not the far corner.
start_state = (0, 0)
goal_state = (grid_size - 1, grid_size - 1)

for _ in range(episodes):
    state = start_state

    # An episode runs until the agent steps onto the goal cell.
    while state != goal_state:
        action = choose_action(Q_qlearning, state, epsilon)
        next_state = get_next_state(state, action)

        # +10 for reaching the goal, -1 for every other step.
        reward = 10 if next_state == goal_state else -1

        # Q-learning target: bootstrap from the largest action value in the
        # next state, regardless of which action is actually taken next.
        td_target = reward + gamma * np.max(Q_qlearning[next_state[0], next_state[1]])
        Q_qlearning[state[0], state[1], action] += alpha * (
            td_target - Q_qlearning[state[0], state[1], action]
        )

        state = next_state


### **SARSA**

In [5]:
# Derive the goal from grid_size instead of hard-coding (4, 4): with any other
# grid size a literal goal is either unreachable (the loop never ends) or is
# not the far corner.
start_state = (0, 0)
goal_state = (grid_size - 1, grid_size - 1)

# alpha, gamma, epsilon and episodes are the values set in the Q-learning cell.
for _ in range(episodes):
    state = start_state
    action = choose_action(Q_sarsa, state, epsilon)

    # An episode runs until the agent steps onto the goal cell.
    while state != goal_state:
        next_state = get_next_state(state, action)
        # +10 for reaching the goal, -1 for every other step.
        reward = 10 if next_state == goal_state else -1
        next_action = choose_action(Q_sarsa, next_state, epsilon)

        # SARSA target: bootstrap from the value of the action that will
        # actually be taken next (chosen by the same epsilon-greedy rule).
        td_target = reward + gamma * Q_sarsa[next_state[0], next_state[1], next_action]
        Q_sarsa[state[0], state[1], action] += alpha * (
            td_target - Q_sarsa[state[0], state[1], action]
        )

        state = next_state
        action = next_action


### **Policy Comparison**

In [6]:
# Show, for each algorithm, the index of the highest-valued action in every
# grid cell (rows follow x, columns follow y).
for heading, q_table in (
    ("Q-Learning Policy (max action per state):", Q_qlearning),
    ("\nSARSA Policy (max action per state):", Q_sarsa),
):
    print(heading)
    print(q_table.argmax(axis=2))

Q-Learning Policy (max action per state):
[[3 1 1 1 1]
 [3 3 1 1 1]
 [3 3 3 3 1]
 [2 3 1 3 1]
 [3 3 3 3 0]]

SARSA Policy (max action per state):
[[3 3 1 1 1]
 [1 1 1 1 1]
 [3 3 1 1 1]
 [3 3 3 3 1]
 [1 3 3 3 0]]
