# MLRS2 Exercise - Tabular Q-Learning

In this exercise, you will implement the basic Q-Learning algorithm for a grid world environment.
First, let's install and import the required dependencies.

In [None]:
!pip install numpy
!pip install matplotlib

In [None]:
import numpy as np
import random

import matplotlib.pyplot as plt
import matplotlib.patches as patches

%matplotlib inline

Now we introduce a helper function which visualizes the current Q-table.

In [None]:
def visualize_grid(env, q_table, fig, ax, path=None):
    """Draw the grid world with the greedy value and action of every cell.

    Each cell is coloured by its role (goal and visited path cells cyan,
    walls gray, everything else white), annotated with the maximum Q-value
    of its state and, except for the goal, with a symbol for the greedy
    action. The agent's current cell is drawn in magenta on top.

    Args:
        env: Environment providing ``size``, ``goal``, ``state``,
            ``is_wall(x, y)`` and ``get_state_index(state)``.
        q_table: Array indexed as ``q_table[state_index, action]``.
        fig: Figure owning ``ax``; kept for interface compatibility, unused.
        ax: Matplotlib axes to draw on.
        path: Optional iterable of ``(x, y)`` cells to highlight.
    """
    # None instead of a mutable default; a set makes the per-cell lookup O(1).
    path_cells = {tuple(cell) for cell in path} if path is not None else set()

    v_table = np.max(q_table, axis=-1)
    action = np.argmax(q_table, axis=-1)

    action_symbs = ["^", "v", "<", ">"]

    for x in range(env.size):
        # x grows downwards in the grid, while the plot's vertical axis grows upwards.
        row = env.size - 1 - x
        for y in range(env.size):
            if (x, y) == env.goal:
                facecolor = 'cyan'
            elif env.is_wall(x, y):
                facecolor = 'gray'
            elif (x, y) in path_cells:
                facecolor = 'cyan'
            else:
                facecolor = 'white'
            ax.add_patch(patches.Rectangle((y, row), 1, 1, edgecolor='black', facecolor=facecolor))

            state_index = env.get_state_index((x, y))
            # Draw on the axes that was passed in rather than on pyplot's current axes.
            ax.text(y + 1 / 4, row + 1 / 4, str(np.round(v_table[state_index], 2)), fontsize=16)
            # The goal is terminal, so a greedy action there would be meaningless.
            if (x, y) != env.goal:
                ax.text(y + 2 / 5, row + 1 / 2, action_symbs[action[state_index]], fontsize=12)

    agent_x, agent_y = env.state
    ax.add_patch(patches.Rectangle((agent_y, env.size - 1 - agent_x), 1, 1, edgecolor='black', facecolor='magenta'))

    plt.show()

### Define Grid World Environment

The `GridWorld` class defines the environment of the agent. The `step` method can be used to perform an action in the environment. The `select_action` method takes the Q-Values for a state as input and performs an epsilon-greedy action selection. With a probability of $\epsilon$ a random action is sampled from the action space and returned, otherwise we select:

$$a_t = \arg\max_a Q(s_t,a)$$



In [None]:
class GridWorld:
    """Square grid world with rectangular walls and epsilon-greedy action selection.

    States are ``(x, y)`` tuples with ``0 <= x, y < size``. Every episode
    starts at ``(size - 1, 0)`` and ends when the agent reaches the goal at
    ``(0, size - 1)``. Actions are integers indexing ``self.actions``.

    Args:
        size: Side length of the grid.
        epsilon: Probability of picking a random action in ``select_action``.
        walls: Optional list of ``(x, y, x_extent, y_extent)`` tuples; a wall
            blocks every cell with ``x <= cell_x < x + x_extent`` and
            ``y <= cell_y < y + y_extent``.
    """

    # Change in (x, y) caused by each action index: up, down, left, right.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self, size=10, epsilon=0.01, walls=None):
        self.size = size
        self.goal = (0, self.size - 1)
        self.actions = ['up', 'down', 'left', 'right']
        self.action_space = len(self.actions)
        # A fresh list per instance; a mutable default would be shared by all environments.
        self.walls = [] if walls is None else walls
        self.epsilon = epsilon
        # Define the state right away so step() is usable before an explicit reset().
        self.reset()

    def reset(self):
        """Move the agent back to the start cell and return that state."""
        self.state = (self.size - 1, 0)

        return self.state

    def is_wall(self, x, y):
        """Return True if cell ``(x, y)`` lies inside any wall rectangle."""
        return any(
            wx <= x < wx + wwidth and wy <= y < wy + wheight
            for (wx, wy, wwidth, wheight) in self.walls
        )

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done)``.

        A move that would leave the grid or enter a wall keeps the agent in
        place, as does an unknown action index. Reaching the goal yields a
        reward of 1 and ``done=True``; every other step yields -0.01.
        """
        x, y = self.state
        dx, dy = self._MOVES.get(action, (0, 0))
        new_x, new_y = x + dx, y + dy
        inside = 0 <= new_x < self.size and 0 <= new_y < self.size
        if inside and not self.is_wall(new_x, new_y):
            x, y = new_x, new_y

        self.state = (x, y)
        if self.state == self.goal:
            return self.state, 1, True  # Reward of 1 for reaching the goal
        else:
            return self.state, -0.01, False  # Small negative reward to encourage faster solutions

    def get_state_index(self, state):
        """Map an ``(x, y)`` state to its row-major index ``x * size + y``."""
        x, y = state
        return x * self.size + y

    def select_action(self, q_values):
        """Pick an action index epsilon-greedily from the Q-values of one state.

        With probability ``self.epsilon`` a uniformly random action index is
        returned, otherwise the index of the largest Q-value.
        """
        #####################################SOLUTION#######################################################################
        # Epsilon greedy selection of action
        if random.random() < self.epsilon:
            action = random.randrange(self.action_space)
        else:
            action = np.argmax(q_values).item()
        #####################################SOLUTION#######################################################################
        ############ TODO #############
        # Implement epsilon-greedy action selection
        ############ TODO #############
        
        return action

In [None]:
# Hyperparameters
gamma = 0.99  # discount factor applied to the bootstrapped next-state value in the Q-target
epsilon = 0.1  # exploration probability handed to GridWorld for epsilon-greedy action selection
learning_rate = 0.1  # step size of the tabular Q-learning update
num_episodes = 1000  # number of training episodes to run

Now, define the walls within the grid world and set up the environment.

In [None]:
#################################SOLUTION###################################################################################
# Define walls within the gridworld
# Each wall is a tuple (x, y, x_extent, y_extent): GridWorld.is_wall treats every cell with
# x <= cell_x < x + x_extent and y <= cell_y < y + y_extent as blocked.
walls = [(1, 1, 1, 3), (8, 8, 2, 1), (2, 6, 3, 2), (7, 2, 1, 2)]

# Define grid world environment
# The exploration rate comes from the hyperparameter cell.
env = GridWorld(size=10, epsilon=epsilon, walls=walls)
##################################SOLUTION##################################################################################
######## TODO #########
# Define walls within the gridworld
#walls = []

# Define grid world environment
#env = ...
######## TODO #########

Initialize the Q-table with zeros. For each state in the environment, the Q-table will hold values for all actions of the action space in that state.

In [None]:
# Q-Learning Training
################################SOLUTION####################################################################################
# Define Q-table
# One row per grid cell (row index as returned by env.get_state_index) and one column per action,
# all initialised to zero.
q_table = np.zeros((env.size * env.size, env.action_space))
################################SOLUTION####################################################################################
######## TODO #########
# Define Q-table
#q_table = ...
######## TODO #########

### Q-Learning Algorithm Training and Testing

Implement the training loop for Q-learning.

In [None]:
# Track rewards and loss
# Both lists hold one per-step average for every episode.
rewards = []
losses = []

for episode in range(num_episodes):
    # Reset environment
    state = env.reset()

    done = False
    episode_steps = 0
    episode_reward = 0
    episode_loss = 0

    while not done:
        #########################################SOLUTION###################################################################
        # Get state index
        state_index = env.get_state_index(state)

        # select action
        action = env.select_action(q_table[state_index])

        # Environment step
        next_state, reward, done = env.step(action)
        next_state_index = env.get_state_index(next_state)

        # Q-learning update
        # A terminal state has no future return, so nothing is bootstrapped from it.
        next_value = 0.0 if done else np.max(q_table[next_state_index]).item()
        q_target = reward + gamma * next_value
        # The TD error is measured against the estimate *before* it is updated.
        td_error = q_target - q_table[state_index][action]
        q_table[state_index][action] = q_table[state_index][action] + learning_rate * td_error

        # Calculate q loss
        # Squared TD error of the pre-update estimate; computing it after the update
        # would shrink it by a factor of (1 - learning_rate) ** 2.
        q_loss = td_error ** 2

        # Transition to next state
        state = next_state
        ###########################################SOLUTION#################################################################
        ############ TODO #############
        # Get state index

        # select action

        # Environment step

        # Q-learning update

        # Calculate q loss

        # Transition to next state

        ############ TODO #############
        episode_reward += reward
        episode_loss += q_loss.item()
        episode_steps += 1

    # Report per-step averages so episodes of different length are comparable.
    episode_reward /= episode_steps
    episode_loss /= episode_steps

    rewards.append(episode_reward)
    losses.append(episode_loss)

    if episode % 100 == 0:
        print(f"Episode: {episode:<3}, Episode Loss: {episode_loss:.4f}, Episode Reward: {episode_reward:.4f}")

### Test the agent

Test the trained agent within the grid world to find the optimal sequence of actions to get from the start to the goal. During testing, we don't need to explore with the epsilon-greedy policy but can greedily choose the action with the highest Q-value estimate in each step.

In [None]:
def test_agent(env, q_table, max_steps=None):
    """Roll out the greedy policy of ``q_table`` once and visualize the path.

    The environment is reset, actions are chosen greedily (no exploration)
    until the goal is reached or the step limit is hit, and the visited cells
    are drawn with ``visualize_grid``. A one-line summary is printed.

    Args:
        env: Environment providing ``reset``, ``step``, ``get_state_index``
            and ``size``.
        q_table: Array indexed as ``q_table[state_index, action]``.
        max_steps: Upper bound on the rollout length. Defaults to
            ``env.size ** 2``: the environment and the greedy policy are both
            deterministic, so a rollout that reaches the goal never visits a
            cell twice, and a longer one is stuck in a loop.
    """
    if max_steps is None:
        max_steps = env.size * env.size

    state = env.reset()
    done = False
    steps = 0
    total_reward = 0
    path = [state]

    fig, ax = plt.subplots(figsize=(12, 12))
    ax.set_xlim(0, env.size)
    ax.set_ylim(0, env.size)
    ax.set_xticks(range(env.size + 1))
    ax.set_yticks(range(env.size + 1))
    ax.grid(True)

    # Without the step limit an untrained or looping greedy policy would never terminate.
    while not done and steps < max_steps:
        ############################################SOLUTION################################################################
        # Get state index
        state_index = env.get_state_index(state)

        # Greedily select action
        action = np.argmax(q_table[state_index]).item()

        # Perform environment step
        next_state, reward, done = env.step(action)

        # Transition to next state
        state = next_state
        #############################################SOLUTION###############################################################
        ############ TODO #############
        # Get state index

        # Greedily select action

        # Perform environment step

        # Transition to next state
        ############ TODO ##############
        path.append(state)
        

        total_reward += reward
        steps += 1

    # Visualize grid
    visualize_grid(env, q_table, fig, ax, path)

    # The reported value is the average reward per step; guard against a zero step limit.
    if steps:
        total_reward /= steps
    if done:
        print(f"Reached the goal in {steps} steps with a total reward of {total_reward:.4f}.")
    else:
        print(f"Stopped after {steps} steps without reaching the goal (step limit: {max_steps}).")


In [None]:
# Test the trained agent and visualize its path:
# one greedy rollout from the start cell using the Q-table learned above.
test_agent(env, q_table)

Finally, we plot the recorded data of the training.

In [None]:
# Two side-by-side panels: per-episode reward on the left, per-episode Q-loss on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Both curves share the same x-axis: the episode number
episodes = range(num_episodes)

# (axes, recorded series, panel title, y-axis label) for each panel
panels = (
    (ax1, rewards, 'Reward over Episodes', 'Episode Reward'),
    (ax2, losses, 'Q-Loss over Episodes', 'Episode Q-Loss'),
)

for axis, series, title, ylabel in panels:
    axis.plot(episodes, series)
    axis.set_title(title)
    axis.set_xlabel('Episode')
    axis.set_ylabel(ylabel)

# Tighten the layout so labels do not overlap, then render the figure
plt.tight_layout()
plt.show()
