<h1>Q-Learning: From Theory to Implementation</h1>

In this notebook we introduce the fundamental Q-learning algorithm, one of the core methods in Reinforcement Learning (RL).

In [14]:
import numpy as np
import random
import tqdm
from tqdm import tqdm

In [15]:
class GridWorld:
    """Deterministic square grid world with a single rewarding goal cell.

    The agent starts in the top-left cell ``(0, 0)`` and the goal is the
    bottom-right cell ``(size - 1, size - 1)``. States are exposed to callers
    as flat integer indices (row-major), so they can index a Q-table directly.

    Actions:
        0 -- up    (row - 1)
        1 -- down  (row + 1)
        2 -- left  (column - 1)
        3 -- right (column + 1)

    Moves that would leave the grid are clipped, i.e. the agent stays on the
    border cell. Any other action value leaves the position unchanged.
    """

    def __init__(self, size=5):
        """Create a ``size`` x ``size`` grid; call ``reset()`` before stepping."""
        self.size = size
        self.start = (0, 0)
        self.goal = (size - 1, size - 1)
        # Current (row, column) of the agent; None until reset() is called.
        self.state = None

        self.action_space = [0, 1, 2, 3]
        self.n_actions = len(self.action_space)

    def to_index(self, state):
        """Convert a ``(row, column)`` pair to its row-major flat index."""
        row, col = state
        return row * self.size + col

    def reset(self, seed=None):
        """Move the agent back to the start cell and return its flat index.

        Args:
            seed: Accepted so that Gym-style calls such as
                ``env.reset(seed=...)`` (as made by ``evaluate_agent``) do not
                raise ``TypeError``. The environment has no randomness, so the
                value is ignored.

        Returns:
            The flat index of the start state.
        """
        self.state = self.start
        return self.to_index(self.state)

    def step(self, action):
        """Apply ``action`` and return ``(state_index, reward, terminated, truncated)``.

        The reward is 1 on the transition that reaches the goal and 0
        otherwise. ``truncated`` is always ``False``: the environment itself
        imposes no step limit.
        """
        row, col = self.state

        if action == 0:  # up
            row = max(0, row - 1)
        elif action == 1:  # down
            row = min(self.size - 1, row + 1)
        elif action == 2:  # left
            col = max(0, col - 1)
        elif action == 3:  # right
            col = min(self.size - 1, col + 1)

        new_state = (row, col)
        self.state = new_state
        state_index = self.to_index(new_state)

        terminated = (new_state == self.goal)
        reward = 1 if terminated else 0
        truncated = False

        return state_index, reward, terminated, truncated

    def render(self):
        """Print the grid: ``A`` marks the agent, ``G`` the goal, ``.`` empty cells."""
        grid = np.full((self.size, self.size), ".", dtype=str)

        row, col = self.state
        goal_row, goal_col = self.goal

        grid[row, col] = "A"
        # Drawn last, so the goal marker wins when the agent stands on it.
        grid[goal_row, goal_col] = "G"

        print("\n".join(" ".join(line) for line in grid))
        print()


In [16]:
# Build the default 5x5 grid, place the agent on the start cell, and print the board.
env = GridWorld()
state = env.reset()
env.render()

A . . . .
. . . . .
. . . . .
. . . . .
. . . . G



In [17]:
def initialize_q_table(state_space, action_space):
  """Return a zero-filled Q-table of shape ``(state_space, action_space)``.

  Args:
    state_space: Number of rows (one per state).
    action_space: Number of columns (one per action).
  """
  table_shape = (state_space, action_space)
  return np.zeros(table_shape)

In [18]:
# One Q-table row per grid cell (size x size cells) and one column per action.
state_space = env.size * env.size
action_space = env.n_actions

# Start from an all-zero table.
Qtable_gridworld = initialize_q_table(state_space, action_space)

In [19]:
def greedy_policy(Qtable, state):
  """Return the index of the highest-valued action for ``state``.

  Ties are resolved by ``np.argmax``, i.e. the lowest action index wins.
  """
  action_values = Qtable[state]
  return np.argmax(action_values)

In [20]:
def epsilon_greedy_policy(Qtable, state, epsilon):
  """Pick an action for ``state`` with an epsilon-greedy rule.

  A number is drawn uniformly from [0, 1]. If it exceeds ``epsilon`` the
  greedy action is returned (exploitation); otherwise an action index is
  drawn uniformly at random (exploration).

  Args:
    Qtable: Table indexed as ``Qtable[state][action]``.
    state: Row index of the current state.
    epsilon: Exploration probability.

  Returns:
    The chosen action index.
  """
  if random.uniform(0, 1) > epsilon:
    return greedy_policy(Qtable, state)

  # Take the number of actions from the table row instead of reaching for the
  # notebook-level `env`, so the policy works for whichever Q-table it is given.
  n_actions = len(Qtable[state])
  return random.randrange(n_actions)

In [21]:
# Training parameters
# Number of episodes run by train().
n_training_episodes = 5000
# Step size applied to the temporal-difference error in the Q-table update.
learning_rate = 0.7

# Evaluation parameters
# Number of greedy episodes played by evaluate_agent().
n_eval_episodes = 100

# Environment parameters
# Upper bound on the number of steps in a single episode (training and evaluation).
max_steps = 99
# Discount factor multiplying the best next-state value in the Q-table update.
gamma = 0.95
# Per-episode seeds handed to evaluate_agent(); an empty list means reset() is called without a seed.
eval_seed = []

# Exploration parameters
# epsilon decays exponentially per episode from max_epsilon toward min_epsilon at rate decay_rate.
max_epsilon = 1.0
min_epsilon = 0.05
decay_rate = 0.0005

In [22]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable,
          learning_rate=None, gamma=None):
  """Run tabular Q-learning and return the updated Q-table.

  For each episode the exploration rate is
  ``min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)``.
  Actions are chosen with ``epsilon_greedy_policy`` and the table is updated
  after every step with

      Q[s][a] += learning_rate * (reward + gamma * max_a' Q[s'][a'] - Q[s][a])

  where the bootstrap term is dropped on a terminal transition.

  Args:
    n_training_episodes: Number of episodes to run.
    min_epsilon: Lower bound of the exploration rate.
    max_epsilon: Exploration rate at episode 0.
    decay_rate: Exponential decay rate of the exploration rate per episode.
    env: Environment with ``reset()`` returning a state index and
      ``step(action)`` returning ``(state, reward, terminated, truncated)``.
    max_steps: Maximum number of steps per episode.
    Qtable: Table indexed as ``Qtable[state][action]``; modified in place.
    learning_rate: Step size. When ``None`` the notebook-level
      ``learning_rate`` variable is used, which is what this function read
      before the parameter existed.
    gamma: Discount factor. When ``None`` the notebook-level ``gamma``
      variable is used.

  Returns:
    The same ``Qtable`` object that was passed in.
  """
  # Backward compatibility: earlier calls relied on these two values being
  # defined at notebook level, so fall back to them when not passed explicitly.
  if learning_rate is None:
    learning_rate = globals()["learning_rate"]
  if gamma is None:
    gamma = globals()["gamma"]

  for episode in tqdm(range(n_training_episodes)):
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    state = env.reset()

    for _ in range(max_steps):
      action = epsilon_greedy_policy(Qtable, state, epsilon)

      new_state_index, reward, terminated, truncated = env.step(action)

      # A terminal state has no future return, so do not bootstrap from it.
      # Truncation is not termination: the next state still has value there.
      future_value = 0.0 if terminated else np.max(Qtable[new_state_index])
      td_target = reward + gamma * future_value
      Qtable[state][action] += learning_rate * (td_target - Qtable[state][action])

      if terminated or truncated:
        break

      state = new_state_index
  return Qtable

In [23]:
# Run Q-learning; train() updates the table it is given and returns that same object.
Qtable_gridworld = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable_gridworld)

100%|██████████| 5000/5000 [00:01<00:00, 3978.22it/s]


In [24]:
# Display the learned action values: one row per state index, one column per action.
Qtable_gridworld

array([[0.66342043, 0.6983373 , 0.66342043, 0.6983373 ],
       [0.6983373 , 0.73509189, 0.66342043, 0.73509189],
       [0.73509189, 0.77378094, 0.6983373 , 0.77378094],
       [0.77378094, 0.81450625, 0.73509189, 0.81450625],
       [0.81450625, 0.857375  , 0.77378094, 0.81450625],
       [0.66342043, 0.73509189, 0.6983373 , 0.73509189],
       [0.6983373 , 0.77378094, 0.6983373 , 0.77378094],
       [0.73509189, 0.81450625, 0.73509189, 0.81450625],
       [0.77378094, 0.857375  , 0.77378094, 0.857375  ],
       [0.81450625, 0.9025    , 0.81450625, 0.857375  ],
       [0.6983373 , 0.77378094, 0.73509189, 0.77378094],
       [0.73509189, 0.81450625, 0.73509189, 0.81450625],
       [0.77378094, 0.857375  , 0.77378094, 0.857375  ],
       [0.81450625, 0.9025    , 0.81450625, 0.9025    ],
       [0.857375  , 0.95      , 0.857375  , 0.9025    ],
       [0.73509189, 0.81450625, 0.77378094, 0.81450625],
       [0.77378094, 0.857375  , 0.77378094, 0.857375  ],
       [0.81450625, 0.9025    ,

In [25]:
def evaluate_agent(env, max_steps, n_eval_episodes, Q, seed):
  """Play ``n_eval_episodes`` greedy episodes and summarise the returns.

  Args:
    env: Environment exposing ``reset`` and ``step``.
    max_steps: Maximum number of steps per episode.
    n_eval_episodes: Number of episodes to play.
    Q: Q-table used by ``greedy_policy`` to select actions.
    seed: Sequence of per-episode seeds. When it is falsy (e.g. an empty
      list) the environment is reset without a seed.

  Returns:
    ``(mean_reward, std_reward)``: mean and standard deviation of the
    per-episode reward totals.
  """
  returns_per_episode = []

  for episode_idx in tqdm(range(n_eval_episodes)):
    # Seeded reset only when the caller supplied seeds.
    state = env.reset(seed=seed[episode_idx]) if seed else env.reset()
    episode_return = 0

    for _ in range(max_steps):
      chosen_action = greedy_policy(Q, state)
      state, reward, terminated, truncated = env.step(chosen_action)
      episode_return += reward

      if terminated or truncated:
        break

    returns_per_episode.append(episode_return)

  return np.mean(returns_per_episode), np.std(returns_per_episode)

In [26]:
# Evaluate the greedy policy derived from the learned Q-table.
# eval_seed is an empty list here, so every episode is reset without a seed.
mean_reward, std_reward = evaluate_agent(env, max_steps, n_eval_episodes, Qtable_gridworld, eval_seed)
print(f"Mean_reward={mean_reward:.2f} +/- {std_reward:.2f}")

100%|██████████| 100/100 [00:00<00:00, 8438.73it/s]

Mean_reward=1.00 +/- 0.00



