### Q-Learning example based on this [notebook](https://colab.research.google.com/gist/simoninithomas/466c81aa1c2a07dd14793240c6d033c5/q-learning-with-taxi-v3.ipynb) and this [article](https://thomassimonini.medium.com/q-learning-lets-create-an-autonomous-taxi-part-1-2-3e8f5e764358).

### Import libraries

In [3]:
import numpy as np
import gym
import random

### Create OpenAI environment

In [4]:
# Build the Taxi-v3 environment and render its current state (printed as a text grid).
env = gym.make("Taxi-v3")
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |[35mB[0m: |
+---------+



### Create Q Table

In [5]:
# Read the sizes of the discrete state and action spaces from the environment
num_states, num_actions = env.observation_space.n, env.action_space.n
(num_states, num_actions, env.action_space, env.observation_space)

(500, 6, Discrete(6), Discrete(500))

In [7]:
# Allocate the Q table filled with zeros: num_states rows by num_actions columns (500x6)
Q = np.zeros(shape=(num_states, num_actions))
Q.shape

(500, 6)

### E-greedy policy and Q-Learning training loop

In [23]:
def egreedy(state, epsilon):
  """Choose an action for `state` with an epsilon-greedy rule.

  A uniform draw between 0 and 1 is compared against `epsilon`. When the
  draw is smaller, a random action index in [0, num_actions - 1] is
  returned; otherwise the index of the largest entry of the global table
  row `Q[state]` is returned.
  """
  explore = random.uniform(0, 1) < epsilon
  if explore:
    # Exploration branch: any action, picked uniformly at random.
    return random.randint(0, num_actions - 1)
  # Exploitation branch: the highest-valued action currently stored for this state.
  return np.argmax(Q[state])

def ql_loop(num_episodes, max_steps, lr, gamma):
  """Train the global table `Q` in place with tabular Q-learning.

  Args:
    num_episodes: number of episodes to run on the global `env`.
    max_steps: upper bound on the steps taken within one episode.
    lr: learning rate applied to each temporal-difference error.
    gamma: discount factor applied to the best next-state value.

  Epsilon shrinks linearly with the episode index, starting at 1.0 and
  approaching 0.001. The environment is closed once all episodes have run.
  """
  eps_hi, eps_lo = 1.0, .001
  eps_span = eps_hi - eps_lo

  for episode_idx in range(num_episodes):
    state = env.reset()

    # Share of episodes still to run: 1.0 on the first episode, 1/num_episodes on the last.
    remaining = (num_episodes - episode_idx) / num_episodes
    epsilon = eps_lo + eps_span * remaining

    for _ in range(max_steps):
      # Act epsilon-greedily, then observe the resulting transition.
      action = egreedy(state, epsilon)
      next_state, reward, done, _info = env.step(action)

      # Move Q[state, action] towards reward + discounted best next-state value.
      td_target = reward + gamma * np.max(Q[next_state])
      td_error = td_target - Q[state, action]
      Q[state, action] += lr * td_error

      state = next_state
      if done:
        break

  env.close()

### Define hyperparameters and train the agent

In [None]:
# Training hyperparameters: episode count, per-episode step cap, learning rate, discount factor
num_episodes, max_steps = 25000, 200
lr, gamma = 0.01, 0.99

# Train the agent, then print the learned Q table
ql_loop(num_episodes=num_episodes, max_steps=max_steps, lr=lr, gamma=gamma)
print(Q)

### Inference with trained agent

In [25]:
def inference(num_episodes, max_steps):
  """Run the greedy policy read from the global `Q` table and report rewards.

  Args:
    num_episodes: number of evaluation episodes to play on the global `env`.
    max_steps: upper bound on the steps taken within one episode.

  Each step renders the environment, picks the argmax of `Q[state]` (no
  exploration) and adds the reward to a running total. A line is printed
  when an episode reports `done`, and the mean reward per episode is
  printed after the environment is closed. When `num_episodes` is not
  positive the mean is reported as nan instead of raising
  ZeroDivisionError.
  """
  total_rewards = 0
  for i_episode in range(num_episodes):
    state = env.reset()

    for t in range(max_steps):
      env.render()

      # Act greedily: always take the highest-valued action, no exploration
      action = np.argmax(Q[state])
      next_state, reward, done, info = env.step(action)
      total_rewards += reward
      state = next_state

      # End of episode
      if done:
        print("Episode finished after {} timesteps".format(t+1))
        break
  env.close()

  # Nothing to average over when no episode was played
  average = total_rewards / num_episodes if num_episodes > 0 else float('nan')
  print (f'Average rewards = {average}')

# Play 5 evaluation episodes of at most 200 steps each
inference(num_episodes=5, max_steps=200)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :[35mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (West)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[42mY[0m| : |B: |
+---------+
  (Pickup)
+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
|[42m_[0m| : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|R: | : :[35mG[0m|
| : | : : |
|[4