In [2]:
import numpy as np
import gym
import random

In [3]:
# Build the Taxi-v3 environment, asking gym for "rgb_array" rendering.
env = gym.make(
    "Taxi-v3",
    render_mode="rgb_array",
)

In [4]:
# Q-table: one row per state and one column per action, initialised to zero.
action_size = env.action_space.n
state_size = env.observation_space.n
qtable = np.zeros(shape=(state_size, action_size))

In [5]:
# Hyperparameters for tabular Q-learning.
learning_rate = 0.9   # step size applied to each temporal-difference update
discount_rate = 0.8   # weight on the best next-state value in the update target
epsilon = 1.0         # initial probability of choosing a random action
decay_rate = 0.005    # rate of the exponential epsilon decay across episodes

In [6]:
# Training schedule.
num_episodes = 1_000  # how many episodes the training loop runs
max_steps = 99        # cap on environment steps within a single episode

In [7]:
# Training: epsilon-greedy tabular Q-learning.
#
# Reads  : env, qtable, learning_rate, discount_rate, decay_rate,
#          num_episodes, max_steps, epsilon
# Writes : qtable (updated in place), epsilon (decayed after every episode)
for episode in range(num_episodes):

    # reset() returns an (observation, info) pair under the same gym API whose
    # step() yields five values; unpack it so `state` is a plain table index.
    state, info = env.reset()
    done = False

    for s in range(max_steps):

        # exploration-exploitation tradeoff
        if random.uniform(0, 1) < epsilon:
            # explore: random action
            action = env.action_space.sample()
        else:
            # exploit: best known action for the current state
            action = np.argmax(qtable[state, :])

        # take action and observe reward
        new_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        # Q-learning update. A terminated transition has no successor, so its
        # target is the reward alone rather than a bootstrap from new_state.
        if terminated:
            best_next = 0.0
        else:
            best_next = np.max(qtable[new_state, :])
        td_target = reward + discount_rate * best_next
        qtable[state, action] += learning_rate * (td_target - qtable[state, action])

        # Update to our new state
        state = new_state

        # stop the episode on termination or time-limit truncation
        if done:
            break

    # Decrease epsilon exponentially with the episode index
    epsilon = np.exp(-decay_rate * episode)

print(f"Training completed over {num_episodes} episodes")

Training completed over 1000 episodes
Press Enter to watch trained agent...


''

In [8]:
# watch trained agent
# reset() returns an (observation, info) pair under the same gym API whose
# step() yields five values; unpack it so `state` is the observation itself.
state, info = env.reset()
done = False
rewards = 0

In [9]:
# Roll out one episode, always taking the highest-valued action in qtable.
# Reads `state` and `rewards` as prepared by the preceding reset cell.
frames = []  # whatever env.render() returns at each step, kept for later display

for s in range(max_steps):

    print("TRAINED AGENT")
    print("Step {}".format(s+1))

    # state may still be the raw (observation, info) pair returned by reset()
    if isinstance(state, tuple):
        state = state[0]

    action = np.argmax(qtable[state, :])
    new_state, reward, terminated, truncated, info = env.step(action)
    done = terminated or truncated
    rewards += reward
    # keep the render result instead of discarding it
    frames.append(env.render())
    print(f"score: {rewards}")
    state = new_state

    # stop on termination or time-limit truncation
    if done:
        break

TRAINED AGENT
Step 1
score: -1
TRAINED AGENT
Step 2
score: -2
TRAINED AGENT
Step 3
score: -3
TRAINED AGENT
Step 4
score: -4
TRAINED AGENT
Step 5
score: -5
TRAINED AGENT
Step 6
score: -6
TRAINED AGENT
Step 7
score: -7
TRAINED AGENT
Step 8
score: -8
TRAINED AGENT
Step 9
score: -9
TRAINED AGENT
Step 10
score: -10
TRAINED AGENT
Step 11
score: -11
TRAINED AGENT
Step 12
score: -12
TRAINED AGENT
Step 13
score: 8


In [10]:
# close the Taxi environment now that the rollout is finished
env.close()