In [1]:
import gym
# Create the Taxi-v3 environment; `env` is the shared handle used by every cell below.
env = gym.make('Taxi-v3')

In [2]:
# Start a fresh episode (the returned initial state is discarded here) and draw the board.
env.reset()
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [3]:
# Report how many states and actions the environment exposes.
print('# states: ', env.observation_space.n)
print('# actions: ', env.action_space.n)

# states:  500
# actions:  6


In [4]:
# Draw the current board again before stepping.
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [5]:
# Apply action 0 once; as the cell's last expression, the tuple returned by step() is displayed.
env.step(0)

(286, -1, False, {'prob': 1.0})

In [6]:
# Draw the board after the step; the renderer prints the last action's name beneath it.
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (South)


In [7]:
# Same action again, this time unpacking step()'s 4-tuple into named variables.
action = 0
state, reward, done, info = env.step(action)

In [8]:
# State index returned by the step above.
print(state)

386


In [9]:
# Draw the board after the second step.
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[35mY[0m| : |B: |
+---------+
  (South)


In [10]:
# Take action 0 one more time and draw the result in the same cell.
action = 0
state, reward, done, info = env.step(action)
env.render()

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B:[43m [0m|
+---------+
  (South)


In [12]:
# Roll out a single episode with a uniformly random policy, recording every
# step so it can be replayed afterwards.
env.reset()

epochs = 0      # number of steps taken in this episode
penalties = 0   # number of steps whose reward was -10
reward = 0
done = False
frames = []     # one dict per step: rendered board, state, action, reward

while True:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # A reward of exactly -10 is what this notebook counts as a penalty.
    penalties += int(reward == -10)

    frames.append(dict(
        frame=env.render(mode='ansi'),
        state=state,
        action=action,
        reward=reward,
    ))
    epochs += 1

    # `done` starts out False, so checking it at the bottom of the loop is
    # equivalent to a `while not done` guard at the top.
    if done:
        break

In [13]:
# Summarise the random-policy episode: total steps and how many earned the -10 penalty.
print('Timesteps taken: {}'.format(epochs))
print('Penalties incurred: {}'.format(penalties))

Timesteps taken: 200
Penalties incurred: 60


In [12]:
# frames  -- uncomment to dump the raw list of recorded step dicts (long output)

In [14]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.5):
    """Replay recorded frames one at a time in the cell's output area.

    Args:
        frames: iterable of per-step dicts as expected by ``print_a_frame``
            (keys ``'frame'``, ``'state'``, ``'action'``, ``'reward'``).
        delay: seconds to pause after each frame. Defaults to 0.5, the value
            that used to be hard-coded; pass a smaller value for a faster
            replay.
    """
    for i, frame in enumerate(frames):
        # wait=True defers the clear until new output arrives, avoiding flicker.
        clear_output(wait=True)
        print_a_frame(frame, i)
        sleep(delay)
        
def print_a_frame(frame, idx=None):
    """Clear the cell output and print a single recorded step.

    Args:
        frame: dict with keys ``'frame'`` (rendered board text), ``'state'``,
            ``'action'`` and ``'reward'``.
        idx: optional zero-based position of the frame; when given, it is
            shown as a one-based "Timestep". ``None`` omits that line.
    """
    clear_output(wait=True)
    print(frame['frame'])
    # Compare against None explicitly: a plain truthiness check would treat
    # index 0 as "no index" and drop the Timestep line for the first frame.
    if idx is not None:
        print('Timestep: {}'.format(idx+1))
    print('State: {}'.format(frame['state']))
    print('Action: {}'.format(frame['action']))
    print('Reward: {}'.format(frame['reward']))

In [15]:
# Replay the recorded random-policy episode frame by frame, pausing between frames.
print_frames(frames)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| |[42m_[0m: | : |
|Y| : |B: |
+---------+
  (Pickup)

Timestep: 200
State: 337
Action: 4
Reward: -10


In [16]:
import numpy as np
class QLearning():
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The Q-table is a zero-initialised array with one row per state and one
    column per action, sized from ``gym_env.observation_space.n`` and
    ``gym_env.action_space.n``.
    """

    def __init__(self, gym_env, epsilon, gamma, alpha):
        """Store the environment and hyperparameters and allocate the Q-table.

        Args:
            gym_env: environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
            epsilon: probability threshold for taking a random action in
                ``take_action``.
            gamma: discount applied to the best next-state Q-value.
            alpha: learning rate blending the old value with the new target.
        """
        self.gym_env = gym_env
        self._q_table = np.zeros([self.gym_env.observation_space.n, self.gym_env.action_space.n])
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha

    def take_action(self, state):
        """Pick an action for ``state`` using the epsilon-greedy rule.

        With probability ``epsilon`` a random action is sampled from the
        environment's action space; otherwise the action with the highest
        Q-value for ``state`` is returned.
        """
        # Imported here so the class works on its own, without relying on a
        # later notebook cell having executed `import random` first.
        import random

        if random.uniform(0, 1) < self.epsilon:
            action = self.gym_env.action_space.sample() # Explore action space
        else:
            action = np.argmax(self._q_table[state]) # Exploit learned values
        return action

    def infer_action(self, state):
        """Return the greedy action (highest Q-value) for ``state``; no exploration."""
        action = np.argmax(self._q_table[state])
        return action

    def update_q_table(self, state, action, next_state, reward):
        """Apply one Q-learning update for the observed transition.

        Computes
        ``Q[s, a] = (1 - alpha) * Q[s, a] + alpha * (reward + gamma * max(Q[s']))``
        and writes the result back into the table in place.
        """
        old_value = self._q_table[state, action]
        next_state_max_q_value = np.max(self._q_table[next_state])

        new_value = (1 - self.alpha) * old_value + self.alpha * (reward + self.gamma * next_state_max_q_value)
        self._q_table[state, action] = new_value

In [17]:
# Hyperparameters
alpha = 0.1    # learning rate: weight of the new target in the Q-table update
gamma = 0.6    # discount applied to the best next-state Q-value
epsilon = 0.1  # threshold for choosing a random action instead of the greedy one
num_of_episodes = 100001  # exclusive end of range(1, num_of_episodes): 100000 training episodes

In [18]:
# Build the agent; its Q-table starts as all zeros, sized from env's state/action counts.
q_learn = QLearning(env, epsilon, gamma, alpha)

In [19]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Per-episode metrics for plotting: steps taken and -10 penalties incurred.
all_epochs = []
all_penalties = []

for i in range(1, num_of_episodes):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        # Epsilon-greedy action, then learn from the observed transition.
        action = q_learn.take_action(state)
        next_state, reward, done, info = env.step(action)
        q_learn.update_q_table(state, action, next_state, reward)
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode's totals; previously the two lists were declared
    # but never filled, leaving nothing to plot after training.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    # Progress indicator, refreshed in place every 100 episodes.
    if i % 100 == 0:
        clear_output(wait=True)
        print('Episode: {}'.format(i))

print("Training finished.\n")

Episode: 100000
Training finished.

CPU times: user 1min 14s, sys: 26.2 s, total: 1min 40s
Wall time: 1min 13s


In [20]:
# Inspect the learned Q-values (one entry per action) for state 328.
q_learn._q_table[328]

array([ -2.40565894,  -2.27325184,  -2.40506321,  -2.35774994,
        -9.92796807, -10.43445066])

In [21]:
"""Evaluate agent's performance after Q-learning"""

total_epochs, total_penalties = 0, 0
# Dedicated name for the evaluation run: reusing `num_of_episodes` here would
# overwrite the training hyperparameter, so re-running the training cell
# afterwards would silently train for only a handful of episodes.
num_eval_episodes = 100
frames = []  # every step of every evaluation episode, for replay

for _ in range(num_eval_episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done:
        # Greedy action only: no exploration during evaluation.
        action = q_learn.infer_action(state)
        state, reward, done, info = env.step(action)
        frames.append({
            'frame':env.render(mode='ansi'),
            'state':state,
            'action':action,
            'reward':reward,
        })

        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print('Results after {} num_of_episodes:'.format(num_eval_episodes))
print('\tAverage timesteps per episode: {}'.format(total_epochs / num_eval_episodes))
print('\tAverage penalties per episode: {}'.format(total_penalties / num_eval_episodes))

Results after 100 num_of_episodes:
	Average timesteps per episode: 12.86
	Average penalties per episode: 0.0


In [22]:
# Total number of steps recorded across all evaluation episodes.
len(frames)

1286

In [23]:
# Replay the recorded evaluation steps back to back, pausing between frames.
print_frames(frames)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+
  (South)

Timestep: 349
State: 409
Action: 0
Reward: -1


KeyboardInterrupt: 