In [1]:
!pip install gymnasium[toy_text]

Collecting gymnasium[toy_text]
  Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting farama-notifications>=0.0.1 (from gymnasium[toy_text])
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1


In [19]:
import numpy as np
import random
import gymnasium as gym

# Step-type tags stored under the 'step_type' key of every step dict:
# first step of an episode, an intermediate step, and the final step.
STEPTYPE_FIRST, STEPTYPE_MID, STEPTYPE_LAST = 0, 1, 2

# Initial action-value table: 500 rows by 6 columns, filled with uniform samples from [0, 1).
Q = np.random.uniform(low=0.0, high=1.0, size=(500, 6))

In [32]:
# Build the Taxi-v3 environment; the render mode ('ansi') is selected here, at construction time.
env = gym.make('Taxi-v3', render_mode='ansi')

In [33]:
# reset() hands back two values; only the first (the initial observation) is kept.
obs, _ = env.reset()

In [36]:
# Print the environment's current rendering.
print(env.render())

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1m[43mY[0m[0m| : |B: |
+---------+




In [None]:
# wrappers around the Gymnasium Taxi environment
def generate_start_step():
    """Reset the environment and return the first step of a new episode.

    Returns:
        dict with keys 'observation' (the initial observation returned by
        env.reset()), 'reward' (0.0) and 'step_type' (STEPTYPE_FIRST).
    """
    # Gymnasium's reset() returns an (observation, info) pair. Only the observation
    # is stored, so it can be used directly as a row index into Q.
    obs, _info = env.reset()
    return { 'observation': obs, 'reward': 0., 'step_type': STEPTYPE_FIRST }

def generate_next_step(step, action):
    """Apply `action` to the environment and package the outcome as a step dict.

    Args:
        step: the current step dict (accepted but not used here).
        action: the action passed to env.step().

    Returns:
        dict with keys 'observation', 'reward' and 'step_type'; the step type is
        STEPTYPE_LAST when the third value returned by env.step() is truthy,
        otherwise STEPTYPE_MID.
    """
    # NOTE(review): the fourth value returned by env.step() (presumably Gymnasium's
    # truncation flag) is discarded, so a time-limit cut-off does not end the
    # episode here -- confirm this is intended.
    observation, reward, terminated, _truncated, _info = env.step(action)
    if terminated:
        kind = STEPTYPE_LAST
    else:
        kind = STEPTYPE_MID
    return { 'observation': observation, 'reward': reward, 'step_type': kind }

In [None]:
epsilon = 0.1

def get_eps_soft_action(step):
    """Choose an action for `step` with an epsilon-soft (epsilon-greedy) policy.

    With probability `epsilon` (the module-level value at call time) an action is
    drawn uniformly at random from all actions; otherwise the action with the
    highest Q value for the step's observation is returned. Every action therefore
    keeps a selection probability of at least epsilon / (number of actions).

    Args:
        step: step dict whose 'observation' entry indexes a row of Q.

    Returns:
        The chosen action index.
    """
    # epsilon-soft greedy policy
    num_actions = env.action_space.n
    if random.random() < epsilon:
        # explore: uniform over every action (the greedy one included)
        return random.randint(0, num_actions - 1)
    # exploit: best action according to the current Q estimates
    return np.argmax(Q[step['observation']])

In [None]:
def get_greedy_action(step):
    """Return the index of the largest entry in Q's row for the step's observation."""
    q_row = Q[step['observation']]
    return np.argmax(q_row)

In [None]:
def get_random_action(step):
    """Draw an action index uniformly from 0 .. env.action_space.n - 1; `step` is not consulted."""
    highest_action = env.action_space.n - 1
    return random.randint(0, highest_action)

# 1 / (number of actions): the chance of any single action under the uniform draw above
behavior_prob_hit = 1. / float(env.action_space.n)

In [None]:
# return true if (observ, action) exists in epi
def in_episode(epi, observ, action):
    """Tell whether the pair (observ, action) occurs in `epi`.

    `epi` is unpacked with zip(*epi) into (step, action) pairs, so it is expected
    to hold a sequence of step dicts and a parallel sequence of actions. A step
    matches when its 'observation' equals `observ` and its paired action equals
    `action`.
    """
    return any(
        visited['observation'] == observ and taken == action
        for visited, taken in zip(*epi)
    )

In [None]:
def generate_episode(policy_func=get_random_action):
    """Run one episode from a fresh reset, choosing actions with `policy_func`.

    Args:
        policy_func: callable taking the current step dict and returning an action.

    Returns:
        (episode, actions, frames): the list of step dicts (starting step included),
        the list of actions taken, and one rendering per step. `episode` and
        `frames` each hold one more entry than `actions`.
    """
    episode = list()
    actions = list()
    frames = list()
    step = generate_start_step()
    # The render mode was fixed when the env was created with gym.make(...);
    # Gymnasium's render() takes no `mode` argument.
    frames.append(env.render())
    episode.append(step)
    while step['step_type'] != STEPTYPE_LAST:
        action = policy_func(step)
        step = generate_next_step(step, action)
        frames.append(env.render())
        episode.append(step)
        actions.append(action)
    return episode, actions, frames

In [None]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    """Replay `frames` in order: clear the cell output, print the frame, pause briefly."""
    pause_seconds = .2
    for frame in frames:
        # wait=True defers the clear until the next output is ready
        clear_output(wait=True)
        print(frame)
        sleep(pause_seconds)

In [None]:
maxiter = 100000   # number of training episodes
gamma = 1          # discount factor
epsilon = 0.3      # exploration rate read by get_eps_soft_action
lr_rate = 0.8      # step size of the Q update

# Fresh action-value table: one row per state, one column per action.
Q = np.random.uniform(size=(env.observation_space.n, env.action_space.n))

# Q-learning: act with the epsilon-soft policy, but bootstrap each update from the
# greedy action in the state that was reached.
for _ in range(maxiter):
    # starting step
    step = generate_start_step()
    action = get_random_action(step)
    done = False
    while not done:
        next_step = generate_next_step(step, action)
        idx1 = (step['observation'], action)

        if next_step['step_type'] == STEPTYPE_LAST:
            # final transition: nothing to bootstrap from, the target is the reward alone
            target = next_step['reward']
            done = True
        else:
            best_action = get_greedy_action(next_step)
            idx2 = (next_step['observation'], best_action)
            target = next_step['reward'] + gamma * Q[idx2]

        Q[idx1] = Q[idx1] + lr_rate * (target - Q[idx1])

        if not done:
            # The next action must be chosen for the state just reached (next_step),
            # not for the state that was just left.
            action = get_eps_soft_action(next_step)
            step = next_step


In [None]:
# Roll out one episode with the greedy policy over Q, then replay its rendered frames.
epi, actions, frames = generate_episode(policy_func=get_greedy_action)
print_frames(frames)