In [1]:
import gym
import numpy as np
import random
import time

In [2]:
def choose_action(state, Q, epsilon=0):
    n_states, n_actions = Q.shape
    if random.uniform(0.0, 1.0) < epsilon:
        return random.randint(0, n_actions-1)
    else:
        return np.argmax(Q[state])

def train(env: gym.Env, alpha, gamma, epsilon=0.1, epoches=1000):
    """trainning at env, with params alpha, gamma"""
    Q = np.zeros((env.observation_space.n, env.action_space.n))
    for epoch in range(epoches):
        si, _ = env.reset()
        total_reward = 0
        total_error = 0
        terminated = False
        if epoch > 0.90 * epoches:
            real_eps = 0
        else:
            real_eps = epsilon * (1 - epoch/epoches)
        while not terminated:
            ai = choose_action(si, Q, real_eps)
            sn, ri, terminated, truncated, info = env.step(ai)
            Q[si, ai] = Q[si, ai] + alpha * (ri + gamma * np.max(Q[sn]) - Q[si, ai])
            si = sn
            total_reward += ri
            if ri == -10:
                total_error += 1
        if epoch % 1000 == 0:
            print(f"epoch {epoch} finished, epsilon {real_eps:.2f}, total reward: {total_reward}, total error: {total_error}")
    
    return Q

def test(env: gym.Env, Q, epoches=100):
    rewards = []
    errors = 0.0
    for epoch in range(epoches):
        si, _ = env.reset()
        terminated = False
        total_reward = 0
        while not terminated:
            ai = choose_action(si, Q, epsilon=0)
            sn, ri, terminated, truncated, info = env.step(ai)
            total_reward += ri
            si = sn
            if ri == -10:
                errors += 1
        rewards.append(total_reward)
    avg_rewards = np.average(rewards)
    avg_errors = errors/epoches
    print(f"avg reward: {avg_rewards}, avg errors: {avg_errors}")

env = gym.make("Taxi-v3")
gamma = 1
alpha = 0.05
epsilon = 0.2

Q = train(env, alpha, gamma, epsilon, epoches=100000)
test(env, Q)

epoch 0 finished, epsilon 0.20, total reward: -1706, total error: 118
epoch 1000 finished, epsilon 0.20, total reward: -21, total error: 2
epoch 2000 finished, epsilon 0.20, total reward: 10, total error: 0
epoch 3000 finished, epsilon 0.19, total reward: -7, total error: 1
epoch 4000 finished, epsilon 0.19, total reward: 6, total error: 0
epoch 5000 finished, epsilon 0.19, total reward: -18, total error: 2
epoch 6000 finished, epsilon 0.19, total reward: -18, total error: 2
epoch 7000 finished, epsilon 0.19, total reward: 8, total error: 0
epoch 8000 finished, epsilon 0.18, total reward: -2, total error: 1
epoch 9000 finished, epsilon 0.18, total reward: 3, total error: 1
epoch 10000 finished, epsilon 0.18, total reward: -14, total error: 2
epoch 11000 finished, epsilon 0.18, total reward: -6, total error: 1
epoch 12000 finished, epsilon 0.18, total reward: 2, total error: 1
epoch 13000 finished, epsilon 0.17, total reward: 11, total error: 0
epoch 14000 finished, epsilon 0.17, total 

In [7]:
def test_visual(Q):
    env = gym.make("Taxi-v3", render_mode="human")
    si, _ = env.reset()
    env.render()
    terminated = False
    while not terminated:
        time.sleep(1)
        ai = choose_action(si, Q)
        sn, ri, terminated, _, _ = env.step(ai)
        env.render()
        si = sn
        print(f"action: {ai}, reward: {ri}")

test_visual(Q)

action: 2, reward: -1
action: 1, reward: -1
action: 4, reward: -1
action: 0, reward: -1
action: 3, reward: -1
action: 0, reward: -1
action: 3, reward: -1
action: 3, reward: -1
action: 3, reward: -1
action: 1, reward: -1
action: 1, reward: -1
action: 5, reward: 20


In [8]:
print(env.action_space)

Discrete(6)
