In [1]:
import gym
import numpy as np
from time import sleep
from tqdm.notebook import trange

In [2]:
# Create the Taxi-v3 environment and reset it to obtain the initial observation.
# Both names are module-level: later cells read `env` and `observation` directly.
env = gym.make("Taxi-v3")
observation = env.reset()

In [3]:
# Play game
def play(policy, iterations=100, wait=False):
    """Step the module-level `env` with `policy`, rendering before every step.

    Args:
        policy: Callable that receives the current observation and returns
            the action handed to `env.step`.
        iterations: Total number of environment steps to take. This counts
            steps, not episodes; the environment is reset whenever a step
            reports `done`.
        wait: When true, sleep for one second after each step.
    """
    state = env.reset()
    for _step in trange(iterations):
        env.render()
        chosen_action = policy(state)
        state, _reward, finished, _info = env.step(chosen_action)

        # Start a fresh episode as soon as the current one ends.
        state = env.reset() if finished else state

        if wait:
            sleep(1)

In [4]:
# # Manual moves
# state = env.step(5)
# print(state)
# env.render()
# print(state[1], state[2], state[3])
# [x for x in env.decode(state[0])]

In [5]:
class Q_model:
    """Tabular Q-learning agent with an epsilon-greedy training policy.

    The agent keeps one value per (state, action) pair in `self.table` and
    interacts only with the environment passed to the constructor.

    The environment must expose `observation_space.n`, `action_space.n`,
    `action_space.sample()`, `reset()` returning a state index, and
    `step(action)` returning the 4-tuple `(next_state, reward, done, info)`.
    """

    def __init__(self, env, curr_state, lr=.4, df=.5, eps=.3):
        """Create an agent with an all-zero Q-table.

        Args:
            env: Environment the agent trains on.
            curr_state: Initial state index stored in `self.state`.
            lr: Learning rate, weight of the new estimate in each update.
            df: Discount factor applied to the best next-state value.
            eps: Probability of taking a random action during training.
        """
        self.env = env
        self.state = curr_state
        self.eps = eps

        n_states = env.observation_space.n
        n_actions = env.action_space.n
        self.table = np.zeros((n_states, n_actions))
        self.lr = lr  # [0..1]
        self.df = df  # [0..1]

    def update_iteration(self, s_t, action):
        """Take `action` from state `s_t` and update the Q-table entry for it.

        Args:
            s_t: State index the action is taken from.
            action: Action index to execute.

        Returns:
            The `(next_state, reward, done, info)` tuple from the environment.
        """
        # Step the agent's own environment, not whatever global `env` exists.
        s_t1, reward, done, info = self.env.step(action)
        max_q = np.max(self.table[s_t1])
        prev_q = self.table[s_t, action]

        # Blend the old estimate with the one-step target.
        # NOTE(review): the target includes the next-state value even when
        # `done` is True; kept as in the original.
        target = reward + self.df * max_q
        self.table[s_t, action] = (1 - self.lr) * prev_q + self.lr * target

        return s_t1, reward, done, info

    def train_loop(self, epochs=1000000, render=False):
        """Train for `epochs` episodes, each run until the env reports done.

        Args:
            epochs: Number of episodes to run.
            render: When true, render the environment after every step.
        """
        for _ in trange(epochs):
            self.state = self.env.reset()
            done = False
            while not done:
                # Epsilon-greedy: explore with probability eps, else exploit.
                if np.random.uniform(0, 1) < self.eps:
                    action = self.env.action_space.sample()
                else:
                    action = np.argmax(self.table[self.state])

                self.state, _, done, _ = self.update_iteration(self.state, action)

                if render:
                    self.env.render()

    def get_move(self, state):
        """Return the greedy action (highest Q-value) for `state`."""
        return np.argmax(self.table[state])

# Build the agent on the Taxi environment with the default lr, df and eps,
# starting from the observation returned by the initial reset.
model = Q_model(env, observation)

In [6]:
# Train for 10,000 episodes without rendering; each episode runs until the
# environment reports done.
model.train_loop(epochs=10000, render=False)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [7]:
# Watch the trained agent: take the greedy action from the learned Q-table
# for the default 100 rendered steps.
play(model.get_move)

  0%|          | 0/100 [00:00<?, ?it/s]

In [8]:
# Close the environment now that training and playback are finished.
env.close()