In [7]:
# --- Q-Learning --- #

import numpy as np
import random
import gym
import time
from IPython.display import clear_output

def print_frames(frames, delay):
    """Replay a sequence of recorded frames in the notebook output area.

    Each frame is shown on its own: the previous output is cleared, the
    rendered frame and its bookkeeping values are printed, and execution
    pauses before moving on to the next frame.

    Args:
        frames: Iterable of dicts, each providing the keys 'frame' (an
            object exposing ``getvalue()``, e.g. an ``io.StringIO`` holding
            a rendered board), 'state', 'action' and 'reward'.
        delay: Number of seconds to pause after printing each frame.
    """
    for i, frame in enumerate(frames):
        # wait=True postpones the clear until new output is ready to replace it.
        clear_output(wait=True)
        print(frame['frame'].getvalue())
        print(f"Timestep: {i+1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        # Only the `time` module is imported in this file; a bare `sleep`
        # name does not exist and raised NameError on the first frame.
        time.sleep(delay)

# Build the Taxi environment. `.env` takes the environment object held by
# whatever gym.make returns (presumably stripping an outer wrapper - confirm).
env = gym.make("Taxi-v2").env

# Q-table initialised to zero: one row per state, one column per action.
q_table = np.zeros(shape=(env.observation_space.n, env.action_space.n))

# Hyperparameters of the tabular Q-learning update.
alpha = 0.1    # learning rate
gamma = 0.6    # discount applied to the best value of the next state
epsilon = 0.1  # probability of taking a random (exploratory) action

# Number of training episodes to run.
total_episodes = 10_000

# Timer for the whole training run. time.clock() was removed in Python 3.8;
# time.perf_counter() is the documented replacement for measuring elapsed time.
start = time.perf_counter()

for episode in range(1, total_episodes + 1):
    state = env.reset()

    # Per-episode bookkeeping; left at module level so it can be inspected
    # after the run.
    frames = []
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        # Epsilon-greedy selection: random action with probability epsilon,
        # otherwise the action with the highest current Q-value.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Exploration
        else:
            action = np.argmax(q_table[state])  # Exploitation

        next_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the observed reward
        # plus the discounted best value reachable from the next state.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        # A reward of -10 is counted as a penalty (presumably the
        # environment's illegal pickup/drop-off reward - confirm).
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Progress report every 1000 episodes, replacing the previous report.
    if episode % 1000 == 0:
        clear_output(wait=True)
        print(f"Episode: {episode}")

print("Training Finished")
print(f"{time.perf_counter() - start}s")

Episode: 10000
Training Finished
9.140625s


In [6]:
# Roll out the greedy policy from a user-chosen start state, rendering each step.
frames = []
epochs, penalties, score = 0, 0, 0

# Upper bound on rendered steps so a policy that cycles between states cannot
# spin forever (the loop otherwise ends only when the environment reports done).
MAX_EVAL_STEPS = 200

# Keep the chosen start state in `state` as well as in the environment.
# Previously only env.s was set, so the first action was looked up for
# whatever `state` was left over from the training cell.
state = int(input("Choose state number: "))
if not 0 <= state < env.observation_space.n:
    raise ValueError(
        f"State must be between 0 and {env.observation_space.n - 1}, got {state}"
    )
env.s = state
env.render()
input("Press Enter to Begin")

done = False

while not done and epochs < MAX_EVAL_STEPS:
    # Pure exploitation: always take the best-known action for the current state.
    action = np.argmax(q_table[state])
    state, reward, done, info = env.step(action)
    score += reward

    # A reward of -10 is counted as a penalty (presumably the environment's
    # illegal pickup/drop-off reward - confirm).
    if reward == -10:
        penalties += 1

    epochs += 1

    clear_output(wait=True)
    env.render()
    print(f"Timestep: {epochs}")
    print(f"Score: {score}")
    # Only the `time` module is imported in this file; bare `sleep` raised NameError.
    time.sleep(0.5)

if not done:
    print(f"Stopped after {MAX_EVAL_STEPS} steps without finishing the episode")

KeyboardInterrupt: 