In [None]:
import gymnasium as gym
import numpy as np
import ipywidgets as widgets
import sys

sys.path.append('../')
import support_modules as sm

# Taxi

## Description
<div style="text-align: justify">    
There are four designated pick-up and drop-off locations (Red, Green, Yellow and Blue) in the 5x5 grid world. The taxi starts off at a random square and the passenger at one of the designated locations.

The goal is to move the taxi to the passenger’s location, pick up the passenger, move to the passenger’s desired destination, and drop off the passenger. Once the passenger is dropped off, the episode ends.

The player receives a positive reward for successfully dropping off the passenger at the correct location, and negative rewards for incorrect attempts to pick up or drop off the passenger and for each step in which no other reward is received.

Map:

    +---------+
    |R: | : :G|
    | : | : : |
    | : : : : |
    | | : | : |
    |Y| : |B: |
    +---------+

From “Hierarchical Reinforcement Learning with the MAXQ Value Function Decomposition” by Tom Dietterich.
</div>

https://gymnasium.farama.org/environments/toy_text/taxi/

# Random Policy

## Single episode

In [None]:
# Roll out a single Taxi-v3 episode with uniformly random actions and show it
# in the on-screen ("human") renderer.
env = gym.make('Taxi-v3', render_mode='human')

try:
    state, _ = env.reset()
    done = False

    while not done:
        action = env.action_space.sample()
        # In render_mode='human' Gymnasium draws the frame inside reset() and
        # step(), so an explicit env.render() call is not needed here.
        state, reward, terminated, truncated, info = env.step(action)

        # sm.evaluate_done folds the two end-of-episode flags into the loop
        # condition (presumably `terminated or truncated` -- confirm in
        # support_modules).
        done = sm.evaluate_done(terminated, truncated)
finally:
    # Release the render window even if the cell is interrupted mid-episode.
    env.close()

## Exploratory 1000 episodes

In [None]:
# Estimate how a uniformly random policy performs on Taxi-v3 by running many
# episodes without rendering and reporting the success rate and mean return.
N_EPISODES = 1000  # number of episodes to sample
SEED = 42          # fixed seed so the printed statistics are reproducible

env = gym.make('Taxi-v3', render_mode=None)
# Seed the action sampler; the environment itself is seeded on the first reset.
env.action_space.seed(SEED)

rewards = []  # total reward collected in each episode
success = []  # `terminated` flag of each episode's final step

try:
    for episode in range(N_EPISODES):
        # Seeding only the first reset fixes the whole sequence of episodes
        # while still giving every episode a different start state.
        state, _ = env.reset(seed=SEED if episode == 0 else None)
        ep_reward = 0
        done = False

        while not done:
            action = env.action_space.sample()
            state, reward, terminated, truncated, info = env.step(action)
            ep_reward += reward

            # sm.evaluate_done folds the two end-of-episode flags into the loop
            # condition (presumably `terminated or truncated` -- confirm in
            # support_modules).
            done = sm.evaluate_done(terminated, truncated)

        rewards.append(ep_reward)
        # Per the Gymnasium Taxi docs, `terminated` is set when the passenger
        # is dropped off, whereas `truncated` marks the step limit being hit,
        # so `terminated` serves as the success indicator.
        success.append(terminated)
finally:
    # Release the environment even if the run is interrupted.
    env.close()

print(f'Success rate: {sum(success)/len(success)}')
print(f'Average reward: {sum(rewards)/len(rewards)}')

# User Custom Policy (interacting with environment)

In [None]:
# Let the user drive the taxi by typing one action id per step while the
# environment is shown in the on-screen ("human") renderer.
# Taxi-v3 action ids (per the Gymnasium docs):
#   0 = south, 1 = north, 2 = east, 3 = west, 4 = pickup, 5 = dropoff
env = gym.make('Taxi-v3', render_mode='human')

try:
    state, _ = env.reset()
    done = False

    while not done:
        # Keep asking until the input parses as an integer that belongs to the
        # action space; a typo should not abort the whole episode.
        while True:
            try:
                action = int(input('Select next action'))
            except ValueError:
                print('Invalid input: please enter an integer action id.')
                continue
            if env.action_space.contains(action):
                break
            print(f'Invalid action {action}: not in the action space {env.action_space}.')

        # In render_mode='human' Gymnasium draws the frame inside reset() and
        # step(), so an explicit env.render() call is not needed here.
        state, reward, terminated, truncated, info = env.step(action)

        # Use the module-level helper, the same one the random-policy cells of
        # this notebook call, instead of reaching through Q_Learning_Agent.
        done = sm.evaluate_done(terminated, truncated)
finally:
    # Release the render window even if the user interrupts the cell.
    env.close()