In [1]:
import numpy as np
import gym

# Create the Taxi environment
env = gym.make("Taxi-v3", render_mode="human")  

# change-able parameters:
discount_factor = 0.8 # 0.9
delta_threshold = 0.00001 # 1e-6
epsilon = 1

In [2]:
def value_iteration(env, gamma, epsilon):
    num_states = env.observation_space.n # 500
    num_actions = env.action_space.n # 6

    # Initialize the value function to be zero
    V = np.zeros(num_states)

    # Value iteration loop
    while True:
        delta = 0 # change in value function
        for i in range(num_states):
            v = V[i]

            # Compute Q-values of all actions from state i
            q_values = np.zeros(num_actions)
            for action in range(num_actions):
                q_value = 0
                for prob, next_state, reward, _ in env.P[i][action]:
                    q_value += prob * (reward + gamma * V[next_state])
                q_values[action] = q_value
            
            # Update value function to be the maximum Q-value
            V[i] = np.max(q_values)
            # Update the change in value function
            delta = max(delta, abs(v - V[i]))

        # Stop if value function has converged
        if delta < epsilon:
            break

    # For each state, the policy will tell you the action to take
    policy = np.zeros(num_states, dtype=int)

    # Extract optimal policy
    for i in range(num_states):

        q_values = np.zeros(num_actions)
        for action in range(num_actions):
            q_value = 0
            for prob, next_state, reward, _ in env.P[i][action]:
                q_value += prob * (reward + gamma * V[next_state])
            q_values[action] = q_value

        # Select action that maximizes Q-value
        policy[i] = np.argmax(q_values)

    return policy, V

In [3]:
# Run value iteration
policy, V = value_iteration(env, discount_factor, delta_threshold)

# Print results
print()
print("Optimal Value Function:")
print(V)

print()
print("\nOptimal Policy (0=Left, 1=Down, 2=Right, 3=Up, 4=Pickup, 5=Dropoff):")
print(policy)

# resetting the environment and executing the policy
state = env.reset()
state = state[0]
done = False
print() ; print(state)

max_steps = 100
for step in range(max_steps):

    # Getting max value against that state, so that we choose that action
    action = policy[state]

    # information after taking the action
    new_state, reward, done, truncated, info = env.step(action)
    
    env.render()
    if done:
        print("\nnumber of steps taken:", step)
        break

    state = new_state

env.close()


Optimal Value Function:
[ 4.16666638e+01  2.82936045e+00  1.41146596e+01  4.78669562e+00
 -3.68645298e+00  2.82936045e+00 -3.68645713e+00 -2.43448658e+00
  2.82936459e+00 -1.79309811e+00  1.41146596e+01 -9.91377570e-01
 -2.94758117e+00 -1.79309811e+00 -2.94758532e+00  4.78669562e+00
  5.33333310e+01  4.78670372e+00  1.88933277e+01  7.23337570e+00
  3.23333310e+01  1.26348836e+00  1.02917277e+01  2.82935650e+00
 -3.35806494e+00  4.78670372e+00 -3.35806825e+00 -1.79310206e+00
  1.26349167e+00 -2.43447849e+00  1.02917277e+01 -1.79310206e+00
 -2.43447517e+00 -9.91369474e-01 -2.43447849e+00  7.23337570e+00
  4.16666648e+01  7.23338217e+00  1.41146622e+01  1.02917246e+01
  7.23338483e+00 -2.94758279e+00  1.07906853e-02 -2.43448164e+00
  1.07933382e-02  2.48666622e+01  1.07906853e-02  4.78670056e+00
  1.07933382e-02 -2.94758279e+00  7.23338217e+00 -2.43448164e+00
 -1.79309293e+00  1.07906853e-02 -1.79309558e+00  1.02917246e+01
  1.02917319e+01  3.23333297e+01  1.02917297e+01  1.41146596e+01


  if not isinstance(terminated, (bool, np.bool8)):



number of steps taken: 11
