In [22]:
import gymnasium as gym
import numpy as np
import pygame
import random

In [23]:
env = gym.make(id="Taxi-v3",render_mode="human")

In [24]:
def ValueIteration(env,gamma=0.90,epsilon=1e-3,maxEpisodes=1000):
    S = list(range(env.observation_space.n))
    V = {s: 0 for s in S}

    maxDiff = epsilon+0.1
    episode=0
    while maxDiff > epsilon and episode < maxEpisodes:
        for s in S:
            v = -np.inf
            stateChangingActions = env.unwrapped.action_mask(s)
            stateChangingActions = np.where(stateChangingActions)[0]

            tempV = 0
            for a in stateChangingActions:
                _,nextState,reward,_ = env.unwrapped.P[s][a][0]
                tempV = reward + gamma*V[nextState]
                v = max(v,tempV)
            
            maxDiff = max(maxDiff,abs(V[s]-v))
            V[s] = v
        
        episode+=1
        print(f"Completed {episode/maxEpisodes*100} % of episodes")
        print(f"Max difference: {maxDiff}")

    print(f"done")
    return V

In [25]:
def Policy(env,V,gamma=0.90):
    S = list(range(env.observation_space.n))
    A = list(range(env.action_space.n))
    Policy = {s: random.sample(A,1)[0] for s in S}

    for s in S:
        maxV = -np.inf
        optimalAction = -1

        stateChangingActions = env.unwrapped.action_mask(s)
        stateChangingActions = np.where(stateChangingActions)[0]

        for a in stateChangingActions:
            _,nextState,reward,_ = env.unwrapped.P[s][a][0]
            tempV = 0
            tempV += reward + gamma * V[nextState]
            
            if tempV > maxV:
                maxV = tempV
                optimalAction = a
        
        Policy[s] = optimalAction
    
    return Policy

In [26]:
V = ValueIteration(env)
Policy = Policy(env,V)

Completed 0.1 % of episodes
Max difference: 19.1
Completed 0.2 % of episodes
Max difference: 19.1
Completed 0.3 % of episodes
Max difference: 19.1
Completed 0.4 % of episodes
Max difference: 19.1
Completed 0.5 % of episodes
Max difference: 19.1
Completed 0.6 % of episodes
Max difference: 19.1
Completed 0.7000000000000001 % of episodes
Max difference: 19.1
Completed 0.8 % of episodes
Max difference: 19.1
Completed 0.8999999999999999 % of episodes
Max difference: 19.1
Completed 1.0 % of episodes
Max difference: 19.1
Completed 1.0999999999999999 % of episodes
Max difference: 19.1
Completed 1.2 % of episodes
Max difference: 19.1
Completed 1.3 % of episodes
Max difference: 19.1
Completed 1.4000000000000001 % of episodes
Max difference: 19.1
Completed 1.5 % of episodes
Max difference: 19.1
Completed 1.6 % of episodes
Max difference: 19.1
Completed 1.7000000000000002 % of episodes
Max difference: 19.1
Completed 1.7999999999999998 % of episodes
Max difference: 19.1
Completed 1.9 % of episodes


In [27]:
observation, info = env.reset(seed=42)
done = False

while not done:
   action  = Policy[observation]
   observation, reward, terminated, truncated, info = env.step(action)

   if terminated or truncated:
      observation, info = env.reset()
   
   for event in pygame.event.get():
      if event.type == pygame.KEYDOWN:
         if event.key == pygame.K_ESCAPE:
            done = True

env.close()