In [1]:
!pip install cmake 'gym[atari]' scipy



In [2]:
import gym

In [3]:
# Create the Taxi-v3 environment, keep the object exposed by its `.env` attribute,
# and draw its current state.
env = gym.make("Taxi-v3").env
env.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |B: |
+---------+



In [4]:
# Reset the environment to a new, random state and draw it.
env.reset()
env.render()

# Report the action space and the state (observation) space, one per line.
print(
    "Action Space {}".format(env.action_space),
    "State Space {}".format(env.observation_space),
    sep="\n",
)

+---------+
|R: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


In [5]:
# Encode a hand-picked situation into a single state number.
# Argument order: (taxi row, taxi column, passenger index, destination index).
state = env.encode(3, 1, 2, 0)

# Put the environment into that state, then report the number and draw the grid.
env.s = state
print("State:", state)
env.render()

State: 328
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |B: |
+---------+



In [6]:
# Transition table for state 328 (the state number printed by the previous cell).
# Layout: {action: [(probability, nextstate, reward, done)]}
env.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [7]:
import numpy as np

In [8]:
# Q-table: one row per state and one column per action, every entry starting at zero.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

In [12]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

#Hyperparameters
alpha=0.1
gamma=0.6
epsilon=0.1

#for plotting metrics
all_epochs=[]
all_penalties=[]

for i in range(1,100001):
  state=env.reset()
  epochs, penalties, reward=0, 0, 0
  done=False

  while not done:
    if random.uniform(0,1) < epsilon:
      action=env.action_space.sample() #explore action space
    else:
      action=np.argmax(q_table[state]) #exploit learned values

    next_state, reward, done, info=env.step(action)

    old_value=q_table[state, action]
    next_max=np.max(q_table[next_state])

    new_value=(1-alpha) * old_value + alpha * (reward+gamma*next_max)
    q_table[state,action]=new_value

    if reward ==-10:
      penalties += 1
    state=next_state
    epochs += 1

  if i%100 == 0:
    clear_output(wait=True)
    print(f"Episode: {i}")
    print(f"Penalties: {penalties}")

  print("Training Finsihed!")

Episode: 100000
Penalties: 0
Training Finsihed!
CPU times: user 1min 44s, sys: 24.5 s, total: 2min 9s
Wall time: 1min 56s


In [13]:
# Learned Q-values for state 328, one entry per action.
q_table[328]

array([ -2.40637903,  -2.27325184,  -2.40246382,  -2.3588909 ,
       -11.03764391, -10.80570654])

In [16]:
# Evaluate the agent's performance after Q-learning.
# Every step takes the action with the highest Q-value; no random exploration.
total_epochs, total_penalties = 0, 0
episodes = 1000

for _ in range(episodes):
  state = env.reset()
  epochs, penalties, reward = 0, 0, 0
  done = False

  # NOTE(review): this loop only ends when env.step() reports done. `env` was
  # taken from `gym.make(...).env`, so presumably no step limit applies and a
  # poorly trained q_table could loop forever here -- confirm, and add a step
  # cap if needed.
  while not done:
    action = np.argmax(q_table[state])  # exploitation
    state, reward, done, info = env.step(action)

    # A reward of -10 is counted as a penalty.
    if reward == -10:
      penalties += 1

    epochs += 1

  total_penalties += penalties
  total_epochs += epochs

# The summary reports the actual number of evaluated episodes (it was hard-coded as 100).
print(f"Results after {episodes} episodes.")
print(f"Average timesteps per episode: {total_epochs/episodes}")
print(f"Average penalties per episode: {total_penalties/episodes}")
print(f"Total penalties {total_penalties} after {episodes} episodes.")

Results after 1000 episode.
Average timesteps per episode: 13.049
Average penalties per episode: 0.0
Total penalties 0 after 100 episodes.
