In [21]:
pip install gym

Note: you may need to restart the kernel to use updated packages.


In [22]:
import gym

# Create the Taxi-v3 environment. `.env` is presumably the underlying
# environment beneath the wrapper that gym.make returns — confirm for the
# installed Gym version.
env = gym.make("Taxi-v3").env

# Draw the environment's current state as text.
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [23]:
# Draw the environment's current state as text.
env.render()

+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



In [24]:
env.reset()  # put the environment into a new, random state
env.render()

# Show the action space and the observation (state) space of the environment.
print(f"Action space {env.action_space}")
print(f"State space {env.observation_space}")

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1m[43mB[0m[0m: |
+---------+

Action space Discrete(6)
State space Discrete(500)


In [25]:
# Build one specific encoded state by hand. Per Gym's Taxi documentation the
# arguments are (taxi_row, taxi_col, passenger_location, destination) —
# TODO confirm for the installed version.
state = env.encode(2, 2, 2, 0)
print("State: ", state)

# Force the environment into that state and draw it.
env.s = state
env.render()

State:  248
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [26]:
# Look up the transition table entry for encoded state 328.
# The displayed dict is keyed by action; each value is a list of tuples,
# presumably (probability, next_state, reward, done) — confirm against the Gym docs.
env.P[328]

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [28]:
# Start from a fixed encoded state so the random walk below always begins
# from the same place.
env.s = 328

epochs = 0
penalties, reward = 0, 0
frames = []  # one record per step, kept so the run can be replayed later

done = False
while not done:
    # Purely random policy: draw any action from the action space.
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # Every step rewarded with -10 counts as one penalty.
    penalties += int(reward == -10)

    # Store the text rendering together with the step's outcome.
    frames.append(
        dict(
            frame=env.render(mode='ansi'),
            state=state,
            action=action,
            reward=reward,
        )
    )
    epochs += 1

print(f"Timesteps taken: {epochs}")
print(f"Penalties incurred:{penalties}")

Timesteps taken: 739
Penalties incurred:229


In [29]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.1):
    """Replay recorded frames one after another in the cell output.

    Args:
        frames: iterable of dicts holding the keys 'frame', 'state',
            'action' and 'reward'.
        delay: seconds to pause after each frame. Defaults to 0.1, so calls
            that pass only ``frames`` behave as before.
    """
    for step_number, record in enumerate(frames, start=1):
        # Replace the previously shown frame instead of appending below it.
        clear_output(wait=True)
        print(record['frame'])
        print(f"Timestep: {step_number}")
        print(f"State: {record['state']}")
        print(f"Action:{record['action']}")
        print(f"Reward:{record['reward']}")
        sleep(delay)
              
# Replay the frames collected in the `frames` list.
print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 739
State: 0
Action:5
Reward:20


In [30]:
import numpy as np

# Q-table: one row per state and one column per action, all starting at zero.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

In [32]:
%%time
"""Training the agent"""
import random
from IPython.display import clear_output

#Hyperparameters
alpha=0.1
gamma=0.6
epsilon=0.1

# plotting metrics
all_epochs=[]
all_penalties=[]

for i in range(1,100001):
    state=env.reset()
    
    epochs,penalties,reward,=0,0,0
    done=False
    
    while not done:
        if random.uniform(0,1)<epsilon:
            action=env.action_space.sample()
        else:
            action=np.argmax(q_table[state])
        next_state,reward,done,info=env.step(action)
        
        old_value=q_table[state,action]
        next_max=np.max(q_table[next_state])
        
        new_value=(1-alpha)*old_value+alpha*(reward+gamma*next_max)
        q_table[state,action]=new_value
        
        if reward==-10:
            penalties+=1
        
        state=next_state
        epochs+=1
        
    if i % 100 ==0:
        clear_output(wait=True)
        print(f"Episode:{i}")
print("Training finished\n")


Episode:100000
Training finished

Wall time: 2min 10s


In [33]:
# Inspect the learned values (one entry per action) for encoded state 328.
q_table[328]

array([ -2.39596226,  -2.27325184,  -2.41232531,  -2.36195904,
       -10.91297785, -10.60637925])

In [39]:
""" Evaluate agent's performance after Q-learning """
total_epochs, total_penalties = 0, 0
episodes = 50

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        # Always take the action with the highest learned Q-value.
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        # Every step rewarded with -10 counts as one penalty.
        penalties += int(reward == -10)
        epochs += 1

        # Redraw the board in place and pause briefly so each move is visible.
        clear_output(wait=True)
        env.render()
        sleep(0.1)

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)
Results after 50 episodes:
Average timesteps per episode: 12.4
Average penalties per episode: 0.0
