In [2]:
import gym
import random

random.seed(1234)

streets = gym.make("Taxi-v3").env #New versions keep getting released; if -v3 doesn't work, try -v2 or -v4
streets.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| :[43m [0m|[34;1mB[0m: |
+---------+



In [3]:
initial_state = streets.encode(2, 3, 2, 0)

streets.s = initial_state

streets.render()

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : :[43m [0m: |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+



In [4]:
import numpy as np

q_table = np.zeros([streets.observation_space.n, streets.action_space.n])
# a 2D array that represent every possible state and action in the virtual space and initialize all of them to 0
learning_rate = 0.1
discount_factor = 0.6
exploration = 0.1
epochs = 10000

for taxi_run in range(epochs):
    state = streets.reset()
    done = False
    
    while not done:
        random_value = random.uniform(0, 1)
        if (random_value < exploration):
            action = streets.action_space.sample() # Explore a random action
        else:
            action = np.argmax(q_table[state]) # Use the action with the highest q-value
            
        next_state, reward, done, info = streets.step(action)
        
        prev_q = q_table[state, action]
        next_max_q = np.max(q_table[next_state])
        new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
        q_table[state, action] = new_q
        
        state = next_state
        
        

In [9]:
q_table[initial_state]  

array([-2.40090669, -2.41412198, -2.41767969, -2.3639511 , -6.84836069,
       -8.62169302])

In [14]:
q_table[streets.encode(1,0,2,0)]  

array([-2.12208981, -2.23981204, -2.25062334, -2.22939021, -7.50948405,
       -7.91650559])

In [18]:
from IPython.display import clear_output
from time import sleep
lengths=[]
for tripnum in range(1, 11):
    state = streets.reset()
   
    done = False
    trip_length = 0
    
    while not done and trip_length < 25:
        action = np.argmax(q_table[state])
        next_state, reward, done, info = streets.step(action)
        clear_output(wait=True)
        print("Trip number " + str(tripnum) + " Step " + str(trip_length))
        print(streets.render(mode='ansi'))
        sleep(.2)
        state = next_state
        trip_length += 1
    lengths.append(trip_length)
    
    sleep(.2)
avg_len=sum(lengths)/10
print(avg_len)

Trip number 10 Step 13
+---------+
|R: | : :[35m[34;1m[43mG[0m[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

13.9


In [23]:
def q_learning(learning_rate,discount_factor,exploration,epochs):
    q_table = np.zeros([streets.observation_space.n, streets.action_space.n])
# a 2D array that represent every possible state and action in the virtual space and initialize all of them to 0
    for taxi_run in range(epochs):
        state = streets.reset()
        done = False
            
        while not done:
            random_value = random.uniform(0, 1)
            if (random_value < exploration):
                action = streets.action_space.sample() # Explore a random action
            else:
                action = np.argmax(q_table[state]) # Use the action with the highest q-value
                    
            next_state, reward, done, info = streets.step(action)
                
            prev_q = q_table[state, action]
            next_max_q = np.max(q_table[next_state])
            new_q = (1 - learning_rate) * prev_q + learning_rate * (reward + discount_factor * next_max_q)
            q_table[state, action] = new_q
                
            state = next_state



def average_trip_length():
    lengths=[]
    for tripnum in range(1, 11):
        state = streets.reset()
        done = False
        trip_length = 0
        
        while not done and trip_length < 25:
            action = np.argmax(q_table[state])
            next_state, reward, done, info = streets.step(action)
            clear_output(wait=True)
            state = next_state
            trip_length += 1
        lengths.append(trip_length)
    avg_len=sum(lengths)/10
    return avg_len

In [24]:
learning_rate = 0.1
discount_factor = [0.5,0.6,0.7,0.8,0.9]
exploration = 0.1
epochs = 1000
difdis=[0,0,0,0,0]
for j in range(1,10):
    for i in range(len(discount_factor)):
        q_learning(learning_rate,discount_factor[i],exploration,epochs)
        difdis[i]+=average_trip_length()

print(np.array(difdis)/10)

[12.12 12.62 12.38 12.07 12.  ]


In [25]:
learning_rate = [0.1,0.2,0.3,0.4,0.5]
discount_factor = 0.9
exploration = 0.1
epochs = 1000
difdis=[0,0,0,0,0]
for j in range(1,10):
    for i in range(len(learning_rate)):
        q_learning(learning_rate[i],discount_factor,exploration,epochs)
        difdis[i]+=average_trip_length()

print(np.array(difdis)/10)

[12.56 12.26 12.19 12.21 11.94]


In [26]:
learning_rate = 0.5
discount_factor = 0.5
exploration = [0.1,0.2,0.3,0.4]
epochs = 1000
difdis=[0,0,0,0]
for j in range(1,10):
    for i in range(len(exploration)):
        q_learning(learning_rate,discount_factor,exploration[i],epochs)
        difdis[i]+=average_trip_length()

print(np.array(difdis)/10)

[12.67 12.2  11.9  12.59]


In [29]:
learning_rate = 0.4
discount_factor = 0.5
exploration = 0.3
epochs = 1000
difdis=[]
for j in range(1,10):
        q_learning(learning_rate,discount_factor,exploration,epochs)
        difdis.append(average_trip_length())

print(sum(difdis)/10)

11.790000000000001
