# Teach a Taxi to pick up and drop off passengers at the right locations with Reinforcement Learning

In [0]:
import gym
import numpy as np
import pickle, os

In [0]:
# Create the Taxi environment by its Gym registry id.
env = gym.make("Taxi-v2")

In [0]:
# Begin a new episode; reset() returns the initial state (shown below as an integer).
state = env.reset()

In [0]:
state

46

In [0]:
env.render()

+---------+
|R: |[43m [0m: :[34;1mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+



<h1>Possible Actions</h1>

down (0), up (1), right (2), left (3), pick-up (4), and drop-off (5)

In [0]:
# Sizes of the discrete state and action spaces.
n_states, n_actions = env.observation_space.n, env.action_space.n

In [0]:
n_actions

6

In [0]:
n_states

500

In [0]:
# Write 254 straight into the `s` attribute of env.env.
# NOTE(review): this appears to force the environment's current state (the
# render that follows shows the taxi in a new position) -- confirm that `s`
# is the state attribute for the installed gym version.
env.env.s = 254

In [0]:
env.render()

+---------+
|R: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [0]:
# Take action 3 (left, per the action list above). Elsewhere in this notebook
# the result of step() is unpacked as (state, reward, done, info).
env.step(3)

(234, -1, False, {'prob': 1.0})

In [0]:
env.render()

+---------+
|R: | : :G|
| : : : : |
| :[43m [0m: : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+
  (West)


<h1>How well does acting completely at random do?</h1>

In [0]:
# Zero the bookkeeping for the random-policy run, then start a fresh episode.
reward = None  # no step taken yet; anything other than 20 lets the loop start
g = 0          # cumulative reward
counter = 0    # number of steps taken
state = env.reset()

In [0]:
env.render()

+---------+
|R: | : :[35mG[0m|
| : : : : |
| : : : : |
| | : | : |
|[34;1mY[0m|[43m [0m: |B: |
+---------+



In [0]:
# Act uniformly at random until a step yields a reward of 20, which this
# notebook treats as "solved". `done` is ignored on purpose, so the run keeps
# going for as many steps as it takes.
while not (reward == 20):
    random_action = env.action_space.sample()
    state, reward, done, info = env.step(random_action)
    g += reward
    counter += 1

In [0]:
# Report how many steps the random run took and the reward it accumulated.
summary_template = "Solved in {} Steps with a total reward of {}"
print(summary_template.format(counter, g))

Solved in 1975 Steps with a total reward of -7579


## Let's look at just one episode and see how the Q values change after each step using the formula below

$$Q(s, a) \leftarrow Q(s, a) + \alpha \left( r + \max_{a'} Q(s', a') - Q(s, a) \right)$$

In [0]:
# Q-table: one row per state, one column per action, every estimate starts at 0.
Q = np.zeros((n_states, n_actions))

In [0]:
episodes = 1   # number of episodes the next cell runs
alpha = 0.618  # step size that scales the update term in the Q update

In [0]:
# Run `episodes` episodes of greedy tabular Q-learning, printing each start state.
for episode in range(1, episodes + 1):
    # Fresh episode: remember where it began so later cells can inspect it.
    state = env.reset()
    firstState = state
    reward = 0
    done = False
    print("Initial State = {}".format(state))
    # Keep stepping until a step returns reward 20; `done` is not consulted.
    while not (reward == 20):
        # Greedy choice; np.argmax returns the first index when values tie.
        action = np.argmax(Q[state])
        next_state, reward, done, info = env.step(action)
        # Undiscounted one-step target and the resulting error term.
        td_target = reward + np.max(Q[next_state])
        td_error = td_target - Q[state, action]
        Q[state, action] = Q[state, action] + alpha * td_error
        state = next_state

Initial State = 263


In [0]:
firstState

484

In [0]:
# Capture the state the loop above ended in, then display it.
finalState = state
finalState

0

## Let's look at the first step:

In [0]:
firstState

44

## Let's look at the final step:

In [0]:
finalState

0

In [0]:
Q

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

## Let's run over multiple episodes so that we can converge on an optimal policy

In [0]:
episodes = 500      # number of training episodes for the loop below
rewardTracker = []  # list for tracking rewards -- presumably one total per episode

In [0]:
G = 0          # total reward of an episode; the training loop resets it every episode
alpha = 0.618  # step size for the Q update (same value as the single-episode run)

In [0]:
# Train with greedy tabular Q-learning for `episodes` episodes.
# Each episode runs until the environment reports `done`. Its total reward G is
# appended to `rewardTracker` so the learning curve can be inspected afterwards.
# Re-running this cell without re-creating `rewardTracker` extends the same list.
for episode in range(1, episodes + 1):
    done = False
    G, reward = 0, 0
    state = env.reset()
    while not done:
        # Greedy action; np.argmax resolves ties to the lowest action index.
        action = np.argmax(Q[state])
        state2, reward, done, info = env.step(action)
        # Undiscounted one-step update: move Q[state, action] toward
        # reward + max_a' Q[state2, a'] by a fraction alpha.
        Q[state, action] += alpha * (reward + np.max(Q[state2]) - Q[state, action])
        G += reward
        state = state2

    # Record this episode's total reward.
    rewardTracker.append(G)

    if episode % 100 == 0:
        print('Episode {} Total Reward: {}'.format(episode, G))

Episode 100 Total Reward: -137
Episode 200 Total Reward: -10
Episode 300 Total Reward: 12
Episode 400 Total Reward: 4
Episode 500 Total Reward: 7


In [0]:
Q

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [-4.69422875, -4.326     , -4.5307258 , -4.326     , 12.99999975,
        -6.18      ],
       [-4.14804284, -3.708     , -3.47193932, -3.708     , 15.        ,
        -6.18      ],
       ...,
       [-2.472     , -2.48515958, -2.472     , -3.25333424, -6.18      ,
        -6.18      ],
       [-4.944     , -4.97179544, -4.944     , -5.5047722 , -6.18      ,
        -6.18      ],
       [-1.236     , -1.236     , -1.236     ,  6.784404  , -6.18      ,
        -6.18      ]])

## Now that we have learned the optimal Q values, we have developed an optimal policy and have no need to train the agent anymore

In [0]:
# Start a new episode. `done` is set to None here; it is assigned False again
# before the evaluation loop further down.
state = env.reset()
done = None

In [0]:
env.render()

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)


In [0]:
# Reset the environment and the step counter before the greedy evaluation run.
state = env.reset()
done = False
counter = 0

In [0]:
# Roll out the learned policy until the environment reports the episode is over,
# counting the steps taken.
while not done:
    # Pure exploitation: pick the highest-valued action for the current state.
    greedy_action = np.argmax(Q[state])
    state, reward, done, info = env.step(greedy_action)
    counter += 1

In [0]:
counter

200

In [0]:
# Persist the learned Q-table to disk with pickle.
with open("smartTaxi_qTable.pkl", 'wb') as f:
    pickle.dump(Q, f)

In [0]:
# Load the Q-table back from the file written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open("smartTaxi_qTable.pkl", 'rb') as f:
    Qtest = pickle.load(f)