In [1]:
%matplotlib notebook
import random
import numpy as np
from copy import deepcopy
import gym
import torch
from collections import namedtuple, defaultdict
import matplotlib.pyplot as pp

In [2]:
# Build the Gym environment registered as 'Taxi-v2'; the policy below is trained and played on it.
env = gym.make('Taxi-v2')

In [3]:
class TaxiPolicy:
    """Tabular action-value policy trained with TD control methods.

    The policy keeps one Q-table (``Q0``) indexed as ``Q[state, action]`` and,
    for double Q-learning only, a second table (``Q1``). States are the
    integer observations returned by the environment and are used directly as
    row indices.

    Epsilon convention (note: the inverse of the usual one): ``epsilon`` is
    the probability of taking the *greedy* action. With probability
    ``1 - epsilon`` a uniformly random action is taken instead. ``epsilon <= 0``
    disables exploration entirely (always greedy).
    """

    def __init__(self, num_states=500, num_actions=6):
        """Create zero-initialised Q-tables.

        Args:
            num_states: number of rows in the Q-table (defaults to 500).
            num_actions: number of columns / discrete actions (defaults to 6).
        """
        # One row per encoded state, one column per action.
        self.Q0 = torch.zeros([num_states, num_actions], dtype=torch.float32)
        # Only allocated by train(..., algo='double-q-learning').
        self.Q1 = None
        self.num_actions = num_actions

    def _epsilon_greedy(self, q_values, epsilon):
        """Pick an action index from a 1-D tensor of action values.

        Returns the arg-max action, except that when ``epsilon > 0`` a
        uniformly random action is returned with probability ``1 - epsilon``.
        """
        _, greedy_idx = torch.max(q_values, 0)
        if epsilon > 0 and random.uniform(0, 1) > epsilon:
            return random.randint(0, self.num_actions - 1)
        return int(greedy_idx)

    def action(self, state, epsilon=0.0, Q=None):
        """Choose an action for ``state`` from a single Q-table.

        Args:
            state: integer row index into the Q-table.
            epsilon: probability of acting greedily; ``0`` means always greedy.
            Q: table to read from; defaults to ``self.Q0``.

        Returns:
            The chosen action as a Python ``int``.
        """
        table = self.Q0 if Q is None else Q
        return self._epsilon_greedy(table[state], epsilon)

    def action_d(self, state, epsilon):
        """Choose an action for ``state`` from the sum of both Q-tables.

        Falls back to ``Q0`` alone when the second table has not been
        allocated, so the method is safe to call on any policy.
        """
        if self.Q1 is None:
            return self.action(state, epsilon)
        return self._epsilon_greedy(self.Q0[state] + self.Q1[state], epsilon)

    def _expected_value(self, state, epsilon):
        """Expected ``Q0`` value of ``state`` under the epsilon-greedy policy."""
        greedy = self.action(state)
        if epsilon <= 0:
            # action() is purely greedy for epsilon <= 0, so the expectation
            # collapses to the greedy action's value.
            return self.Q0[state, greedy]
        # Every action gets the exploration share (1 - epsilon) / num_actions ...
        probs = torch.ones([self.num_actions]).fill_((1 - epsilon) / self.num_actions)
        # ... and the greedy action additionally gets epsilon.
        probs[greedy] += epsilon
        return torch.sum(probs * self.Q0[state, :])

    def train(self, env, iterations, epsilon, learning_rate, discount, algo):
        """Run ``iterations`` episodes on ``env`` and update the Q-table(s).

        Args:
            env: object with ``reset() -> state`` and
                ``step(action) -> (state, reward, done, info)``.
            iterations: number of episodes to run.
            epsilon: probability of acting greedily while training.
            learning_rate: step size of each TD update.
            discount: discount factor applied to the bootstrapped value.
            algo: one of ``'sarsa'``, ``'q-learning'``, ``'expected-sarsa'``
                or ``'double-q-learning'``.

        Raises:
            Exception: ``'Invalid algo'`` if ``algo`` is not recognised
                (raised after the first environment step, as before).
        """
        double_q = algo == 'double-q-learning'
        if double_q:
            # Fresh second table with the same shape/dtype as the first.
            self.Q1 = torch.zeros_like(self.Q0)

        for _ in range(iterations):
            state = env.reset()
            next_action = None
            ep_ended = False
            while not ep_ended:
                # Reuse the action already selected for this state, if any.
                # Must compare against None: action 0 is a valid action.
                if next_action is not None:
                    action = next_action
                elif double_q:
                    # Behaviour policy of double Q-learning reads both tables.
                    action = self.action_d(state, epsilon)
                else:
                    action = self.action(state, epsilon)

                # Fourth return value is not used.
                s_, r, ep_ended, _ = env.step(action)

                # Table that receives this step's update.
                table = self.Q0
                if algo == 'sarsa':
                    # On-policy: bootstrap from the action that will be taken next.
                    next_action = self.action(s_, epsilon)
                    update = self.Q0[s_, next_action]
                elif algo == 'q-learning':
                    # Off-policy: bootstrap from the greedy action.
                    update = self.Q0[s_, self.action(s_)]
                elif algo == 'expected-sarsa':
                    update = self._expected_value(s_, epsilon)
                elif double_q:
                    next_action = self.action_d(s_, epsilon)
                    # Fair coin: randint is inclusive on both ends.
                    if random.randint(0, 1) == 0:
                        # Update Q0: select with Q0, evaluate with Q1.
                        update = self.Q1[s_, self.action(s_, Q=self.Q0)]
                    else:
                        # Update Q1: select with Q1, evaluate with Q0.
                        table = self.Q1
                        update = self.Q0[s_, self.action(s_, Q=self.Q1)]
                else:
                    raise Exception('Invalid algo')

                td_error = r + (discount * update) - table[state, action]
                table[state, action] += learning_rate * td_error

                state = s_

    def play(self, env):
        """Render one greedy episode on ``env``.

        Uses both tables when a second table exists, otherwise ``Q0`` only.
        The environment is rendered before the first step and after each step.
        """
        state = env.reset()
        ep_ended = False

        env.render()
        while not ep_ended:
            if self.Q1 is None:
                action = self.action(state, epsilon=0)
            else:
                action = self.action_d(state, epsilon=0)

            state, _, ep_ended, _ = env.step(action)

            env.render()

In [4]:
# Train a fresh policy with double Q-learning for 10000 episodes.
# NOTE: TaxiPolicy treats epsilon as the probability of taking the greedy action,
# so epsilon=0.9 picks a random action on roughly 10% of steps.
t = TaxiPolicy()
t.train(env, iterations=10000, epsilon=0.9, learning_rate=0.1, discount=0.9, algo='double-q-learning')

In [5]:
# Show the first Q-table, then render one greedy (epsilon=0) episode with the trained policy.
print(t.Q0)
t.play(env)

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00],
        [-2.8650e+00, -1.7954e+00, -2.0245e+00, -1.2941e+00,  4.3489e+00,
         -5.7377e+00],
        [-1.5886e+00,  1.9355e-01, -2.9001e-01, -1.3402e+00,  7.7147e+00,
         -5.5879e+00],
        ...,
        [-8.7840e-01, -2.4084e-01, -1.0437e+00, -9.2251e-01, -2.0507e+00,
         -2.9247e+00],
        [-2.3107e+00, -2.3559e+00, -2.4237e+00,  1.2643e-02, -3.5358e+00,
         -5.1121e+00],
        [ 0.0000e+00, -1.4770e-01, -2.7100e-01,  1.4424e+01, -1.1710e+00,
         -1.0060e+00]])
+---------+
|[34;1mR[0m: | : :G|
| : : : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[35mB[0m: |
+---------+

+---------+
|[34;1mR[0m: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (West)
+---------+
|[34;1mR[0m: | : :G|
| : :[43m 