In [2]:
%matplotlib notebook
import random
import numpy as np
from copy import deepcopy
import gym
import torch
from collections import namedtuple, defaultdict
import matplotlib.pyplot as pp

In [3]:
env = gym.make('Taxi-v2')

In [152]:
class TaxiPolicy:    
    def __init__(self):
        # (row, column, passenger_loc, destination, action)
        self.Q0 = torch.zeros([500, 6], dtype=torch.float32)
        self.Q1 = None
        self.num_actions = 6

    # Behavioral action
    def action_b(self, state):    
        return random.randint(0, self.num_actions - 1)
    def p_action_b(self, state, action):
        return 1 / self.num_actions
    
    # Action using Q0
    def action(self, state, epsilon=0.0, Q=None):
        if Q is None:       
            Q = self.Q0

        action_max_q, action_max_idx = torch.max(Q[state], 0)
        if epsilon > 0:
            # If probability is greater than epsilon, choose a random action
            p = random.uniform(0, 1)
            if p > epsilon:
                return random.randint(0, self.num_actions - 1)
        # Otherwise choose greedy max action
        return int(action_max_idx)

    def p_action(self, state, action, epsilon=0.0, Q=None):
        if Q is None:       
            Q = self.Q0

        action_max_q, action_max_idx = torch.max(Q[state], 0)
        if action == action_max_idx:
            return epsilon + ((1 - epsilon) / self.num_actions)
        else:
            return (1 - epsilon) / self.num_actions
        
    
    # Action using Q0 and Q1
    def action_d(self, state, epsilon):
        action_max_q, action_max_idx = torch.max(self.Q0[state] + self.Q1[state], 0)
        if epsilon > 0:
            # If probability is greater than epsilon, choose a random action
            p = random.uniform(0, 1)
            if p > epsilon:
                return random.randint(0, self.num_actions - 1)
        # Otherwise choose greedy max action
        return int(action_max_idx)
        
    def train(self, env, iterations, epsilon, learning_rate, discount, algo):
        if algo == 'double-q-learning':
            self.Q1 = torch.zeros([500, 6], dtype=torch.float32)

        for i in range(iterations):
            state = env.reset()
            q_update = 0
            a_ = None
            ep_ended = False
            while not ep_ended:
                # If next action is not defined, take action based on current state
                action = a_ if a_ else self.action(state, epsilon)
                
                # (state', reward, ep_ended, prob=1.0)
                s_, r, ep_ended, prob = env.step(action)
                
                if algo == 'sarsa':
                    # Uses next action
                    a_ = self.action(s_, epsilon)
                    update = self.Q0[s_, a_]
                elif algo == 'q-learning':
                    update = self.Q0[s_, self.action(s_)]
                elif algo == 'expected-sarsa':
                    # Gives all actions (1 - epsilon) / num_action probabilities
                    next_action_probs = torch.ones([self.num_actions]).fill_((1 - epsilon) / self.num_actions)
                    
                    # Adds epsilon probability to best action
                    next_action_probs[self.action(s_)] += epsilon
                    
                    update = torch.sum(next_action_probs * self.Q0[s_, :])
                elif algo == 'double-q-learning':
                    # Next action uses both Q's
                    a_ = self.action_d(s_, epsilon)
                    
                    # Updates only one of the Q's using the other Q
                    q_update = random.randint(0, 2)
                    if q_update == 0:
                        update = self.Q1[s_, self.action(s_, Q=self.Q0)]
                    else:
                        update = self.Q0[s_, self.action(s_, Q=self.Q1)]  
                else:
                    raise Exception('Invalid algo')
                    
                if q_update == 0:
                    self.Q0[state, action] += learning_rate * (r + (discount * update) - self.Q0[state, action])
                else:
                    self.Q1[state, action] += learning_rate * (r + (discount * update) - self.Q1[state, action])

                state = s_
    
    def train_nstep(self, env, iterations, epsilon, learning_rate, discount, n, nsigma, off_policy=False):
        assert len(nsigma) == n, 'nsigma must have n values of {0, 1}'
        bn = n + 1   # Buffers needs to hold n+1 values
        
        for ep in range(iterations):
            sbuffer = torch.zeros([bn], dtype=torch.int64)      # state buffer
            abuffer = torch.zeros([bn], dtype=torch.int64)      # action buffer
            qbuffer = torch.zeros([bn], dtype=torch.float32)    # state-action (Q) value buffer
            tdbuffer0 = torch.zeros([bn], dtype=torch.float32)  # tderror buffer (sigma=0)
            tdbuffer1 = torch.zeros([bn], dtype=torch.float32)  # tderror buffer (sigma=1)
            pbuffer = torch.zeros([bn], dtype=torch.float32)    # state-action prob buffer
            rbuffer = torch.zeros([bn], dtype=torch.float32)    # importance sampling ratio buffer
            
            T = np.inf     # End of episode time
            t = 0          # Current time in episode
            tau = 0        # Time whose estimate is being updated

            state = env.reset()
            if off_policy:
                action = self.action_b(state)
            else:
                action = self.action(state, epsilon)
            
            # Update initial buffers
            sbuffer[t % bn] = int(state)
            abuffer[t % bn] = int(action)
            qbuffer[t % bn] = self.Q0[state, action]

            while t < T:
                if t < T:
                    # (state', reward, ep_ended, prob=1.0)
                    state, r, ep_ended, prob = env.step(action)
#                     print(f'Overwriting state {(t + 1) % bn}')
                    sbuffer[(t + 1) % bn] = int(state)
                    
#                     print(f'Reward: {r}')
                    
                    if ep_ended:
                        T = t + 1
                        tdbuffer0[t % bn] = r - qbuffer[t % bn]
                        tdbuffer1[t % bn] = r - qbuffer[t % bn]
                    else:
                        if off_policy:
                            action = self.action_b(state)
                        else:
                            action = self.action(state, epsilon)
                        
                        abuffer[(t + 1) % bn] = int(action)
                        qbuffer[(t + 1) % bn] = self.Q0[state, action]

                        # For expected update sigma=0
#                         print('Sigma = 0')
                        # Gives all actions (1 - epsilon) / num_action probabilities
                        next_action_probs = torch.ones([self.num_actions]).fill_((1 - epsilon) / self.num_actions)

                        # Adds epsilon probability to best action
                        next_action_probs[self.action(state)] += epsilon
                        
#                         print(f'Next action probs {next_action_probs}')
#                         print(f'State values {self.Q0[state, :]}')

                        tdbuffer0[t % bn] = r + (discount * torch.sum(next_action_probs * self.Q0[state, :])) - qbuffer[t % bn]

#                         print(f'tdbuffer0: {tdbuffer0}')
                        # For td update sigma=1

                        tdbuffer1[t % bn] = r + (discount * qbuffer[(t + 1) % bn]) - qbuffer[t % bn]
#                         print(f'tdbuffer1: {tdbuffer1}')

#                         print(f'Updating pbuffer {self.p_action(state, action, epsilon)}')
                        pbuffer[(t + 1) % bn] = self.p_action(state, action, epsilon)
                        rbuffer[(t + 1) % bn] = pbuffer[(t + 1) % bn] / self.p_action_b(state, action)
                tau = t - n + 1
                if tau >= 0:
#                     print(f'Updating at t={t} for {tau % bn}')
                    p = 1
                    z = 1
                    G = qbuffer[tau % bn]
#                     print(f'G={G}')

#                     print(f'sbuffer: {sbuffer}')
#                     print(f'abuffer: {abuffer}')
#                     print(f'tdbuffer0: {tdbuffer0}')
#                     print(f'tdbuffer1: {tdbuffer1}')
#                     print(f'pbuffer: {pbuffer}')
#                     print(f'rbuffer: {rbuffer}')
                    for i,k in enumerate(range(tau, min(tau + n, T))):
#                         print(f'iter {k%bn}')
#                         print(f'nsigma={nsigma[i]}')
                        if nsigma[i] == 0:
#                             print(f't at z,tdbuffer0 = {z},{tdbuffer0[k % n]}')
                            G += (z * tdbuffer0[k % bn])
                        else:
#                             print(f't at z,tdbuffer1 = {z},{tdbuffer1[k % n]}')
                            G += (z * tdbuffer1[k % bn])
#                         print(f'G={G}')
                        z *= (discount * (((1 - nsigma[(i + 1) % n]) * pbuffer[(k + 1) % bn]) + nsigma[(i + 1) % n]))
                        p *= (1 - nsigma[i % n] + (nsigma[i % n] * rbuffer[k % bn]))
#                     print(f'Q0 = {self.Q0[sbuffer[tau % n], abuffer[tau % n]]}')
#                     print(f'Sapmling ratio = {p}')
#                     print(f'Update = {(learning_rate * p * (G - self.Q0[sbuffer[tau % n], abuffer[tau % n]]))}')
                    
#                     print(f'Update: {G}')
                    
#                     print(f'State: {sbuffer[tau % bn]}')
#                     print(f'Action: {abuffer[tau % bn]}')
                    if off_policy:
#                         print(f'p: {p}')
                        self.Q0[sbuffer[tau % bn], abuffer[tau % bn]] += (learning_rate * p * (G - self.Q0[sbuffer[tau % bn], abuffer[tau % bn]]))
                    else:
                        self.Q0[sbuffer[tau % bn], abuffer[tau % bn]] += (learning_rate * (G - self.Q0[sbuffer[tau % bn], abuffer[tau % bn]]))

#                     print(f'Q0 updated to {self.Q0[sbuffer[tau % bn], abuffer[tau % bn]]}')
#                     print(f'{self.Q0[sbuffer[tau % bn], :]}')
#                     print('--------------------')
                    
                t += 1

    def play(self, env):
        state = env.reset()
        ep_ended = False
                
        env.render()
        while not ep_ended:
            if self.Q1 is None:
                action = self.action(state, epsilon=0)
            else:
                action = self.action_d(state, epsilon=0)

            s_, r, ep_ended, prob = env.step(action)
            state = s_
            
            env.render()

In [155]:
t = TaxiPolicy()
t.train_nstep(env, iterations=30000, epsilon=0.9, learning_rate=0.001, discount=0.9, n=4, nsigma=[1, 1, 1, 0], off_policy=True)

In [156]:
print(t.Q0)
t.play(env)

tensor([[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [-6.7681, -6.7628, -6.9802, -6.7725, -6.7567, -6.8397],
        [-6.2789, -6.2204, -6.2027, -6.3028, -5.7554, -6.5589],
        ...,
        [-4.8371, -4.3556, -5.0090, -5.1344, -6.3495, -6.2179],
        [-6.5064, -6.4800, -6.4577, -6.5177, -7.4614, -6.5569],
        [-0.1596, -0.2511, -0.6823, -0.1408, -0.2863, -4.7454]])
+---------+
|[35mR[0m: | : :G|
| : :[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+

+---------+
|[35mR[0m: | : :G|
| : : :[43m [0m: |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : :[43m [0m: |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : |[43m [0m: |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |[34;1m[43mB[0m[0m: |
+---------+
  (Sout