In [1]:
import numpy as np
import gym
import time

In [2]:
class TDLearning:
    
    def __init__(self):
        """Create the Taxi-v3 gym environment and empty reward histories."""
        env = gym.make('Taxi-v3')
        self.env = env

        # Sizes of the discrete observation and action spaces; InitQ uses
        # them as the two dimensions of the Q table.
        self.n_states = env.observation_space.n
        self.n_actions = env.action_space.n

        # Reward bookkeeping lists, both start empty.
        # NOTE(review): presumably meant for per-step and accumulated
        # rewards; nothing in the visible code fills them -- confirm usage.
        self.timestep_reward = []
        self.accumulated_reward = []
        
    # Initialize Q table
    def InitQ(self):
        
        self.Q = np.random.rand(self.n_states,self.n_actions)
    
    # Epsilon greedy method for SARSA
    def EpsilonGreedy(self,epsilon,state,train=False):
        
        if train or np.random.rand()<epsilon:
            action = np.argmax(self.Q[state,:])
        else:
            action = np.random.randint(0,self.n_actions)
        
        return action
    
    def SARSA(self,gamma,alpha,s1,a1,s2,a2,r,done):
        
        # implement the sarsa algorithm (bellman equation)
        if not done:
            self.Q[s1,a1] = self.Q[s1,a1]+alpha*(r+gamma*self.Q[s2,a2]-self.Q[s1,a1])
        
    def QLearning(self,gamma,alpha,s1,a1,s2,r,done):
        
        # In Q learning, no policy to choose action
        a2 = np.argmax(self.Q[s2,:])
        
        # implement the Qlearning algortihm (bellman equation)
        if not done:
            self.Q[s1,a1] = self.Q[s1,a1]+alpha*(r+gamma*self.Q[s2,a2]-self.Q[s1,a1])
            
        return a2
    
    def MainFunction(self,episodes,max_steps,method='sarsa'):
        
        self.InitQ()
        self.episodes_reward = []
        
        epsilon = 0.1
        gamma = 0.9
        alpha = 0.1
        t = 0
        
        for ep in range(episodes):
            
            print('current episode: ',ep)
            
            # sample a start state of the episode
            s1 = self.env.reset()
            # initialize first action in each episode
            # both SARSA and Q-learnin apply epsilon-greedy to initialize
            a1 = self.EpsilonGreedy(epsilon,s1)
            
            done = False
            self.total_reward=0
            
            while t<=max_steps:
                
                t+=1
                
                if method=='sarsa':
                    
                    # execute the state-action pair, let MDP flows
                    s2, reward, done, info = self.env.step(a1)
                    a2 = self.EpsilonGreedy(epsilon,s2)
                    
                    # Record the total reward in this episode
                    self.total_reward += reward
                    
                    # Update Q table
                    self.SARSA(gamma,alpha,s1,a1,s2,a2,reward,done)
                    
                if method=='Q-Learning':
                    
                    s2, reward, done, info = self.env.step(a1)
                    a2 = self.QLearning(gamma,alpha,s1,a1,s2,reward,done)
                    
                s1 = s2
                a1 = a2
                
                # if this episode ends, stop the while loop
                if done:
                    break
                    
            self.episodes_reward.append(self.total_reward)