In [1]:
import numpy as np
import gym
import random

In [6]:
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy exploration policy.

    The agent keeps one Q-value per (state, action) pair in ``q_table``, whose
    shape is ``(env.observation_space.n, env.action_space.n)``. The environment
    is expected to follow the five-value step API
    ``(obs, reward, terminated, truncated, info)`` and to return
    ``(obs, info)`` from ``reset()``, which is how this class unpacks them.

    Args:
        env: Environment used for training. Must expose ``observation_space.n``,
            ``action_space.n``, ``action_space.sample()``, ``reset()`` and
            ``step(action)``.
        learning_rate: Step size applied to each temporal-difference update.
        discount_rate: Discount factor applied to the bootstrapped future value.
        max_epsilon: Exploration probability used at the start of training.
        min_epsilon: Floor that the exploration probability decays towards.
        decay_rate: Exponential decay rate of epsilon per episode.
        num_episodes: Number of training episodes run by ``train_agent``.
        max_steps: Upper bound on steps per episode (training and playback).
    """

    def __init__(self, env, learning_rate=0.7, discount_rate=0.9, max_epsilon=1.0, min_epsilon=0.01, decay_rate=0.005,
                 num_episodes=1000, max_steps=99):
        self.env = env
        self.learning_rate = learning_rate
        self.discount_rate = discount_rate
        self.max_epsilon = max_epsilon
        self.min_epsilon = min_epsilon
        self.decay_rate = decay_rate
        self.num_episodes = num_episodes
        self.max_steps = max_steps
        self.epsilon = max_epsilon  # Start fully exploratory; decayed after every episode

        # One row per state, one column per action; all values start at zero.
        self.q_table = np.zeros((env.observation_space.n, env.action_space.n))

    def epsilon_greedy_policy(self, state):
        """Pick an action for ``state``.

        With probability ``epsilon`` a random action is sampled from the
        environment's action space; otherwise the action with the highest
        Q-value for ``state`` is returned.
        """
        exp_exp_tradeoff = random.uniform(0, 1)

        if exp_exp_tradeoff > self.epsilon:
            return np.argmax(self.q_table[state, :])
        return self.env.action_space.sample()

    def train_agent(self):
        """Run ``num_episodes`` training episodes, updating ``q_table`` in place.

        Each episode lasts at most ``max_steps`` steps and stops early when the
        environment reports either termination or truncation. Epsilon is
        decayed exponentially towards ``min_epsilon`` after every episode.
        """
        for episode in range(self.num_episodes):
            state = self.env.reset()[0]

            for _ in range(self.max_steps):
                action = self.epsilon_greedy_policy(state)
                new_state, reward, terminated, truncated, _info = self.env.step(action)

                # Q-learning update. A terminal transition has no future
                # return, so only bootstrap from new_state while the episode
                # has not terminated (truncation still bootstraps).
                future_value = 0.0 if terminated else np.max(self.q_table[new_state, :])
                td_target = reward + self.discount_rate * future_value
                self.q_table[state, action] = self.q_table[state, action] + \
                    self.learning_rate * (td_target - self.q_table[state, action])

                state = new_state

                # Stepping an environment after it ended (for either reason)
                # is invalid, so stop on truncation as well as termination.
                if terminated or truncated:
                    break

            self.epsilon = self.min_epsilon + (self.max_epsilon - self.min_epsilon) * np.exp(-self.decay_rate * episode)

    def show_trained_agent(self, env=None):
        """Play one greedy episode, printing the running score at every step.

        Args:
            env: Optional environment to play in. When omitted, a new
                environment is created with ``render_mode="human"`` using the
                id found on ``self.env.spec`` (falling back to ``'Taxi-v3'``
                when no spec id is available) and is closed before returning.
                A caller-supplied environment is left open.

        Returns:
            The total reward accumulated over the episode.
        """
        owns_env = env is None
        if owns_env:
            # Use a separate local environment for rendering so the training
            # environment stored on self.env is neither replaced nor closed.
            spec = getattr(self.env, "spec", None)
            env_id = getattr(spec, "id", None) or 'Taxi-v3'
            env = gym.make(env_id, render_mode="human")

        rewards = 0
        try:
            state = env.reset()[0]

            for s in range(self.max_steps):
                print("TRAINED AGENT")
                print("Step {}".format(s+1))

                # Purely greedy: no exploration during playback.
                action = np.argmax(self.q_table[state, :])
                new_state, reward, terminated, truncated, _info = env.step(action)
                rewards += reward

                print(f"score: {rewards}")
                state = new_state

                if terminated or truncated:
                    break
        finally:
            # Release the render window even if a step raised.
            if owns_env:
                env.close()

        return rewards



In [7]:
# Train a Q-learning agent on Taxi-v3, then replay one greedy episode.
if __name__ == "__main__":
    env = gym.make('Taxi-v3')
    try:
        agent = QLearningAgent(env)
        agent.train_agent()
        agent.show_trained_agent()
    finally:
        # Release the training environment created above even if training
        # or playback fails part-way through.
        env.close()

  if not isinstance(terminated, (bool, np.bool8)):


TRAINED AGENT
Step 1


  if not isinstance(terminated, (bool, np.bool8)):


score: -1
TRAINED AGENT
Step 2
score: -2
TRAINED AGENT
Step 3
score: -3
TRAINED AGENT
Step 4
score: -4
TRAINED AGENT
Step 5
score: -5
TRAINED AGENT
Step 6
score: -6
TRAINED AGENT
Step 7
score: -7
TRAINED AGENT
Step 8
score: -8
TRAINED AGENT
Step 9
score: -9
TRAINED AGENT
Step 10
score: -10
TRAINED AGENT
Step 11
score: 10
