In [3]:
import numpy as np
import gym
import random
import matplotlib.pyplot as plt
import time
from IPython.display import clear_output
import pickle
from collections import deque

In [4]:
# Import ML libraries
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Flatten
from tensorflow.keras.optimizers import Adam

In [66]:
class Agent:
    """Deep Q-Network (DQN) agent for an environment with discrete states and actions.

    The agent keeps no Q-table: Q-values are approximated by a small Keras
    network (see ``buildDQN``) that maps a state index to one Q-value per
    action. Transitions are stored in a bounded replay memory and sampled at
    random for training.

    Parameters
    ----------
    env : gym environment
        Must expose ``observation_space.n``, ``action_space.n``,
        ``action_space.sample()`` and ``reset()``.
    """

    def __init__(self, env):
        self.env = env

        self.gamma = 0.96           # discount factor
        self.epsilon = 1.           # exploration rate (start fully exploratory)
        # Multiplicative decay applied once per training step. The factor must
        # be close to 1: the previous value of 0.01 collapsed epsilon from 1.0
        # to its minimum after a single call to ``train``.
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01

        self.alpha = 0.01           # optimizer learning rate
        self.alpha_decay = 0.01     # optimizer learning-rate decay
        self.action = None

        self.n_observation_space = env.observation_space.n
        self.n_action_space = env.action_space.n

        self.current_state = env.reset()

        # Replay memory of (state, action, reward, next_state, done) tuples
        # that are fed to the network for learning.
        self.memory = deque(maxlen=100000)

        # The network is defined in a separate method.
        self.model = self.buildDQN()

    def buildDQN(self):
        """Build and compile the Q-network.

        Architecture: Embedding(state index -> 10-d vector) -> two tanh Dense
        layers of 48 units -> linear Dense with one output per action.

        Returns
        -------
        The compiled Keras ``Sequential`` model.
        """
        model = Sequential()
        # Size the embedding from the environment rather than a hard-coded 500.
        model.add(Embedding(self.n_observation_space, 10, input_length=1))
        model.add(Reshape((10,)))
        model.add(Dense(48, activation='tanh'))
        model.add(Dense(48, activation='tanh'))
        model.add(Dense(self.n_action_space, activation='linear'))
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.alpha, decay=self.alpha_decay))

        return model

    def selectAction(self, state):
        """Pick an action for ``state`` using an epsilon-greedy policy.

        With probability ``self.epsilon`` a random action is sampled from the
        agent's own environment; otherwise the action with the highest
        predicted Q-value is returned.
        """
        if random.uniform(0, 1) < self.epsilon:
            # Use the environment this agent was built with, not a global.
            return self.env.action_space.sample()

        q_values = self.model.predict(np.array(state).reshape(1, 1), verbose=0)
        return np.argmax(q_values)

    def train(self, batch_size):
        """Fit the network on a random minibatch from the replay memory.

        For each sampled transition the target for the taken action is
        ``reward`` if the episode ended, otherwise
        ``reward + gamma * max_a Q(next_state, a)``; the targets for all other
        actions are the network's current predictions. Epsilon is decayed once
        per call, never below ``epsilon_min``.

        Does nothing when the memory holds fewer than ``batch_size`` items.
        """
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)

        # Stack the minibatch so the network is evaluated twice in total
        # instead of twice per sample.
        states = np.array([np.ravel(s)[0] for s, _, _, _, _ in minibatch]).reshape(-1, 1)
        next_states = np.array([np.ravel(ns)[0] for _, _, _, ns, _ in minibatch]).reshape(-1, 1)
        actions = np.array([a for _, a, _, _, _ in minibatch], dtype=int)
        rewards = np.array([r for _, _, r, _, _ in minibatch], dtype=float)
        dones = np.array([d for _, _, _, _, d in minibatch], dtype=bool)

        q_targets = np.array(self.model.predict(states, verbose=0), dtype=float)
        q_next = np.asarray(self.model.predict(next_states, verbose=0))

        bootstrapped = rewards + self.gamma * q_next.max(axis=1)
        q_targets[np.arange(len(minibatch)), actions] = np.where(dones, rewards, bootstrapped)

        self.model.fit(states, q_targets, batch_size=len(minibatch), verbose=0)

        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def add_to_memory(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def updateParameters(self):
        """Linearly reduce epsilon by 0.05 (floor 0.1) and update alpha.

        NOTE(review): with the initial alpha of 0.01 the expression below
        evaluates to 0.6, i.e. it raises alpha. ``self.alpha`` is only read
        when the optimizer is created in ``buildDQN``, so this does not change
        the already-compiled model. Confirm whether this method is still needed.
        """
        self.epsilon = np.maximum(self.epsilon - 0.05, 0.1)
        self.alpha = np.maximum(self.alpha - 0.05, 0.6)

    def reset(self):
        """Re-run ``__init__`` on the same environment.

        This discards the replay memory and builds a fresh, untrained network.
        """
        self.__init__(self.env)

In [49]:
class sarsaAgent:
    """Value-based agent with an online Q-network and a separate target network.

    Note: although the class is named ``sarsaAgent``, the training target in
    ``train`` uses the maximum target-network Q-value of the next state
    (a Q-learning-style target), not the value of the action actually taken.

    Parameters
    ----------
    enviroment : gym environment
        Must expose ``observation_space.n``, ``action_space.n`` and
        ``action_space.sample()``. (The parameter name keeps its original
        spelling so existing keyword callers continue to work.)
    optimizer : Keras optimizer
        Used to compile both networks.
    """

    def __init__(self, enviroment, optimizer):

        # Keep a handle on the environment: selectAction needs it to sample
        # random actions and must not rely on a global variable.
        self._environment = enviroment

        # Initialize attributes
        self._state_size = enviroment.observation_space.n
        self.n_action_space = enviroment.action_space.n
        self._optimizer = optimizer

        self.experience_replay = deque(maxlen=2000)

        # Initialize discount and exploration rate
        self.gamma = 0.6
        self.epsilon = 0.1

        # Build networks; the target network starts as a copy of the Q-network
        self.q_network = self._build_compile_model()
        self.target_network = self._build_compile_model()
        self.align_target_model()

    def store(self, state, action, reward, next_state, terminated):
        """Append one transition tuple to the experience-replay buffer."""
        self.experience_replay.append((state, action, reward, next_state, terminated))

    def _build_compile_model(self):
        """Build and compile one network.

        Architecture: Embedding(state index -> 10-d vector) -> two ReLU Dense
        layers of 50 units -> linear Dense with one output per action.
        """
        model = Sequential()
        model.add(Embedding(self._state_size, 10, input_length=1))
        model.add(Reshape((10,)))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(50, activation='relu'))
        model.add(Dense(self.n_action_space, activation='linear'))

        model.compile(loss='mse', optimizer=self._optimizer)
        return model

    def align_target_model(self):
        """Copy the Q-network's weights into the target network."""
        self.target_network.set_weights(self.q_network.get_weights())

    def selectAction(self, state):
        """Pick an action for ``state`` using an epsilon-greedy policy.

        With probability ``self.epsilon`` a random action is sampled from the
        agent's environment; otherwise the Q-network's arg-max action is used.
        """
        if np.random.rand() <= self.epsilon:
            return self._environment.action_space.sample()

        q_values = self.q_network.predict(state, verbose=0)
        return np.argmax(q_values[0])

    def train(self, batch_size):
        """Run one fit step per transition in a random minibatch.

        The target for the taken action is ``reward`` for terminal
        transitions, otherwise ``reward + gamma * max(target_network(next_state))``.
        Does nothing when the buffer holds fewer than ``batch_size`` items.
        """
        if len(self.experience_replay) < batch_size:
            return

        minibatch = random.sample(self.experience_replay, batch_size)

        for state, action, reward, next_state, terminated in minibatch:

            target = self.q_network.predict(state, verbose=0)

            if terminated:
                target[0][action] = reward
            else:
                t = self.target_network.predict(next_state, verbose=0)
                target[0][action] = reward + self.gamma * np.amax(t)

            self.q_network.fit(state, target, epochs=1, verbose=0)

In [50]:
# Create the Taxi-v3 environment shared by the agent and simulation cells below.
env = gym.make("Taxi-v3")

In [69]:
# Build the DQN agent on the shared environment and print its network layout.
dqnTaxiDriver = Agent(env)
dqnTaxiDriver.model.summary()

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 1, 10)             5000      
_________________________________________________________________
reshape_14 (Reshape)         (None, 10)                0         
_________________________________________________________________
dense_41 (Dense)             (None, 48)                528       
_________________________________________________________________
dense_42 (Dense)             (None, 48)                2352      
_________________________________________________________________
dense_43 (Dense)             (None, 6)                 294       
Total params: 8,174
Trainable params: 8,174
Non-trainable params: 0
_________________________________________________________________


In [45]:
# Build the target-network agent with its own Adam optimizer and print the
# layout of its online Q-network.
optimizer = Adam(learning_rate=0.01)
sarsaTaxiDriver = sarsaAgent(env, optimizer)
sarsaTaxiDriver.q_network.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1, 10)             5000      
_________________________________________________________________
reshape_9 (Reshape)          (None, 10)                0         
_________________________________________________________________
dense_26 (Dense)             (None, 50)                550       
_________________________________________________________________
dense_27 (Dense)             (None, 50)                2550      
_________________________________________________________________
dense_28 (Dense)             (None, 6)                 306       
Total params: 8,406
Trainable params: 8,406
Non-trainable params: 0
_________________________________________________________________


In [37]:
# Test sim run: take up to 10 random actions (no agent involved) and render
# the environment after each one.

state = env.reset()

for i in range(10):
    # Sample a random action and apply it to the environment
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    env.render()

    # Note: this prints the action first, then the resulting state.
    print("Current state: ", action, state)
    # Short pause so each rendered frame is visible before it is cleared
    time.sleep(0.05)
    clear_output(wait=True)    
    
    # Stop early if the episode has ended
    if(done):
        break

+---------+
|[35mR[0m: | : :G|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (West)
Current state:  3 252


In [None]:
# Simulation parameters
batch_size = 32
num_of_episodes = 10
timesteps_per_episode = 200
# Save a checkpoint after every `checkpoint_every` completed episodes.
checkpoint_every = 500

for e in range(num_of_episodes):
    # Start every episode from a freshly reset environment
    state = env.reset()

    for timestep in range(timesteps_per_episode):
        # Choose an action with the agent's epsilon-greedy policy
        action = dqnTaxiDriver.selectAction(state)

        # Apply it and store the resulting transition in the replay memory
        next_state, reward, done, info = env.step(action)
        dqnTaxiDriver.add_to_memory(np.array(state).reshape(1), action, reward,
                                    np.array(next_state).reshape(1), done)

        state = next_state

        env.render()

        # Report both counters; the old message labelled the timestep as the
        # episode number.
        print("Episode: ", e, " Timestep: ", timestep)
        clear_output(wait=True)

        if done:
            break

        # Only train once the memory holds more than one minibatch
        if len(dqnTaxiDriver.memory) > batch_size:
            dqnTaxiDriver.train(batch_size)

    # Count completed episodes so the first checkpoint is not the untrained
    # model from episode 0.
    if (e + 1) % checkpoint_every == 0:
        dqnTaxiDriver.model.save("pickled_dqn.hd5")

+---------+
|R: | : :[35mG[0m|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (South)
Current episode number:  26


In [26]:
# Save trained model
# NOTE: the recorded output below shows assets written to
# "pickled_dqn.hd5/assets", i.e. this path is saved as a directory.
dqnTaxiDriver.model.save("pickled_dqn.hd5")

INFO:tensorflow:Assets written to: pickled_dqn.hd5/assets
