In [2]:
import numpy as np
import random
from IPython.display import clear_output
from collections import deque
import progressbar

In [3]:
import gym
# Create the Taxi-v3 environment and keep the object exposed by its `.env` attribute.
# NOTE(review): presumably `.env` bypasses gym's TimeLimit wrapper so episodes are
# never truncated automatically — confirm against the installed gym version.
env = gym.make('Taxi-v3').env

In [4]:
# Show the environment's current state as ANSI-coloured text.
print(env.render(mode='ansi'))

+---------+
|R: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+




In [5]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames, delay=.1):
    """Replay recorded frames one after another as a simple animation.

    Args:
        frames: iterable of frame dicts in the format accepted by
            `print_a_frame` (keys 'frame', 'state', 'action', 'reward').
        delay: seconds to pause after each frame. Defaults to 0.1, the
            pause that used to be hard-coded.
    """
    for step, frame in enumerate(frames):
        # print_a_frame clears the previous cell output itself, so no extra
        # clear_output call is needed here.
        print_a_frame(frame, step)
        sleep(delay)
        
def print_a_frame(frame, idx=None):
    """Clear the cell output and print a single recorded frame.

    Args:
        frame: dict with keys 'frame' (rendered text), 'state', 'action'
            and 'reward'.
        idx: optional zero-based frame index. When given, it is shown as a
            one-based "Timestep" line.
    """
    clear_output(wait=True)
    print(frame['frame'])
    # Compare against None explicitly: index 0 is falsy, and a plain truth
    # test would silently drop the "Timestep: 1" line for the first frame.
    if idx is not None:
        print('Timestep: {}'.format(idx+1))
    print('State: {}'.format(frame['state']))
    print('Action: {}'.format(frame['action']))
    print('Reward: {}'.format(frame['reward']))

In [6]:
import tensorflow as tf

class DQNAgent:
    """Deep Q-Network agent for a Gym environment with discrete states and actions.

    The agent keeps two identically built Keras networks: `dqn`, which is
    trained, and `tdqn`, a target network that supplies the bootstrap value
    for non-terminal transitions and is refreshed with
    `sync_target_network`. Transitions are stored in a bounded replay buffer.

    The methods pass states straight to the networks; the notebook's callers
    supply them as arrays of shape (1, 1) holding the integer state id.
    """

    def __init__(self, gym_env, epsilon, gamma, alpha):
        """Store the hyperparameters and build both networks.

        Args:
            gym_env: environment whose `observation_space.n` and
                `action_space.n` give the number of states and actions.
            epsilon: probability of taking a random action in `take_action`.
            gamma: discount factor applied to the target network's best
                next-state value.
            alpha: stored on the agent but not passed to the optimizer; the
                models are compiled with Keras's default 'adam' settings.
        """
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.gym_env = gym_env

        self._num_states = gym_env.observation_space.n
        self._num_actions = gym_env.action_space.n

        # Bounded buffer: once full, the oldest transitions are discarded.
        self.replay_buffer = deque(maxlen=200000)

        # Online network (trained) and target network (bootstrap values).
        # _build_mirrored_model is an alternative builder for the online net.
        self.dqn = self._build_compiled_model()
        self.tdqn = self._build_compiled_model()

    def sync_target_network(self):
        """Copy the online network's weights into the target network."""
        self.tdqn.set_weights(self.dqn.get_weights())

    def populate_replay_buffer(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def _build_model(self):
        """Build the uncompiled Q-network.

        The integer state is embedded into 10 dimensions, flattened, passed
        through two 50-unit ReLU layers, and mapped to one linear output per
        action.
        """
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Embedding(self._num_states, 10, input_length=1))
        model.add(tf.keras.layers.Reshape((10,)))
        model.add(tf.keras.layers.Dense(50, activation='relu'))
        model.add(tf.keras.layers.Dense(50, activation='relu'))
        model.add(tf.keras.layers.Dense(self._num_actions, activation='linear'))
        return model

    def _compile_model(self, model):
        """Compile `model` in place with MSE loss and the 'adam' optimizer."""
        model.compile(loss='mse', optimizer='adam')

    def _build_compiled_model(self):
        """Return a freshly built and compiled Q-network."""
        model = self._build_model()
        self._compile_model(model)
        return model

    def _build_mirrored_model(self):
        """Build and compile the Q-network inside a MirroredStrategy scope."""
        mirrored_strategy = tf.distribute.MirroredStrategy()
        with mirrored_strategy.scope():
            model = self._build_model()
            self._compile_model(model)
        return model

    def take_action(self, state):
        """Pick an action epsilon-greedily.

        With probability `epsilon` a random action is sampled from the
        environment's action space; otherwise the greedy action from
        `infer_action` is returned.
        """
        if random.uniform(0, 1) < self.epsilon:
            return self.gym_env.action_space.sample()  # Explore action space
        return self.infer_action(state)  # Exploit learned values

    def train_dqn(self, batch_size):
        """Fit the online network on a random minibatch from the replay buffer.

        For each sampled transition the online network's own prediction is
        used as the regression target, with only the entry of the action
        actually taken replaced by `reward` (terminal transition) or
        `reward + gamma * max(target_network(next_state))` (otherwise).

        Does nothing while the buffer holds fewer than `batch_size`
        transitions, since a minibatch of that size cannot be drawn yet.

        Note: every sample costs up to two `predict` calls and one `fit`
        call, so large batches are slow.
        """
        if len(self.replay_buffer) < batch_size:
            return

        minibatch = random.sample(self.replay_buffer, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = self.dqn.predict(state, verbose=0)

            # Use the sampled transition's own action and terminal flag, not
            # same-named variables from the surrounding notebook.
            if done:
                target[0][action] = reward
            else:
                next_q_values = self.tdqn.predict(next_state, verbose=0)
                target[0][action] = reward + self.gamma * np.amax(next_q_values)

            self.dqn.fit(state, target, epochs=1, verbose=0)

    def infer_action(self, state):
        """Return the greedy action (index of the largest Q-value) as an int."""
        q_values = self.dqn.predict(state, verbose=0)
        return int(np.argmax(q_values))

In [7]:
# Hyperparameters shared by the agent and the training loop
alpha = 0.1  # handed to DQNAgent, which stores it but compiles its models with the default 'adam' optimizer
gamma = 0.6  # discount factor applied to the best next-state value in the training target
epsilon = 0.1  # probability that take_action picks a random action
num_of_episodes = 10  # number of training episodes
max_episode_length = 1000  # step limit used to cut off a training episode
batch_size = 256  # replay minibatch size; training is triggered in multiples of 2*batch_size

In [8]:
# Build the DQN agent for the Taxi environment from the hyperparameters above.
dqn_learn = DQNAgent(env, epsilon, gamma, alpha)

In [9]:
%%time
"""Training the agent"""

import random
from tqdm import tqdm
from IPython.display import clear_output

for epi in tqdm(range(num_of_episodes)):
    state = env.reset()
    state = np.reshape(state, [1, 1])
    penalties, reward, = 0, 0
    done = False
    
    episode_length_counter = 0
    while not done:
        if episode_length_counter > max_episode_length:
            # break the episode after max episode length
            break
        episode_length_counter += 1
        
        #get action with epsilon 
        action = dqn_learn.take_action(state)
        next_state, reward, done, info = env.step(action) 
        next_state = np.reshape(next_state, [1, 1])
        dqn_learn.populate_replay_buffer(state, action, reward, next_state, done)
        if reward == -10:
            penalties += 1

        state = next_state
        
        if len(dqn_learn.replay_buffer) % (2*batch_size) == 0:
            print('Training dqn...')
            dqn_learn.train_dqn(batch_size)
            
        if episode_length_counter%100 == 0:
            print('Episode length: {}, state: {}'.format(episode_length_counter, state))
        
            
    # after one episode is done, copy dqn to tdqn
    print('*** Episode done and syncing target network...')
    dqn_learn.sync_target_network()

    if (epi+1) % 10 == 0:
        clear_output(wait=True)
        print('Episode: {}, penalty: {}'.format(epi+1, penalties))

print("Training finished.\n")

  0%|          | 0/10 [00:00<?, ?it/s]

Episode length: 100, state: [[484]]
Episode length: 200, state: [[464]]


  0%|          | 0/10 [00:05<?, ?it/s]


KeyboardInterrupt: 

In [10]:
# Spot-check the greedy policy on one hand-picked state id (428), shaped (1, 1)
# like the states fed to the agent elsewhere. Note this rebinds the global `state`.
state = np.reshape(428, [1, 1])
dqn_learn.infer_action(state)

1

In [None]:
"""Evaluate agent's performance after Q-learning"""

total_epochs, total_penalties = 0, 0
# NOTE: this rebinds the training hyperparameter of the same name.
num_of_episodes = 100
frames = []

from tqdm import tqdm

for epi in tqdm(range(num_of_episodes)):
    state = env.reset()
    state = np.reshape(state, [1, 1])
    epochs, penalties, reward = 0, 0, 0

    done = False

    # The greedy policy has no exploration, so it can cycle between states
    # without ever reaching `done`. Cap each episode with the same step limit
    # used for training so the evaluation always terminates.
    while not done and epochs < max_episode_length:
        action = dqn_learn.infer_action(state)
        state, reward, done, info = env.step(action)
        state = np.reshape(state, [1, 1])
        # Record the rendering and state reached after taking `action`.
        frames.append({
            'frame': env.render(mode='ansi'),
            'state': state,
            'action': action,
            'reward': reward,
        })

        if reward == -10:
            penalties += 1

        epochs += 1

    total_penalties += penalties
    total_epochs += epochs

print('Results after {} num_of_episodes:'.format(num_of_episodes))
print('\tAverage timesteps per episode: {}'.format(total_epochs / num_of_episodes))
print('\tAverage penalties per episode: {}'.format(total_penalties / num_of_episodes))

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Total number of frames recorded across all evaluation episodes.
len(frames)

In [None]:
# Replay every recorded evaluation frame in order.
print_frames(frames)