In [1]:
# https://rubikscode.net/2019/07/08/deep-q-learning-with-python-and-tensorflow-2-0/

In [2]:
import numpy as np
import random
from IPython.display import clear_output
from collections import deque
import progressbar

In [3]:
import gym
# Create the Taxi-v3 environment and keep the object exposed by the `.env`
# attribute of what gym.make returns (its repr, printed in the next cell, is
# <TaxiEnv<Taxi-v3>>).
# NOTE(review): presumably `.env` strips gym's wrapper(s), including any
# per-episode step limit, so episodes only end when the env reports done — confirm.
env = gym.make('Taxi-v3').env

In [4]:
# Show the environment's repr to confirm which environment object is in use.
print(env)

<TaxiEnv<Taxi-v3>>


In [5]:
# Draw the current environment state as a text grid.
env.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+



In [6]:
# Render again; no step was taken since the previous cell, so the grid shown
# below is identical to the one above.
env.render()

+---------+
|[34;1mR[0m: | : :[35mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+



In [7]:
# Inspect the environment's table entry for state 5. The output below has one
# key per action index (0-5), each mapping to a list of 4-tuples.
# NOTE(review): presumably each tuple is (probability, next_state, reward, done)
# — confirm against the gym Taxi documentation.
env.P[5]

{0: [(1.0, 105, -1, False)],
 1: [(1.0, 5, -1, False)],
 2: [(1.0, 25, -1, False)],
 3: [(1.0, 5, -1, False)],
 4: [(1.0, 5, -10, False)],
 5: [(1.0, 5, -10, False)]}

In [8]:
# Report the sizes of the discrete observation and action spaces
# (500 states and 6 actions in the output below). The agent uses these to size
# its embedding table and its output layer.
print('# states: ', env.observation_space.n)
print('# actions: ', env.action_space.n)

# states:  500
# actions:  6


In [9]:
import tensorflow as tf

class DQNAgent:
    """Deep Q-learning agent for an environment with discrete states and actions.

    The agent owns two identically structured, separately initialised Keras
    networks:

    * ``dqn``  - the online network that is fitted in ``train_dqn`` and used
      to choose actions;
    * ``tdqn`` - the target network that supplies the bootstrapped value of the
      next state; it only changes when ``sync_target_network`` is called.

    Transitions are kept in ``replay_buffer``, a bounded ``deque`` that drops
    its oldest entry once full.

    States are passed to the networks as given; the cells that drive this
    class reshape every state to ``[1, 1]`` before handing it over.
    """

    def __init__(self, gym_env, gamma=0.6, epsilon=0.1, buffer_size=2000):
        """Build the agent and both networks.

        Args:
            gym_env: environment exposing ``observation_space.n``,
                ``action_space.n`` and ``action_space.sample()``.
            gamma: discount factor applied to the target network's best
                next-state value.
            epsilon: probability of taking a random action in ``take_action``.
            buffer_size: maximum number of transitions kept for replay.
        """
        self._gym_env = gym_env
        self._num_states = gym_env.observation_space.n
        self._num_actions = gym_env.action_space.n

        self.replay_buffer = deque(maxlen=buffer_size)

        self.gamma = gamma
        self.epsilon = epsilon

        # Online network. Swap in self._build_mirrored_model() to build it
        # under tf.distribute.MirroredStrategy instead.
        self.dqn = self._build_compiled_model()
        # Target network (starts with its own random weights until the first
        # call to sync_target_network).
        self.tdqn = self._build_compiled_model()

    def sync_target_network(self):
        """Copy the online network's weights into the target network."""
        self.tdqn.set_weights(self.dqn.get_weights())

    def populate_replay_buffer(self, state, action, reward, next_state, done):
        """Append one ``(state, action, reward, next_state, done)`` transition."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def _build_model(self):
        """Return an uncompiled network mapping a state index to per-action values.

        Layers: Embedding(num_states -> 10) -> Reshape((10,)) -> Dense(50, relu)
        -> Dense(50, relu) -> Dense(num_actions, linear).
        """
        model = tf.keras.Sequential()
        model.add(tf.keras.layers.Embedding(self._num_states, 10, input_length=1))
        model.add(tf.keras.layers.Reshape((10,)))
        model.add(tf.keras.layers.Dense(50, activation='relu'))
        model.add(tf.keras.layers.Dense(50, activation='relu'))
        model.add(tf.keras.layers.Dense(self._num_actions, activation='linear'))
        return model

    def _compile_model(self, model):
        """Compile ``model`` in place with MSE loss and the 'adam' optimizer."""
        model.compile(loss='mse', optimizer='adam')

    def _build_compiled_model(self):
        """Build a fresh network and compile it."""
        model = self._build_model()
        self._compile_model(model)
        return model

    def _build_mirrored_model(self):
        """Build and compile a network inside a ``tf.distribute.MirroredStrategy`` scope."""
        mirrored_strategy = tf.distribute.MirroredStrategy()
        with mirrored_strategy.scope():
            model = self._build_model()
            self._compile_model(model)
        return model

    def take_action(self, state):
        """Choose an action with an epsilon-greedy policy.

        With probability ``epsilon`` a random action is sampled from the
        environment's action space; otherwise the greedy action from
        ``infer_action`` is returned.
        """
        if np.random.rand() <= self.epsilon:
            return self._gym_env.action_space.sample()

        return self.infer_action(state)

    def infer_action(self, state):
        """Return the index of the highest value the online network predicts for ``state``."""
        q_values = self.dqn.predict(state)
        return np.argmax(q_values[0])

    def train_dqn(self, batch_size):
        """Fit the online network on ``batch_size`` randomly replayed transitions.

        For each sampled transition the online network's current prediction is
        used as the regression target, with only the entry of the action that
        was actually taken overwritten:

        * terminal transition: the observed reward;
        * otherwise: reward + gamma * max of the target network's prediction
          for the next state.

        Raises:
            ValueError: from ``random.sample`` when the buffer holds fewer
                than ``batch_size`` transitions.
        """
        minibatch = random.sample(self.replay_buffer, batch_size)

        # Every name used below comes from the sampled transition itself.
        # Reading `action` / `done` from the enclosing notebook scope would
        # silently apply the most recent live step to every replayed sample.
        for state, action, reward, next_state, done in minibatch:
            target = self.dqn.predict(state)

            if done:
                target[0][action] = reward
            else:
                next_state_target = self.tdqn.predict(next_state)
                target[0][action] = reward + self.gamma * np.amax(next_state_target)

            self.dqn.fit(state, target, epochs=1, verbose=0)

In [10]:
# Training hyperparameters read by the cells below.
# Number of replayed transitions passed to DQNAgent.train_dqn; the training
# loop also uses it to decide how often training is triggered.
batch_size = 256
# NOTE(review): not referenced by any cell visible in this notebook — the
# networks are compiled with optimizer='adam' and no explicit learning rate.
learning_rate = 0.02
# Number of training episodes (this name is reassigned in the evaluation cell).
num_of_episodes = 100
# Upper bound on environment steps per training episode.
timesteps_per_episode = 1000

In [11]:
# Create the agent; its constructor builds and compiles both the online
# network (dqn) and the target network (tdqn).
dqn_agent = DQNAgent(env)

In [12]:
# Training
# Runs num_of_episodes episodes of at most timesteps_per_episode steps each.
# Every transition is stored in the agent's replay buffer, and the online
# network is trained on a random minibatch once per batch_size stored
# transitions.

# Running count of transitions stored by this cell. Training is scheduled on
# this counter rather than on len(replay_buffer): the buffer is a bounded
# deque, so its length stops changing once it is full, and a
# `len(...) % batch_size == 0` test then never fires again unless maxlen
# happens to be a multiple of batch_size.
total_steps = 0

for epi in range(0, num_of_episodes):
    state = env.reset()
    # The agent's networks take the state as a [1, 1] array.
    state = np.reshape(state, [1, 1])

    reward = 0
    done = False
    # One progress-bar tick per 10 environment steps.
    bar = progressbar.ProgressBar(maxval=timesteps_per_episode/10, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()

    for timestep in range(timesteps_per_episode):
        action = dqn_agent.take_action(state)

        next_state, reward, done, info = env.step(action)
        next_state = np.reshape(next_state, [1, 1])
        dqn_agent.populate_replay_buffer(state, action, reward, next_state, done)
        total_steps += 1

        state = next_state

        if done:
            # NOTE(review): the target network is refreshed only when an
            # episode terminates; if episodes keep hitting the step cap
            # instead, it is never synced. Consider a periodic sync as well.
            print('Done and syncing target network')
            dqn_agent.sync_target_network()
            break

        if total_steps % batch_size == 0:
            print('Training dqn')
            dqn_agent.train_dqn(batch_size)

        if timestep%10 == 0:
            bar.update(timestep/10 +1)

    bar.finish()

    # Show the environment every 10th episode as a coarse visual check.
    if (epi +1) % 10 == 0:
        print('----------------')
        print('Episode: {}'.format(epi+1))
        env.render()
        print('----------------')
        
    



Training dqn




Training dqn




Training dqn


[==                                                                      ]   3%

Training dqn




Training dqn




Training dqn




Training dqn


[                                                                        ]   1%

----------------
Episode: 10
+---------+
|[35m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
----------------


[                                                                        ]   1%

----------------
Episode: 20
+---------+
|R: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)
----------------


[                                                                        ]   1%

----------------
Episode: 30
+---------+
|[35m[43mR[0m[0m: | : :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
----------------


[                                                                        ]   1%

----------------
Episode: 40
+---------+
|R: | : :[35mG[0m|
| :[43m [0m| : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (South)
----------------


[                                                                        ]   1%

----------------
Episode: 50
+---------+
|R: | :[43m [0m:[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)
----------------


[                                                                        ]   1%

----------------
Episode: 60
+---------+
|[35mR[0m: |[43m [0m: :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
----------------


[                                                                        ]   1%

----------------
Episode: 70
+---------+
|R:[42m_[0m| : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
----------------


[                                                                        ]   1%

----------------
Episode: 80
+---------+
|[34;1mR[0m: | : :[35m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
----------------


[                                                                        ]   1%

----------------
Episode: 90
+---------+
|[35mR[0m: | : :[43mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+
  (North)
----------------




----------------
Episode: 100
+---------+
|R:[43m [0m| : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+
  (North)
----------------





In [None]:
"""Evaluate agent's performance after Q-learning"""

# Runs the greedy policy (no exploration) for num_of_episodes episodes,
# recording every rendered frame and counting steps and -10 penalties.

total_epochs, total_penalties = 0, 0
num_of_episodes = 100
frames = []

# Hard cap on steps per evaluation episode. The greedy policy is
# deterministic, so an agent that has not learned to finish the task would
# otherwise keep `while not done` spinning forever.
# NOTE(review): 200 is a judgement call (believed to match the step limit
# Taxi-v3 is normally registered with) — adjust as needed.
max_eval_steps = 200
# Episodes that were cut off by the cap instead of finishing.
timed_out_episodes = 0

from tqdm import tqdm

for epi in tqdm(range(num_of_episodes)):
    state = env.reset()
    state = np.reshape(state, [1, 1])
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done and epochs < max_eval_steps:
        action = dqn_agent.infer_action(state)
        state, reward, done, info = env.step(action)
        state = np.reshape(state, [1, 1])
        # 'state' here is the state reached after taking 'action'.
        frames.append({
            'frame':env.render(mode='ansi'),
            'state':state,
            'action':action,
            'reward':reward,
        })
        if reward == -10:
            penalties += 1

        epochs += 1

    if not done:
        timed_out_episodes += 1

    total_penalties += penalties
    total_epochs += epochs

print('Results after {} num_of_episodes:'.format(num_of_episodes))
print('\tAverage timesteps per episode: {}'.format(total_epochs / num_of_episodes))
print('\tAverage penalties per episode: {}'.format(total_penalties / num_of_episodes))
print('\tEpisodes stopped at the {}-step cap: {}'.format(max_eval_steps, timed_out_episodes))

  0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# Total number of frames recorded by the evaluation cell (one per environment step).
len(frames)

In [None]:
from IPython.display import clear_output
from time import sleep

def print_frames(frames):
    """Play back recorded frames one after another as an in-place animation.

    Each frame is shown via ``print_a_frame`` together with its zero-based
    position in ``frames``; the cell output is cleared before every frame and
    playback pauses 0.3 seconds after each one.
    """
    position = 0
    for snapshot in frames:
        clear_output(wait=True)
        print_a_frame(snapshot, position)
        sleep(.3)
        position += 1
        
def print_a_frame(frame, idx=None):
    """Clear the cell output and print a single recorded frame.

    Args:
        frame: dict with the keys 'frame' (rendered text), 'state', 'action'
            and 'reward'.
        idx: optional zero-based position of the frame; when given, it is
            printed as a one-based "Timestep".
    """
    clear_output(wait=True)
    print(frame['frame'])
    # Compare against None explicitly: a plain truthiness test would treat the
    # first frame (idx == 0) as "no index" and drop its timestep line.
    if idx is not None:
        print('Timestep: {}'.format(idx+1))
    print('State: {}'.format(frame['state']))
    print('Action: {}'.format(frame['action']))
    print('Reward: {}'.format(frame['reward']))

In [None]:
# Replay every frame recorded during evaluation as an in-place animation.
print_frames(frames)