### Deep Q Learning

### Import Libraries

In [2]:
# Configure IPython to display the value of every top-level expression in a
# cell, rather than only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Refresh the apt package index, install the native libraries and tools listed
# below, then install the vizdoom Python package with pip.
!sudo apt-get update
!sudo apt-get install cmake libboost-all-dev libsdl2-dev libfreetype6-dev libgl1-mesa-dev libglu1-mesa-dev libpng-dev libjpeg-dev libbz2-dev libfluidsynth-dev libgme-dev libopenal-dev zlib1g-dev timidity tar nasm wget
!pip install vizdoom

In [4]:
import tensorflow as tf      # Deep Learning library
import numpy as np           # Handle matrices
from vizdoom import *        # Doom Environment

import random                # Handling random number generation
import time                  # Handling time calculation
from skimage import transform# Help us to preprocess the frames

from collections import deque# Ordered collection with ends
import matplotlib.pyplot as plt # Display graphs
import IPython.core.debugger as db

import warnings # Silence all warning messages, which are normally printed during training because of skimage
warnings.filterwarnings('ignore') 

In [5]:
# Clone the course repository and copy the Doom scenario files into the
# working directory. basic.cfg and basic.wad are loaded by relative path in
# the cells that create a DoomGame.
!git clone https://github.com/simoninithomas/Deep_reinforcement_learning_Course.git

!cp "Deep_reinforcement_learning_Course/Deep Q Learning/Doom/basic.cfg" .
!cp "Deep_reinforcement_learning_Course/Deep Q Learning/Doom/basic.wad" .
!cp "Deep_reinforcement_learning_Course/Deep Q Learning/Doom/_vizdoom.ini" .

Cloning into 'Deep_reinforcement_learning_Course'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 891 (delta 9), reused 0 (delta 0), pack-reused 872[K
Receiving objects: 100% (891/891), 230.11 MiB | 38.91 MiB/s, done.
Resolving deltas: 100% (394/394), done.
Checking out files: 100% (216/216), done.


In [72]:
def test_environment(episodes=2):
    """Smoke-test the Doom environment by playing random actions.

    Creates a headless DoomGame from basic.cfg / basic.wad, plays `episodes`
    episodes picking uniformly among the shoot / left / right actions, and
    prints each episode's total reward and number of steps.

    Args:
        episodes: Number of episodes to play (default 2).
    """
    game = DoomGame()
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.set_window_visible(False)
    game.init()

    # One-hot button vectors, one per available action.
    shoot = [0, 0, 1]
    left = [1, 0, 0]
    right = [0, 1, 0]
    actions = [shoot, left, right]

    try:
        for _ in range(episodes):
            game.new_episode()
            step_i = 0
            while not game.is_episode_finished():
                game.make_action(random.choice(actions))
                time.sleep(0.02)
                step_i += 1
            print ("Result:", game.get_total_reward(), step_i)
            time.sleep(2)
    finally:
        # Shut the engine down even when an episode raised, so it is not leaked.
        game.close()

test_environment()

Result: -250.0 281
Result: -385.0 300


In [None]:
import matplotlib.pyplot as plt

def show_environment(preprocess=None, num_frames=10):
    """Plot raw (and optionally preprocessed) frames from a random rollout.

    Creates a headless DoomGame from basic.cfg / basic.wad, starts one episode
    and, for up to `num_frames` steps, prints the shape of the current screen
    buffer, plots it, and then takes a random action.

    Args:
        preprocess: Optional callable taking a raw frame and returning the
            preprocessed frame to plot next to it, e.g.
            `StackFrame(4, 84, 84).preprocess` once that class is defined.
            When None (default) only the raw frames are shown, so this cell
            runs on a fresh kernel.
        num_frames: Maximum number of frames to display (default 10).
    """
    game = DoomGame()
    game.load_config("basic.cfg")
    game.set_doom_scenario_path("basic.wad")
    game.set_window_visible(False)
    game.init()

    # One-hot button vectors, one per available action.
    shoot = [0, 0, 1]
    left = [1, 0, 0]
    right = [0, 1, 0]
    actions = [shoot, left, right]

    try:
        game.new_episode()
        for _ in range(num_frames):
            # Stop early: there is no state to read once the episode is over.
            if game.is_episode_finished():
                break

            img = game.get_state().screen_buffer
            print(img.shape)
            plt.imshow(img)
            plt.show()

            if preprocess is not None:
                pimg = preprocess(img)
                print(pimg.shape)
                plt.imshow(pimg)
                plt.show()

            game.make_action(random.choice(actions))
            time.sleep(0.02)
        time.sleep(2)
    finally:
        # Shut the engine down even when plotting or preprocessing raised.
        game.close()

show_environment()

### Stacked Frame

In [32]:
from skimage.color import rgb2gray

class StackFrame():
  """Preprocesses game frames and keeps a rolling stack of the most recent ones.

  The stack holds `stack_sz` frames of shape (frame_w, frame_h). Each stored
  frame is the element-wise maximum of a preprocessed frame and the
  preprocessed frame that came immediately before it.
  """

  def __init__(self, stack_sz, frame_w, frame_h):
    """
    Args:
      stack_sz: Number of frames kept in the stack.
      frame_w, frame_h: Size of each preprocessed frame, used as
        (rows, columns) of the stored arrays.
    """
    self.stack_sz = stack_sz
    self.frame_w, self.frame_h = frame_w, frame_h

    self.init_stack()

  def init_stack(self):
    """Reset the stack to all-zero frames and forget the previous raw frame."""
    # `int` replaces the removed `np.int` alias; the dtype is unchanged.
    blank_frames = [np.zeros((self.frame_w, self.frame_h), dtype=int) for _ in range(self.stack_sz)]
    self.stacked_frames = deque(blank_frames, maxlen=self.stack_sz)

    # Last preprocessed frame seen, kept apart from the stored maxima so that
    # each maximum spans two frames only.
    self.prev_frame = None

  def preprocess(self, frame):
    """Greyscale (if needed), crop, scale by 1/255 and resize one raw frame.

    Args:
      frame: Raw screen buffer, either 2-D (already greyscale) or 3-D.

    Returns:
      Array of shape (frame_w, frame_h).
    """
    # Convert only when there is a channel axis: recent scikit-image versions
    # reject 2-D input to rgb2gray, and a 2-D frame is already greyscale.
    # NOTE(review): for 3-D uint8 input rgb2gray may already return values in
    # [0, 1], in which case the division below scales twice -- confirm against
    # the screen format in basic.cfg.
    grey_frame = rgb2gray(frame) if np.ndim(frame) == 3 else frame

    # Crop the screen (remove the roof because it contains no information)
    cropped_frame = grey_frame[30:-10,30:-30]

    # Normalize pixel values
    normalized_frame = cropped_frame/255.0

    # Resize
    preprocessed_frame = transform.resize(normalized_frame, [self.frame_w, self.frame_h])

    return preprocessed_frame

  #-----------------------------------------------------------------
  # We feed the last `stack_sz` frames to the Q Network, by stacking them with
  # shape (frame_w, frame_h, stack_sz). The previous frames allow the network
  # to reason about the motion of objects.
  #
  # Following Deepmind's approach for screens where every other frame may not
  # be rendered, each stored frame is the element-wise maximum of the last two
  # preprocessed frames.
  #-----------------------------------------------------------------
  def combine_prev(self, preprocessed_frame, is_new_episode):
    """Push a preprocessed frame onto the stack and return the stacked state.

    Args:
      preprocessed_frame: Array of shape (frame_w, frame_h).
      is_new_episode: When True the stack is cleared and filled with copies
        of this frame.

    Returns:
      Array of shape (frame_w, frame_h, stack_sz), oldest frame first.
    """
    if is_new_episode:
      # Clear our stacked_frames
      self.init_stack()

      # There is no previous frame yet, so the frame itself fills every slot.
      maxframe = np.array(preprocessed_frame)
      for _ in range(self.stack_sz):
        self.stacked_frames.append(maxframe)

    else:
      # Compare with the previous *raw* frame. Comparing with the previously
      # stored maximum would accumulate a maximum over the whole episode.
      previous = preprocessed_frame if self.prev_frame is None else self.prev_frame
      maxframe = np.maximum(previous, preprocessed_frame)

      # Append frame to deque, automatically removes the oldest frame
      self.stacked_frames.append(maxframe)

    self.prev_frame = preprocessed_frame

    # Stack the frames along a new last axis
    return np.stack(self.stacked_frames, axis=2)

  def empty_stack(self):
    """Return an all-zero stacked state of shape (frame_w, frame_h, stack_sz)."""
    return np.zeros((self.frame_w, self.frame_h, self.stack_sz))

In [63]:
class Environment():
  """Owns a headless DoomGame built from basic.cfg / basic.wad and its actions.

  Attributes:
    game: The initialised DoomGame instance.
    actions: List of one-hot button vectors, in the order left, right, shoot.
  """

  def __init__(self):
    self.game, self.actions = self._create()

  def _create(self):
    """Build and initialise the game.

    Returns:
      Tuple (game, possible_actions).
    """
    game = DoomGame()

    # Load the correct configuration
    game.load_config("basic.cfg")

    # Load the correct scenario (in our case basic scenario)
    game.set_doom_scenario_path("basic.wad")

    game.set_window_visible(False)

    game.init()

    # Here our possible actions
    left = [1, 0, 0]
    right = [0, 1, 0]
    shoot = [0, 0, 1]
    possible_actions = [left, right, shoot]

    return game, possible_actions

  def OLD_bootstrap(self, sf, exp_replay, max_elements=3):
    """Fill the replay memory with transitions from uniformly random actions.

    Plays episodes until `max_elements` transitions have been added.

    Args:
      sf: Frame stacker providing preprocess(), combine_prev() and
        empty_stack().
      exp_replay: Replay memory; receives
        add(state, action_idx, reward, next_state, done).
      max_elements: Number of transitions to collect (default 3).
    """
    game = self.game
    num_elements = 0
    while num_elements < max_elements:
      game.new_episode()

      frame = game.get_state().screen_buffer
      stacked_state = sf.combine_prev(sf.preprocess(frame), is_new_episode=True)

      done = False
      while num_elements < max_elements and not done:
        # Store the action's index, not the one-hot list, so the transitions
        # have the same layout as those recorded by the training loop.
        action_idx = random.randrange(len(self.actions))
        reward = game.make_action(self.actions[action_idx])
        done = game.is_episode_finished()

        if done:
          # No state can be read after the episode ended; use a zero stack.
          next_stacked_state = sf.empty_stack()
        else:
          next_frame = game.get_state().screen_buffer
          next_stacked_state = sf.combine_prev(sf.preprocess(next_frame), is_new_episode=False)

        # Package a sample of (state, action, reward, next_state) and add to Experience Replay
        exp_replay.add(stacked_state, action_idx, reward, next_stacked_state, done)
        num_elements += 1

        stacked_state = next_stacked_state

        time.sleep(0.02)
      time.sleep(2)

### Experience Replay Memory

In [44]:
class Replay():
  """Fixed-capacity experience replay memory.

  Stores transitions (state, action, reward, next_state, done) in a deque
  that discards the oldest entry once `mem_sz` entries are held.
  """

  def __init__(self, mem_sz, batch_sz):
    """
    Args:
      mem_sz: Maximum number of transitions kept.
      batch_sz: Number of transitions returned by sample().
    """
    self.mem_sz = mem_sz
    self.batch_sz = batch_sz
    self.mem = self._create()

  def _create(self):
    """Return the empty bounded deque backing the memory."""
    return deque(maxlen=self.mem_sz)

  def add(self, stacked_state, action, reward, next_stacked_state, done):
    """Append one transition, evicting the oldest one when full."""
    self.mem.append((stacked_state, action, reward, next_stacked_state, done))

  def sample(self):
    """Draw `batch_sz` distinct transitions uniformly at random.

    Returns:
      Object array of shape (batch_sz, 5); column order is
      (state, action, reward, next_state, done).

    Raises:
      ValueError: If fewer than `batch_sz` transitions are stored.
    """
    if len(self.mem) < self.batch_sz:
      raise ValueError(
        "Cannot sample %d transitions from a memory holding %d" % (self.batch_sz, len(self.mem)))

    transitions = random.sample(self.mem, self.batch_sz)

    # Fill an object array cell by cell. np.array() on these ragged tuples
    # (arrays mixed with scalars) raises ValueError on current NumPy versions.
    batch = np.empty((len(transitions), 5), dtype=object)
    for row, transition in enumerate(transitions):
      for col, item in enumerate(transition):
        batch[row, col] = item
    return batch

### Deep Q Network

In [79]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, BatchNormalization, Flatten, Dense, Input
from tensorflow.keras.optimizers import Adam

class E_Greedy():
  """Epsilon-greedy action selection with a linearly decaying epsilon."""

  def __init__(self, epsilon_start, epsilon_end, actions, qn):
    """
    Args:
      epsilon_start: Exploration probability used at episode 0.
      epsilon_end: Value epsilon decays towards.
      actions: List of available actions.
      qn: Model whose predict() returns the Q-values for a batch of states.
    """
    self.epsilon_start = epsilon_start
    self.epsilon = epsilon_start
    self.epsilon_end = epsilon_end
    self.actions = actions
    self.num_actions = len(actions)
    self.qn = qn

  def decay(self, i_episode, num_episodes):
    """Set epsilon for episode `i_episode` out of `num_episodes`.

    Epsilon is interpolated linearly between epsilon_start (at episode 0) and
    epsilon_end, weighted by the fraction of episodes still remaining.
    """
    remaining_fraction = (num_episodes - i_episode) / num_episodes
    epsilon_range = self.epsilon_start - self.epsilon_end
    self.epsilon = self.epsilon_end + epsilon_range * remaining_fraction

  def pick_action(self, stacked_state):
    """Choose an action for `stacked_state`.

    With probability epsilon a random action is chosen; otherwise the action
    with the highest predicted Q-value.

    Returns:
      Tuple (action_idx, action).
    """
    explore = random.uniform(0,1) < self.epsilon
    if not explore:
      # Add a leading batch axis of size one before asking the network.
      single_state_batch = stacked_state[None, ...]
      chosen_idx = np.argmax(self.qn.predict(single_state_batch))
    else:
      chosen_idx = random.randint(0, self.num_actions - 1)
    return (chosen_idx, self.actions[chosen_idx])

class DQN():
  """Deep Q-Network with an online network and a separate target network.

  Attributes:
    q_network: Model that is trained and used to act.
    target_network: Model used to evaluate next states; updated only through
      copy_weights().
  """

  def __init__(self, lr, gamma, num_actions=3, input_shape=(84, 84, 4)):
    """
    Args:
      lr: Learning rate for the Adam optimizer.
      gamma: Discount factor applied to the next state's best Q-value.
      num_actions: Number of Q-values the network outputs (default 3).
      input_shape: Shape of one stacked state (default (84, 84, 4)).
    """
    self.lr = lr
    self.gamma = gamma
    self.num_actions = num_actions
    self.input_shape = input_shape

    self.q_network = self.create_qn()
    self.target_network = self.create_qn()
    self.copy_weights()

  def create_qn(self):
    """Build and compile one convolutional Q-network."""
    model = Sequential([
      Input(shape=self.input_shape),
      Conv2D(filters=32, kernel_size=(8, 8), strides=(4,4), activation='relu', padding='valid', kernel_initializer='glorot_normal'),
      BatchNormalization(),
      Conv2D(filters=64, kernel_size=(4, 4), strides=(2,2), activation='relu', padding='valid', kernel_initializer='glorot_normal'),
      BatchNormalization(),
      Conv2D(filters=128, kernel_size=(4, 4), strides=(2,2), activation='relu', padding='valid', kernel_initializer='glorot_normal'),
      BatchNormalization(),
      Flatten(),
      Dense(512, activation='relu', kernel_initializer='he_uniform'),
      # Linear output: Q-values may be negative, which ReLU cannot represent.
      Dense(self.num_actions, activation='linear', kernel_initializer='he_uniform')
    ])
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(loss='mse', optimizer=Adam(learning_rate=self.lr))
    return model

  def train(self, batch):
    """Run one gradient step on a batch of transitions.

    Args:
      batch: Object array of shape (batch_sz, 5) with columns
        (state, action_idx, reward, next_state, done).
    """
    batch_sz = batch.shape[0]
    cur_stacked_states = np.stack(batch[:, 0].ravel())
    next_stacked_states = np.stack(batch[:, 3].ravel())
    action_idxs = batch[:, 1].astype(int)
    rewards = batch[:, 2].astype(float)
    dones = batch[:, 4].astype(bool)

    # Start from the network's own predictions so that only the Q-value of the
    # action actually taken contributes to the loss.
    targets = np.array(self.q_network.predict(cur_stacked_states, verbose=0))
    next_qvalues = self.target_network.predict(next_stacked_states, verbose=0)

    # Bellman target: r for terminal transitions, r + gamma * max_a' Q_target(s', a')
    # otherwise. It REPLACES the predicted value; it is not added to it.
    bootstrapped = rewards + self.gamma * np.max(next_qvalues, axis=1)
    targets[np.arange(batch_sz), action_idxs] = np.where(dones, rewards, bootstrapped)

    self.q_network.fit(cur_stacked_states, targets, batch_size=batch_sz, epochs=1, verbose=0)

  def copy_weights(self):
    """Overwrite the target network's weights with the online network's."""
    self.target_network.set_weights(self.q_network.get_weights())

### Main Loop

In [81]:
def main():
  """Train the DQN agent on the Doom basic scenario.

  For each episode an epsilon-greedy policy acts in the game, every
  transition is stored in the replay memory, and once `num_pre_populate`
  steps have been taken a random batch is used for one training step per
  environment step. The target network is synchronised every
  `num_copy_steps` steps. Prints the episode index and the running step count
  when each episode completes.
  """
  exp_replay_sz = 200
  batch_sz=3
  frame_w, frame_h = 84, 84
  stack_size = 4 # We stack 4 composite frames in total
  total_steps = 0
  num_copy_steps = 100
  num_pre_populate = 6
  lr, gamma = 0.002, 0.99
  epsilon_start, epsilon_end = 1.0, 0.05
  num_episodes = 2

  sf = StackFrame(stack_size, frame_w, frame_h)
  env = Environment()
  game, actions = env.game, env.actions

  try:
    exp_replay = Replay(mem_sz=exp_replay_sz, batch_sz=batch_sz)
    dq_network = DQN(lr, gamma)
    egp = E_Greedy(epsilon_start, epsilon_end, actions, dq_network.q_network)

    for i_episode in range(num_episodes):
      game.new_episode()
      egp.decay(i_episode, num_episodes)

      frame = game.get_state().screen_buffer
      stacked_state = sf.combine_prev(sf.preprocess(frame), is_new_episode=True)

      done = False
      while not done:
        action_idx, action = egp.pick_action(stacked_state)
        reward = game.make_action(action)
        done = game.is_episode_finished()

        if done:
          # No state can be read after the episode ended; use a zero stack.
          next_stacked_state = sf.empty_stack()
        else:
          next_frame = game.get_state().screen_buffer
          next_stacked_state = sf.combine_prev(sf.preprocess(next_frame), is_new_episode=False)

        # Package a sample of (state, action, reward, next_state) and add to Experience Replay
        exp_replay.add(stacked_state, action_idx, reward, next_stacked_state, done)

        if (total_steps >= num_pre_populate):
          # Then pick a batch from Experience Replay and use it for training
          batch = exp_replay.sample()
          dq_network.train(batch)

        stacked_state = next_stacked_state

        total_steps += 1
        if (total_steps % num_copy_steps == 0):
          dq_network.copy_weights()

        time.sleep(0.02)
      print ('Episode complete ', i_episode, total_steps)
      time.sleep(2)
  finally:
    # Shut the engine down even when training raised, so it is not leaked.
    game.close()

main()

Episode complete  0 252
Episode complete  1 552
