In [None]:
# https://github.com/simoninithomas/Deep_reinforcement_learning_Course/blob/master/Deep%20Q%20Learning/Doom/Deep%20Q%20learning%20with%20Doom.ipynb

# Setup environment

In [None]:
import tensorflow as tf
import numpy as np
from vizdoom import *
from datetime import datetime
import random
import time
from skimage import transform

from collections import deque
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Sanity check: build a tiny (2x3) x (3x2) matrix product pinned to the first
# GPU and evaluate it once, to confirm TensorFlow can place ops on '/gpu:0'.
with tf.device('/gpu:0'):
    a = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3], name='a')
    b = tf.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[3, 2], name='b')
    c = tf.matmul(a, b)

with tf.Session() as sess:
    product = sess.run(c)
    print (product)

In [None]:
# Ask a throwaway session which devices TensorFlow can see; the result is kept
# in `devices` so the next cell can display it.
with tf.Session() as sess:
  devices = sess.list_devices()

In [None]:
# Show the device list collected in the previous cell.
print(devices)

In [None]:
def create_environment(visible=False):
    """Create and initialise a ViZDoom game for the basic scenario.

    Args:
        visible: whether the game window should be shown. Defaults to False
            (headless), which is what the training cells rely on.

    Returns:
        (game, possible_actions): the initialised DoomGame and the list of
        one-hot button vectors the agent can choose from.
    """
    game = DoomGame()
    game.load_config("scenarios/basic.cfg")
    # Visibility must be configured before init().
    game.set_window_visible(visible)

    game.init()

    # One-hot encodings, one entry per button.
    # NOTE(review): the left/right/shoot order is assumed to match the button
    # order declared in basic.cfg - confirm against the config file.
    left = [1, 0, 0]
    right = [0, 1, 0]
    shoot = [0, 0, 1]
    possible_actions = [left, right, shoot]

    return game, possible_actions

In [None]:
def create_visible_environment():
    """Create and initialise a ViZDoom game for the basic scenario.

    Unlike the headless variant, this function never touches the window
    visibility setting, so whatever the config file / ViZDoom default
    specifies applies.

    Returns:
        (game, possible_actions): the initialised DoomGame and a list of three
        one-hot button vectors.
    """
    game = DoomGame()
    game.load_config("scenarios/basic.cfg")
    game.init()

    # One-hot button vectors, in the order: left, right, shoot.
    possible_actions = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]
    return game, possible_actions

In [None]:
def test_environment(game, actions):
    episodes = 10
    for i in range(episodes):
        game.new_episode()
        while not game.is_episode_finished():
            state = game.get_state()
            img = state.screen_buffer
            misc = state.game_variables
            action = random.choice(actions)
            # print(action)
            reward = game.make_action(action)
            # print ("\treward:", reward)
            #time.sleep(0.007)
        print ("Result:", game.get_total_reward())
        #time.sleep(1)
    game.close()

In [None]:
# Headless game instance and action list shared by the cells below.
game, possible_actions = create_environment()

In [None]:
#test_environment(game, possible_actions)

# Util functions

In [None]:
def preprocess_frame(frame):
    """Crop, rescale and resize a raw screen frame.

    Args:
        frame: 2-D array-like image (indexed as [row, column]).
            NOTE(review): assumes a single-channel frame with values in
            0..255 - confirm against the screen format in the config.

    Returns:
        The frame with 30 rows removed from the top, 10 from the bottom and
        30 columns from each side, divided by 255.0 and resized to 84x84.
    """
    # Keep only the central region of the screen.
    region = frame[30:-10, 30:-30]

    # Scale pixel values, then bring the crop to the network's input size.
    return transform.resize(region / 255.0, [84, 84])

In [None]:
# Number of consecutive frames that make up one network input.
stack_size = 4

def init_deque():
    """Return a fresh frame buffer holding `stack_size` all-zero 84x84 frames.

    The deque is bounded to `stack_size`, so appending a new frame
    automatically drops the oldest one.
    """
    # The builtin `int` replaces the `np.int` alias, which was removed in
    # NumPy 1.24; the resulting dtype is identical.
    return deque((np.zeros((84, 84), dtype=int) for _ in range(stack_size)),
                 maxlen=stack_size)

# Module-level buffer threaded through stack_frames() by the cells below.
stacked_frames = init_deque()

def stack_frames(stacked_frames, state, is_new_episode):
    """Preprocess a raw frame and push it onto the frame stack.

    Args:
        stacked_frames: deque of previously preprocessed frames.
        state: raw screen frame, passed to preprocess_frame().
        is_new_episode: when true, the stack is rebuilt from scratch and
            filled with four copies of the new frame; otherwise the frame is
            appended and the deque drops its oldest entry.

    Returns:
        (stacked_state, stacked_frames): the frames stacked along axis 2, and
        the (possibly replaced) deque to pass to the next call.
    """
    frame = preprocess_frame(state)

    if not is_new_episode:
        stacked_frames.append(frame)
    else:
        # No history exists yet, so the first frame stands in for all four slots.
        stacked_frames = init_deque()
        stacked_frames.extend([frame] * 4)

    return np.stack(stacked_frames, axis=2), stacked_frames


# Setup Hyperparams

In [None]:
# --- Model ---------------------------------------------------------------
state_size = [84, 84, 4]                          # 4 stacked 84x84 frames
action_size = game.get_available_buttons_size()   # number of available buttons
learning_rate = 2e-4

# --- Training ------------------------------------------------------------
total_episodes = 500   # episodes to train for
max_steps = 100        # step cap per episode
batch_size = 64        # experiences per gradient step

# --- Epsilon-greedy exploration --------------------------------------------
explore_start = 1.0    # exploration probability at the first step
explore_stop = 0.01    # floor the exploration probability decays towards
decay_rate = 1e-4      # exponential decay rate per decision step

# --- Q-learning ------------------------------------------------------------
gamma = 0.95           # discount factor

# --- Replay memory ---------------------------------------------------------
pretrain_length = batch_size   # experiences stored before training starts
memory_size = 1000000          # maximum number of experiences kept

# Set to False to skip training and only watch the saved agent.
training = True

# Set to True to render the environment.
episode_render = False

# Setup Network

In [None]:
# Scratch check of the placeholder shape: prepend the unknown batch dimension
# (None) to the per-sample dimensions in state_size.
tempArray = [None] + list(state_size)

# Parenthesised single-argument print works on both Python 2 and Python 3.
print([None, state_size])   # nested form - not a valid placeholder shape
print(tempArray)            # flat form - what the network placeholder needs


In [None]:
class DQNetwork:
    """Convolutional Q-network built as a TensorFlow 1.x graph.

    Architecture: three conv -> batch-norm -> ELU stages, a 512-unit ELU dense
    layer, and a linear output layer with one Q-value per action.

    Args:
        state_size: per-sample input dimensions, e.g. [84, 84, 4].
        action_size: number of discrete actions (width of the output layer and
            of the one-hot action placeholder).
        learning_rate: learning rate passed to the RMSProp optimizer.
        name: variable scope under which all variables are created.

    Attributes of interest to callers:
        inputs_, actions_, target_Q: placeholders to feed.
        output: Q-values, shape [batch, action_size].
        Q: Q-value of the action selected by `actions_`, shape [batch].
        loss: mean squared error between `target_Q` and `Q`.
        optimizer: training op minimising `loss`.
    """

    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate

        with tf.variable_scope(name):
            # Placeholders. The leading None is the batch dimension, so the
            # input shape is e.g. [None, 84, 84, 4].
            self.inputs_ = tf.placeholder(tf.float32, [None] + list(state_size), name="inputs")

            # One-hot encoded action taken in each sample.
            self.actions_ = tf.placeholder(tf.float32, [None, action_size], name="actions_")

            # target_Q is R(s,a) + gamma * max_a' Qhat(s', a')
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")

            # Spatial sizes in the comments below assume an 84x84x4 input.

            # First stage: 8x8 conv, stride 4 --> [20,20,32]
            self.conv1, self.conv1_batchnorm, self.conv1_out = self._conv_block(
                self.inputs_, filters=32, kernel_size=[8,8], strides=[4,4], index=1)

            # Second stage: 4x4 conv, stride 2 --> [9,9,64]
            self.conv2, self.conv2_batchnorm, self.conv2_out = self._conv_block(
                self.conv1_out, filters=64, kernel_size=[4,4], strides=[2,2], index=2)

            # Third stage: 4x4 conv, stride 2 --> [3,3,128]
            self.conv3, self.conv3_batchnorm, self.conv3_out = self._conv_block(
                self.conv2_out, filters=128, kernel_size=[4,4], strides=[2,2], index=3)

            self.flatten = tf.layers.flatten(self.conv3_out)
            ## --> [1152]

            self.fc = tf.layers.dense(inputs = self.flatten,
                                     units = 512,
                                     activation = tf.nn.elu,
                                     kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                     name="fc1")

            # Linear output: one Q-value per action. The layer is left unnamed
            # on purpose so its variable names do not change.
            self.output = tf.layers.dense(inputs = self.fc,
                                         kernel_initializer = tf.contrib.layers.xavier_initializer(),
                                         units = action_size,
                                         activation = None)

            # Q is our predicted Q value for the action actually taken: the
            # one-hot mask zeroes out every other action before the sum.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_), axis=1)

            # The loss is the mean squared difference between the target and
            # the predicted Q-value: mean((Qtarget - Q)^2)
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))

            self.optimizer = tf.train.RMSPropOptimizer(self.learning_rate).minimize(self.loss)

    def _conv_block(self, inputs, filters, kernel_size, strides, index):
        """Build one conv -> batch-norm -> ELU stage.

        Layers are named "conv<index>", "batch_norm<index>" and
        "conv<index>_out".

        Returns:
            (conv, batchnorm, out): the tensors produced by each step.
        """
        conv = tf.layers.conv2d(inputs = inputs,
                                filters = filters,
                                kernel_size = kernel_size,
                                strides = strides,
                                padding = "VALID",
                                kernel_initializer = tf.contrib.layers.xavier_initializer_conv2d(),
                                name = "conv%d" % index)

        # NOTE(review): training=True is hard-coded, so batch statistics are
        # used on every forward pass, including single-state action selection.
        # Consider a training switch plus running the UPDATE_OPS collection
        # with the optimizer if inference-mode statistics are wanted.
        batchnorm = tf.layers.batch_normalization(inputs = conv,
                                                  training = True,
                                                  epsilon = 1e-5,
                                                  name = "batch_norm%d" % index)

        out = tf.nn.elu(batchnorm, name="conv%d_out" % index)
        return conv, batchnorm, out

In [None]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork.
# The instance is bound to the same name as the class because the cells below
# refer to `DQNetwork.output`, `DQNetwork.loss`, etc. To keep this cell
# re-runnable, recover the class from the existing instance when the name has
# already been rebound (a class is callable, an instance of it is not).
network_class = DQNetwork if callable(DQNetwork) else DQNetwork.__class__
DQNetwork = network_class(state_size, action_size, learning_rate)

# Setup experience replay

In [None]:
class Memory():
    """Bounded experience-replay buffer backed by a deque."""

    def __init__(self, max_size):
        # Once max_size items are stored, each append evicts the oldest one.
        self.buffer = deque(maxlen = max_size)

    def add(self, experience):
        """Store one experience at the end of the buffer."""
        self.buffer.append(experience)

    def sample(self, batch_size):
        """Return `batch_size` distinct stored experiences, chosen at random.

        Sampling is without replacement, so NumPy raises ValueError when
        `batch_size` exceeds the number of stored experiences.
        """
        positions = np.arange(len(self.buffer))
        chosen = np.random.choice(positions, size = batch_size, replace = False)
        return [self.buffer[position] for position in chosen]

Sampling from an empty replay memory would fail, so before training starts we pre-populate it by taking random actions and storing each experience as (state, action, reward, next_state, done).

In [None]:
# Replay memory, pre-filled with `pretrain_length` random-action experiences so
# the first training step already has a full batch to sample from.
memory = Memory(max_size = memory_size)

game.new_episode()

# Initial stacked state for the first episode.
state, stacked_frames = stack_frames(stacked_frames, game.get_state().screen_buffer, True)

for pretrain_step in range(pretrain_length):
    action = random.choice(possible_actions)
    reward = game.make_action(action)
    done = game.is_episode_finished()

    if not done:
        # Regular transition: push the new frame and carry on from it.
        next_state, stacked_frames = stack_frames(stacked_frames, game.get_state().screen_buffer, False)
        memory.add((state, action, reward, next_state, done))
        state = next_state
        continue

    # Terminal transition: there is no next frame, so store an all-zero state...
    next_state = np.zeros(state.shape)
    memory.add((state, action, reward, next_state, done))

    # ...and restart from a fresh episode.
    game.new_episode()
    state, stacked_frames = stack_frames(stacked_frames, game.get_state().screen_buffer, True)

# Set up Tensorboard

To launch TensorBoard, run: `tensorboard --logdir=/tensorboard/dqn/1`

In [None]:
# Setup TensorBoard Writer
# NOTE(review): this is an absolute path at the filesystem root; the process
# needs write access there - confirm this is intended.
writer = tf.summary.FileWriter("/tensorboard/dqn/1")

## Losses
# Register the network's loss as a scalar summary named "Loss".
tf.summary.scalar("Loss", DQNetwork.loss)

# Single op that evaluates every summary registered on the graph so far.
write_op = tf.summary.merge_all()

# Train Agent

In [None]:
def predict_action(explore_start, explore_stop, decay_rate, decay_step, state, actions):
    """Choose an action with an exponentially decaying epsilon-greedy policy.

    Args:
        explore_start: exploration probability at decay_step 0.
        explore_stop: value the exploration probability decays towards.
        decay_rate: exponential decay rate per decision step.
        decay_step: number of decisions taken so far.
        state: current stacked state; only read when exploiting.
        actions: list of candidate actions, indexed consistently with the
            network's output units.

    Returns:
        (action, explore_probability): the chosen element of `actions` and the
        exploration probability that was used for this decision.

    Note:
        The exploitation branch uses the module-level `sess` and `DQNetwork`,
        so it must be called while a session is open.
    """
    # Uniform draw in [0, 1) compared against the current exploration probability.
    exp_exp_tradeoff = np.random.rand()

    explore_probability = explore_stop + (explore_start - explore_stop) * np.exp(-decay_rate * decay_step)

    if explore_probability > exp_exp_tradeoff:
        # Exploration: pick uniformly from the actions passed in by the caller.
        action = random.choice(actions)
    else:
        # Exploitation: add a batch dimension of 1 and take the argmax Q-value.
        # Tuple concatenation works on both Python 2 and Python 3.
        Qs = sess.run(DQNetwork.output,
                      feed_dict = {DQNetwork.inputs_: state.reshape((1,) + state.shape)})
        action = actions[int(np.argmax(Qs))]

    return action, explore_probability

In [None]:
# Saver used to write checkpoints during training and to restore them when
# watching the agent.
saver = tf.train.Saver()

In [None]:
# Display the current value of the `training` flag before running the loop.
training

In [None]:
# Record the start time of the run.
# NOTE: the logs/ and models/ directories are assumed to exist already.
now = datetime.now()
date_time = now.strftime("%m/%d/%Y, %H:%M:%S")

with open("logs/runningTimeDQN.txt", "a") as myfile:
    myfile.write("Starting:\n")
    myfile.write("date and time:" + date_time + "\n")

if training:
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        decay_step = 0
        # Defined up front so the end-of-episode report cannot hit an unbound
        # name if an episode finishes before the first learning step.
        loss = float("nan")
        game.init()

        for episode in range(total_episodes):
            step = 0
            episode_rewards = []
            game.new_episode()
            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)

            while step < max_steps:
                step += 1
                decay_step += 1
                action, explore_probability = predict_action(explore_start,
                                                             explore_stop,
                                                             decay_rate,
                                                             decay_step,
                                                             state,
                                                             possible_actions)
                reward = game.make_action(action)
                episode_rewards.append(reward)

                done = game.is_episode_finished()

                if done:
                    # The episode ended, so there is no real next frame: push
                    # an all-zero frame onto the stack instead.
                    # Builtin `int` replaces `np.int`, removed in NumPy 1.24.
                    next_state = np.zeros((84,84), dtype=int)
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

                    # Set step = max_steps to end the episode after this iteration
                    step = max_steps

                    total_reward = np.sum(episode_rewards)

                    # Parenthesised single-argument print works on Python 2 and 3.
                    print("Episode: %f  Total reward: %f  Training loss: %f  Explore P: %f  " %(episode,total_reward,loss,explore_probability))

                    memory.add((state, action, reward, next_state, done))
                else:
                    next_state = game.get_state().screen_buffer
                    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                    memory.add((state, action, reward, next_state, done))
                    state = next_state

                ### LEARNING PART
                # Obtain random mini-batch from memory
                batch = memory.sample(batch_size)
                states_mb = np.array([each[0] for each in batch], ndmin=3)
                actions_mb = np.array([each[1] for each in batch])
                rewards_mb = np.array([each[2] for each in batch])
                next_states_mb = np.array([each[3] for each in batch], ndmin=3)
                dones_mb = np.array([each[4] for each in batch])

                # Get Q values for next_state
                Qs_next_state = sess.run(DQNetwork.output, feed_dict = {DQNetwork.inputs_: next_states_mb})

                # Q_target = r if the episode ends at s+1,
                # otherwise Q_target = r + gamma * max_a' Q(s', a')
                max_next_Qs = np.max(Qs_next_state, axis=1)
                targets_mb = np.where(dones_mb,
                                      rewards_mb,
                                      rewards_mb + gamma * max_next_Qs)

                loss, _ = sess.run([DQNetwork.loss, DQNetwork.optimizer],
                                      feed_dict={DQNetwork.inputs_: states_mb,
                                                DQNetwork.target_Q: targets_mb,
                                                DQNetwork.actions_: actions_mb})

                # Write TF Summaries (evaluated after the update, on the same batch).
                # Every step of an episode is logged under the same step value.
                summary = sess.run(write_op, feed_dict={DQNetwork.inputs_: states_mb,
                                                       DQNetwork.target_Q: targets_mb,
                                                       DQNetwork.actions_: actions_mb})
                writer.add_summary(summary, episode)
                writer.flush()

            # Save model every 5 episodes and log a timestamp
            if episode % 5 == 0:
                save_path = saver.save(sess, "./models/modelDQN.ckpt")
                print("Model saved.")
                now = datetime.now()
                date_time = now.strftime("%m/%d/%Y, %H:%M:%S")
                with open("logs/runningTimeDQN.txt", "a") as myfile:
                    myfile.write("date and time:" + date_time + "\n")

# Watch Agent

In [None]:
# Replay the saved agent greedily (always the highest-Q action) for 10
# episodes in a visible window, printing the score of each episode.
with tf.Session() as sess:
    game, possible_actions = create_visible_environment()

    try:
        # Load model.
        saver.restore(sess, "./models/modelDQN.ckpt")

        game.init()

        episodes = 10
        for i in range(episodes):
            game.new_episode()

            state = game.get_state().screen_buffer
            state, stacked_frames = stack_frames(stacked_frames, state, True)
            while not game.is_episode_finished():
                # Choose the biggest Q value (= the best action). Tuple
                # concatenation adds the batch dimension on Python 2 and 3.
                Qs = sess.run(DQNetwork.output,
                              feed_dict = {DQNetwork.inputs_: state.reshape((1,) + state.shape)})
                choice = np.argmax(Qs)
                action = possible_actions[int(choice)]

                game.make_action(action)
                # Slow playback down so it can be followed by eye.
                time.sleep(0.1)

                # No state is available once the episode has ended.
                if game.is_episode_finished():
                    break

                next_state = game.get_state().screen_buffer
                next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
                state = next_state
            score = game.get_total_reward()
            print("Score {}".format(score))
    finally:
        # Close the game window even if restoring or playing fails.
        game.close()