In [1]:
import numpy as np
import random
from random import sample, randint, random
import tensorflow.contrib.slim as slim
import itertools as it
from time import time, sleep
import skimage.color
import skimage.transform
import tensorflow as tf
from tqdm import trange
from vizdoom import *

  return f(*args, **kwds)


In [2]:
class ReplayMemory:
    """Fixed-capacity ring buffer of transitions that is sampled as fixed-length traces.

    Each slot stores one transition ``(s1, a, pa, r, s2, d)``: the frame, the
    action taken, the action taken on the previous step, the reward, the next
    frame and the terminal flag. ``get_transition`` returns ``batch_size``
    traces of ``trace_length`` consecutive slots, concatenated along axis 0.
    """

    def __init__(self, memory_cap, batch_size, resolution, trace_length):
        """Pre-allocate storage.

        Args:
            memory_cap: number of transitions the buffer can hold.
            batch_size: number of traces returned by ``get_transition``.
            resolution: sequence whose first three entries give the shape of one frame.
            trace_length: number of consecutive transitions in each trace.
        """
        state_shape = (memory_cap, resolution[0], resolution[1], resolution[2])
        self.s1 = np.zeros(state_shape, dtype=np.float32)
        self.s2 = np.zeros(state_shape, dtype=np.float32)
        self.a = np.zeros(memory_cap, dtype=np.int32)
        self.pa = np.zeros(memory_cap, dtype=np.int32)
        self.r = np.zeros(memory_cap, dtype=np.float32)
        self.d = np.zeros(memory_cap, dtype=np.float32)

        self.memory_cap = memory_cap
        self.batch_size = batch_size
        self.trace_length = trace_length
        self.index = 0  # next slot to be written (the oldest one once the buffer is full)
        self.size = 0   # number of valid slots

    def add_transition(self, s1, a, pa, r, s2, d):
        """Store one transition, overwriting the oldest slot when the buffer is full.

        ``s2`` may be ``None`` for a terminal transition; a zero frame is stored
        in that case.
        """
        slot = self.index
        self.s1[slot] = s1
        self.a[slot] = a
        self.pa[slot] = pa
        self.r[slot] = r
        # Assigning None into a float array writes NaN, which later poisons every
        # value computed from this frame, so terminal successors become zeros.
        self.s2[slot] = 0.0 if s2 is None else s2
        self.d[slot] = d

        self.index = (slot + 1) % self.memory_cap
        self.size = min(self.size + 1, self.memory_cap)

    def get_transition(self):
        """Sample ``batch_size`` traces of ``trace_length`` consecutive transitions.

        A candidate trace is rejected when any transition other than its last
        one is terminal, or when it straddles the write pointer (the point
        where the newest and the oldest stored transitions are adjacent).
        Sampling retries until a trace is accepted, so it does not return if
        the buffer holds no acceptable trace.

        Returns:
            Tuple ``(s1, a, pa, r, s2, d)`` of arrays whose first dimension is
            ``batch_size * trace_length``.

        Raises:
            ValueError: if no more than ``trace_length`` transitions are stored.
        """
        span = self.trace_length
        if self.size <= span:
            raise ValueError(
                "need more than %d stored transitions to sample a trace, have %d"
                % (span, self.size))

        indexes = []
        for _ in range(self.batch_size):
            while True:
                point = np.random.randint(0, self.size - span)
                last = point + span - 1
                # Slots index-1 (newest) and index (oldest) are not consecutive in time.
                if point < self.index <= last:
                    continue
                # Only the final step of a trace may end an episode.
                if np.any(self.d[point:last] > 0):
                    continue
                break
            indexes.extend(range(point, point + span))

        return self.s1[indexes], self.a[indexes], self.pa[indexes], self.r[indexes], self.s2[indexes], self.d[indexes]

In [3]:
class Network:
    """Recurrent Q-network: conv features of the frame plus an embedding of the
    previous action are fed through an LSTM, whose outputs are mapped to Q-values.

    The graph is built in ``__init__``; the remaining methods are thin wrappers
    around ``session.run`` with the appropriate feeds.
    """

    def __init__(self, session, action_count, resolution, lr, batch_size, trace_length, hidden_size, scope):
        """Build the graph.

        Args:
            session: TensorFlow session used by every method of this object.
            action_count: number of discrete actions (size of the Q output).
            resolution: sequence whose first three entries give the frame shape.
            lr: learning rate for the RMSProp optimizer.
            batch_size: number of traces per training batch.
            trace_length: number of steps per trace in a training batch.
            hidden_size: number of LSTM units. The flattened output of the last
                conv layer is reshaped to this width, so it must equal that
                flattened size.
            scope: prefix used for the variable scopes of the layers.
        """
        self.session = session
        self.resolution = resolution
        self.train_batch_size = batch_size
        self.trace_length_size = trace_length

        self.state = tf.placeholder(tf.float32, shape=[None, resolution[0], resolution[1], resolution[2]])

        conv1 = slim.conv2d(inputs=self.state, num_outputs=32, kernel_size=[8, 8], stride=[4, 4],
                            activation_fn=tf.nn.relu, padding='VALID', scope=scope+'_c1')

        conv2 = slim.conv2d(inputs=conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2],
                            activation_fn=tf.nn.relu, padding='VALID', scope=scope+'_c2')

        conv3 = slim.conv2d(inputs=conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1],
                            activation_fn=tf.nn.relu, padding='VALID', scope=scope+'_c3')

        flat_obs = slim.flatten(conv3)

        #####################################
        # Embedding of the previous action: one-hot -> linear layer with 512 outputs.
        self.prev_action = tf.placeholder(shape=[None], dtype=tf.int32)
        self.prev_action_onehot = tf.one_hot(self.prev_action, action_count, dtype=tf.float32)
        fc_action = slim.fully_connected(self.prev_action_onehot, 512, activation_fn=None, scope=scope)
        flat_act = slim.flatten(fc_action)
        #####################################

        self.cell = tf.contrib.rnn.BasicLSTMCell(num_units=hidden_size, state_is_tuple=True)
        self.train_length = tf.placeholder(dtype=tf.int32)
        self.batch_size = tf.placeholder(dtype=tf.int32, shape=[])

        # Rows arrive flattened as batch*time; restore [batch, time, features] for the RNN.
        self.fc_reshape_obs = tf.reshape(flat_obs, [self.batch_size, self.train_length, hidden_size])
        #####################################
        # Reshape the action embedding the same way and concatenate it with the
        # observation features along the feature axis.
        self.fc_reshape_act = tf.reshape(flat_act, [self.batch_size, self.train_length, 512])
        self.fc_concat = tf.concat([self.fc_reshape_obs, self.fc_reshape_act], 2)
        print('链接之后：', self.fc_concat)
        #####################################

        self.state_in = self.cell.zero_state(self.batch_size, tf.float32)
        self.rnn, self.rnn_state = tf.nn.dynamic_rnn(inputs=self.fc_concat, cell=self.cell, dtype=tf.float32,
                                                     initial_state=self.state_in, scope=scope+'_rnn')
        # Back to one row per (trace, step) so the Q head sees a 2-D input.
        self.rnn = tf.reshape(self.rnn, shape=[-1, hidden_size])

        self.q = slim.fully_connected(self.rnn, action_count, activation_fn=None)

        self.best_a = tf.argmax(self.q, 1)

        self.target_q = tf.placeholder(shape=[None], dtype=tf.float32)
        self.actions = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.actions, action_count, dtype=tf.float32)
        # Q-value of the action that was actually taken on each row.
        self.q_chosen = tf.reduce_sum(tf.multiply(self.q, self.actions_onehot), axis=1)

        self.loss = tf.losses.mean_squared_error(self.q_chosen, self.target_q)

        self.optimizer = tf.train.RMSPropOptimizer(learning_rate=lr, momentum=0.95, epsilon=0.01)

        self.train_step = self.optimizer.minimize(self.loss)

    def learn(self, state, target_q, state_in, action, prev_action):
        """Run one optimizer step on a training batch and return the loss.

        All per-row inputs must contain ``batch_size * trace_length`` rows
        (the sizes given to the constructor).
        """
        feed_dict = {self.state: state, self.target_q: target_q, self.train_length: self.trace_length_size, self.prev_action: prev_action,
                     self.batch_size: self.train_batch_size, self.state_in: state_in, self.actions: action}
        l, _ = self.session.run([self.loss, self.train_step], feed_dict=feed_dict)
        return l

    def get_q(self, state, state_in, prev_action):
        """Return the Q-values for a training-sized batch (one row per trace step)."""
        return self.session.run(self.q, feed_dict={self.state: state, self.train_length: self.trace_length_size, self.prev_action: prev_action,
                                                   self.batch_size: self.train_batch_size, self.state_in: state_in})

    def get_best_action(self, state, state_in, prev_action):
        """Evaluate a single frame.

        Args:
            state: one frame (no batch dimension).
            state_in: LSTM state to start from.
            prev_action: integer index of the previously taken action.

        Returns:
            ``[best_action_array, new_lstm_state]`` as returned by ``session.run``.
        """
        return self.session.run([self.best_a, self.rnn_state], feed_dict={self.state: [state], self.train_length: 1, self.prev_action: [prev_action],
                                                                          self.batch_size: 1, self.state_in: state_in})

    def get_cell_state(self, state, state_in, prev_action):
        """Advance the LSTM by a single frame and return only the new LSTM state.

        ``prev_action`` is a single integer index; it is wrapped in a list
        because the placeholder is one-dimensional (same convention as
        ``get_best_action``).
        """
        return self.session.run(self.rnn_state, feed_dict={self.state: [state], self.train_length: 1, self.prev_action: [prev_action],
                                                           self.state_in: state_in, self.batch_size: 1})


In [4]:
class Agent:
    """Epsilon-greedy recurrent Q-learning agent with an online and a target network."""

    def __init__(self, memory_cap, batch_size, resolution, action_count, session,
                 lr, gamma, epsilon_min, epsilon_decay_steps, epsilon_max, trace_length, hidden_size):
        """Create both networks and the replay memory.

        Args:
            memory_cap: capacity of the replay memory in transitions.
            batch_size: number of traces per training batch.
            resolution: frame shape passed to the networks and the memory.
            action_count: number of discrete actions.
            session: TensorFlow session shared by both networks.
            lr: learning rate.
            gamma: discount factor.
            epsilon_min: lower bound of the exploration rate.
            epsilon_decay_steps: number of training-mode ``act`` calls over
                which epsilon goes linearly from ``epsilon_max`` to ``epsilon_min``.
            epsilon_max: initial exploration rate.
            trace_length: number of steps per training trace.
            hidden_size: number of LSTM units.
        """
        self.model = Network(session=session, action_count=action_count,
                             resolution=resolution, lr=lr, batch_size=batch_size,
                             trace_length=trace_length, hidden_size=hidden_size, scope='main')
        self.target_model = Network(session=session, action_count=action_count,
                                    resolution=resolution, lr=lr, batch_size=batch_size,
                                    trace_length=trace_length, hidden_size=hidden_size, scope='target')

        self.memory = ReplayMemory(memory_cap=memory_cap, batch_size=batch_size,
                                   resolution=resolution, trace_length=trace_length)

        self.batch_size = batch_size

        self.resolution = resolution
        self.action_count = action_count
        self.gamma = gamma
        self.epsilon_min = epsilon_min
        self.epsilon_decay_steps = epsilon_decay_steps
        self.epsilon_max = epsilon_max
        self.hidden_size = hidden_size
        self.trace_length = trace_length

        self.epsilon = epsilon_max
        self.training_steps = 0

        self.epsilon_decrease = (epsilon_max-epsilon_min)/epsilon_decay_steps

        # Learning only starts once the memory holds more transitions than one batch uses.
        self.min_buffer_size = batch_size*trace_length

        self.state_in = (np.zeros([1, self.hidden_size]), np.zeros([1, self.hidden_size]))

    def add_transition(self, s1, a, pa, r, s2, d):
        """Store one transition in the replay memory."""
        self.memory.add_transition(s1, a, pa, r, s2, d)

    def learn_from_memory(self):
        """Sample a batch of traces and take one training step on the online network.

        Does nothing until the memory holds more than ``min_buffer_size``
        transitions.
        """
        if self.memory.size <= self.min_buffer_size:
            return

        # Every trace is replayed from a zero LSTM state.
        state_in = (np.zeros([self.batch_size, self.hidden_size]), np.zeros([self.batch_size, self.hidden_size]))
        s1, a, pa, r, s2, d = self.memory.get_transition()

        # The action that precedes s2 is the action taken in s1, i.e. `a`
        # (`pa` is the action that precedes s1).
        q_next = np.max(self.target_model.get_q(s2, state_in, a), axis=1)
        # Terminal steps do not bootstrap. Select instead of multiplying by
        # (1 - d) so a non-finite value for a terminal successor cannot leak
        # into the target.
        q_next = np.where(d > 0, 0.0, q_next)
        targets = r + self.gamma * q_next

        self.model.learn(s1, targets, state_in, a, pa)

    def act(self, state, prev_action, train=True):
        """Choose an action for one frame and advance the agent's LSTM state.

        Args:
            state: current frame.
            prev_action: index of the action taken on the previous step.
            train: when True, decay epsilon and act epsilon-greedily; when
                False, always act greedily.

        Returns:
            Index of the chosen action.
        """
        # The network is always evaluated so that the recurrent state also
        # advances on exploratory steps; otherwise the LSTM would skip frames.
        greedy, self.state_in = self.model.get_best_action(state, self.state_in, prev_action)
        if train:
            self.epsilon = self.explore(self.epsilon)
            if random() < self.epsilon:
                return self.random_action()
        return greedy[0]

    def explore(self, epsilon):
        """Return ``epsilon`` decreased by one decay step, floored at ``epsilon_min``."""
        return max(self.epsilon_min, epsilon-self.epsilon_decrease)

    def random_action(self):
        """Return a uniformly random action index in ``[0, action_count - 1]``."""
        return randint(0, self.action_count - 1)

    def reset_cell_state(self):
        """Reset the agent's LSTM state to zeros (used at episode boundaries)."""
        self.state_in = (np.zeros([1, self.hidden_size]), np.zeros([1, self.hidden_size]))

In [5]:
# ---- Environment stepping ----
FRAME_REPEAT = 4 # Number of game tics each chosen action is repeated for
UPDATE_FREQUENCY = 4 # Number of actions taken between two network updates

# ---- Network / optimisation ----
RESOLUTION = (80, 45, 3) # Shape every frame is resized to before it is fed to the network
BATCH_SIZE = 32 # Number of traces per experience-replay batch
LEARNING_RATE = 0.00025 # Learning rate of the optimizer
GAMMA = 0.99 # Discount factor

MEMORY_CAP = 10000 # Number of transitions the replay memory can hold

# ---- Exploration ----
EPSILON_MAX = 0.5 # Initial (maximum) exploration rate
EPSILON_MIN = 0.1 # Final (minimum) exploration rate
EPSILON_DECAY_STEPS = 2e5 # Number of steps over which epsilon decays linearly from max to min

RANDOM_WANDER_STEPS = 500 # Number of purely random steps used to pre-fill the replay memory

# ---- Recurrent part ----
TRACE_LENGTH = 8 # Number of consecutive transitions in each sampled trace
# Number of LSTM units. The network reshapes the flattened output of the third
# convolutional layer to this width, so it has to match that flattened size
# for the chosen RESOLUTION.
HIDDEN_SIZE = 768

# ---- Schedule ----
EPOCHS = 1 # Number of training epochs (200 in the original setup; 1 epoch = STEPS_PER_EPOCH training steps + EPISODES_TO_TEST test episodes)
STEPS_PER_EPOCH = 10000 # Number of actions taken per training epoch
EPISODES_TO_TEST = 10 # Number of test episodes run after each epoch to log performance
EPISODE_TO_WATCH = 10 # Number of episodes played after training is complete

TAU = 0.001 # Soft-update rate: fraction of the online weights blended into the target network at each update

# ---- Persistence switches ----
LOAD_MODEL = False # Restore a saved model instead of initialising fresh weights?
SAVE_MODEL = False # Save the model (and the mean test score) after each epoch?
SKIP_LEARNING = False # Skip training completely and only run the final episodes?

# NOTE(review): these paths are relative to the notebook's working directory — confirm they exist there.
scenario_path = "../../ViZDoom/scenarios/my_way_home.cfg" # Path of the ViZDoom scenario config
model_savefile = "Models/MWH/model" # Path prefix of the saved model
reward_savefile = "Rewards_MWH.txt" # Text file the mean test scores are appended to

In [None]:
##########################################

def initialize_vizdoom():
    """Create a ``DoomGame``, load the scenario at ``scenario_path``, apply the
    notebook's overrides (hidden window, PLAYER mode, RGB24 frames, 400x225
    screen) and start it.

    Returns:
        The initialised game object.
    """
    print("Initializing doom...")
    doom = DoomGame()
    doom.load_config(scenario_path)

    # Overrides applied on top of the scenario config, in this order.
    overrides = (
        (doom.set_window_visible, False),
        (doom.set_mode, Mode.PLAYER),
        (doom.set_screen_format, ScreenFormat.RGB24),
        (doom.set_screen_resolution, ScreenResolution.RES_400X225),
    )
    for apply_setting, value in overrides:
        apply_setting(value)

    doom.init()

    print("Doom initialized.")
    return doom

def preprocess(img):
    """Resize a frame to ``RESOLUTION`` and return it as a float32 array.

    NOTE(review): the value range of the result is whatever
    ``skimage.transform.resize`` produces for the input dtype — confirm it is
    the range the network is expected to see.
    """
    resized = skimage.transform.resize(img, RESOLUTION, mode='constant')
    return resized.astype(np.float32)

def updateTargetGraph(tfVars,tau):
    """Build the soft-update ops that move the target variables towards the online ones.

    ``tfVars`` is treated as two equally long halves: the first half holds the
    online variables, the second half the matching target variables in the
    same order. For each pair the op assigns
    ``tau * online + (1 - tau) * target`` to the target variable.

    Returns:
        List with one assign op per pair.
    """
    half = len(tfVars) // 2
    online_vars = tfVars[:half]
    target_vars = tfVars[half:half * 2]
    return [
        target.assign((online.value() * tau) + ((1 - tau) * target.value()))
        for online, target in zip(online_vars, target_vars)
    ]

def updateTarget(op_holder,sess):
    """Run all target-network update ops.

    The ops are independent of each other (each one writes a different
    variable), so they are executed in a single ``sess.run`` call instead of
    one call per variable; this function is invoked after every training
    update, which makes the per-call overhead add up.

    Args:
        op_holder: iterable of update ops, as built by ``updateTargetGraph``.
        sess: session to run them in.
    """
    ops = list(op_holder)
    if ops:
        sess.run(ops)

def saveScore(score):
    """Append ``score`` as one line to the reward log at ``reward_savefile``.

    The value written is the argument itself rather than a module-level
    variable, so the function works regardless of what the caller named its
    results.
    """
    # `with` closes the file even if the write raises.
    with open(reward_savefile, 'a') as reward_log:
        reward_log.write("%s\n" % (score,))

###########################################

In [None]:
# Start the game and enumerate the action space.
game = initialize_vizdoom()

# Every on/off combination of the available buttons is one discrete action.
n = game.get_available_buttons_size()
actions = [list(a) for a in it.product([0, 1], repeat=n)]
ACTION_COUNT = len(actions)

# Cap this process at a third of the GPU memory.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.33)

SESSION = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

# Building the agent builds both networks (online first, then target) in the default graph.
agent = Agent(memory_cap = MEMORY_CAP, batch_size = BATCH_SIZE, resolution = RESOLUTION, action_count = ACTION_COUNT,
            session = SESSION, lr = LEARNING_RATE, gamma = GAMMA, epsilon_min = EPSILON_MIN, trace_length=TRACE_LENGTH,
            epsilon_decay_steps = EPSILON_DECAY_STEPS, epsilon_max=EPSILON_MAX, hidden_size=HIDDEN_SIZE)

# Created after the agent so that the saver covers the networks' variables.
saver = tf.train.Saver()

# updateTargetGraph pairs the first half of this list with the second half, so
# it relies on the online network's variables coming before the target's.
trainables = tf.trainable_variables()

targetOps = updateTargetGraph(trainables, TAU)

# Either restore a checkpoint or start from freshly initialised variables.
if LOAD_MODEL:
    print("Loading model from: ", model_savefile)
    saver.restore(SESSION, model_savefile)
else:
    init = tf.global_variables_initializer()
    SESSION.run(init)

##########################################

if not SKIP_LEARNING:
    time_start = time()
    print("\nFilling out replay memory")
    # Apply one soft update of the target network before collecting data.
    updateTarget(targetOps, SESSION)

    episode_buffer = []
    agent.reset_cell_state()
    state = preprocess(game.get_state().screen_buffer)
    ######################################
    # No action precedes the first frame of an episode, so the "previous
    # action" is seeded with a random one (same convention as the test loop).
    prev_action = agent.random_action()
    ######################################
    # Phase 1: pre-fill the replay memory with purely random behaviour.
    for _ in trange(RANDOM_WANDER_STEPS, leave=False):
        action = agent.random_action()
        reward = game.make_action(actions[action], FRAME_REPEAT)
        done = game.is_episode_finished()
        if not done:
            state_new = preprocess(game.get_state().screen_buffer)
        else:
            # There is no successor frame once the episode has ended.
            state_new = None

        agent.add_transition(state, action, prev_action, reward, state_new, done)
        state = state_new
        ######################################
        # The action just taken is the previous action of the next step.
        prev_action = action
        ######################################

        if done:
            game.new_episode()
            agent.reset_cell_state()
            state = preprocess(game.get_state().screen_buffer)
            # A new episode has no history: do not carry over the last action.
            prev_action = agent.random_action()

    # Save an extra numbered checkpoint roughly five times over the run.
    # Integer arithmetic avoids a float modulus, and max() avoids a zero period
    # when EPOCHS < 5.
    checkpoint_every = max(1, EPOCHS // 5)

    # Phase 2: alternate training epochs and test episodes.
    for epoch in range(EPOCHS):
        print("\n\nEpoch %d\n-------" % (epoch + 1))

        train_episodes_finished = 0
        train_scores = []

        print("Training...")
        game.new_episode()

        episode_buffer = []
        agent.reset_cell_state()
        state = preprocess(game.get_state().screen_buffer)
        prev_action = agent.random_action()
        for learning_step in trange(STEPS_PER_EPOCH, leave=False):
            action = agent.act(state, prev_action)
            reward = game.make_action(actions[action], FRAME_REPEAT)
            done = game.is_episode_finished()
            if not done:
                state_new = preprocess(game.get_state().screen_buffer)
            else:
                state_new = None

            agent.add_transition(state, action, prev_action, reward, state_new, done)
            state = state_new
            ######################################
            # The action just taken is the previous action of the next step.
            prev_action = action
            ######################################

            if learning_step % UPDATE_FREQUENCY == 0:
                agent.learn_from_memory()
                updateTarget(targetOps, SESSION)

            if done:
                train_scores.append(game.get_total_reward())
                train_episodes_finished += 1
                game.new_episode()
                agent.reset_cell_state()
                state = preprocess(game.get_state().screen_buffer)
                prev_action = agent.random_action()

        print("%d training episodes played." % train_episodes_finished)
        train_scores = np.array(train_scores)

        # min()/max() raise on an empty array, which happens when no episode
        # finished within this epoch.
        if train_scores.size:
            print("Results: mean: %.1f±%.1f," % (train_scores.mean(), train_scores.std()),
                "min: %.1f," % train_scores.min(), "max: %.1f," % train_scores.max())
        else:
            print("Results: no training episode finished in this epoch.")

        print("\nTesting...")

        test_scores = []
        for test_step in trange(EPISODES_TO_TEST, leave=False):
            game.new_episode()
            prev_action = agent.random_action()
            agent.reset_cell_state()
            while not game.is_episode_finished():
                state = preprocess(game.get_state().screen_buffer)
                action = agent.act(state, prev_action, train=False)
                prev_action = action
                game.make_action(actions[action], FRAME_REPEAT)
            test_scores.append(game.get_total_reward())

        test_scores = np.array(test_scores)
        print("Results: mean: %.1f±%.1f," % (test_scores.mean(), test_scores.std()),
              "min: %.1f" % test_scores.min(), "max: %.1f" % test_scores.max())

        if SAVE_MODEL:
            saveScore(test_scores.mean())
            saver.save(SESSION, model_savefile)
            print("Saving the network weigths to:", model_savefile)
            # `!=` compares by value; `is not` on an int literal compares identity.
            if epoch % checkpoint_every == 0 and epoch != 0:
                saver.save(SESSION, model_savefile, global_step=epoch)

        print("Total ellapsed time: %.2f minutes" % ((time() - time_start) / 60.0))

print("TIME TO WATCH!!")
# Reinitialize the game in ASYNC_PLAYER mode for the final episodes.
# NOTE(review): the window is kept hidden here (set_window_visible(False)), so
# nothing is actually shown; pass True to watch the agent if a display is
# available — confirm which behaviour is intended.
game.close()
game.set_window_visible(False)
game.set_mode(Mode.ASYNC_PLAYER)
game.init()
score = []

for _ in trange(EPISODE_TO_WATCH, leave=False):
    game.new_episode()
    # No action precedes the first frame, so start from a random "previous action".
    prev_action = agent.random_action()
    agent.reset_cell_state()
    while not game.is_episode_finished():
        state = preprocess(game.get_state().screen_buffer)
        action = agent.act(state, prev_action, train=False)
        prev_action = action
        # Repeat the action tic by tic (instead of make_action) and stop early
        # if the episode ends in the middle of the repeat.
        game.set_action(actions[action])
        for i in range(FRAME_REPEAT):
            game.advance_action()
            done = game.is_episode_finished()
            if done:
                break

    # Sleep between episodes
    sleep(1.0)
    score.append(game.get_total_reward())
score = np.array(score)
game.close()
print("Results: mean: %.1f±%.1f," % (score.mean(), score.std()),
          "min: %.1f" % score.min(), "max: %.1f" % score.max())

Initializing doom...
Doom initialized.
链接之后： Tensor("concat:0", shape=(?, ?, 1280), dtype=float32)
链接之后： Tensor("concat_2:0", shape=(?, ?, 1280), dtype=float32)


  0%|          | 0/500 [00:00<?, ?it/s]


Filling out replay memory


  0%|          | 0/10000 [00:00<?, ?it/s]         



Epoch 1
-------
Training...


  0%|          | 0/10 [00:00<?, ?it/s]              

19 training episodes played.
Results: mean: -0.1±0.3, min: -0.2, max: 1.0,

Testing...


                                               

Results: mean: -0.2±0.0, min: -0.2 max: -0.2
Total ellapsed time: 21.40 minutes
TIME TO WATCH!!


 90%|█████████ | 9/10 [09:11<01:01, 61.31s/it]