In [None]:
# Imports for the MineRL Navigate DQN training notebook.
import json
import select
import time
import logging
import os

import gym
import snake_gym
import minerl
import random

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import math

from collections import deque
import logging
#logging.basicConfig(level=logging.DEBUG)

In [None]:
# IPython magic: executes `parse_current_mission` inside this kernel before the
# environment is created. What that script produces is not visible from here —
# presumably it writes ./mission_xmls/currentMission.xml; TODO confirm.
%run parse_current_mission

In [None]:
# Relative path of the mission XML that is handed to the environment constructor.
CURRENT_MISSION = "./mission_xmls/currentMission.xml"
# Module-level environment shared by the cells below (main() resets and steps it);
# the mission path is forwarded to gym.make as the `xml` keyword argument.
env = gym.make("MineRLNavigate-v0", xml=CURRENT_MISSION)

In [None]:
# Switch the environment to interactive mode on port 6666 with realtime=True.
# NOTE(review): presumably this lets an external Minecraft client attach to the
# running episode — confirm against the MineRL documentation.
env.make_interactive(port=6666, realtime=True)

# Initial reset; the trailing semicolon keeps the notebook from echoing the
# returned observation. main() resets the environment again per episode.
env.reset();

In [None]:
class dqn_network():
    """Convolutional Q-network over 64x64x4 inputs, built in the default TF1 graph.

    Three VALID-padded ReLU convolutions feed a dense head that emits one
    Q-value per discrete action. The head is linear: Q-values are regressed
    onto `reward + gamma * max Q(next_state)` targets, which are unbounded and
    may be negative, so squashing them through a softmax (as an earlier version
    did) makes those targets unreachable. `predict` is unaffected by this
    because argmax is invariant under softmax.

    Args:
        num_actions: number of discrete actions, i.e. width of the output layer.
        learning_rate: step size handed to the Adam optimizer.
    """

    def __init__(self, num_actions=3, learning_rate=0.001):
        # Input batch of observations, fed by the caller.
        self.state = tf.placeholder(shape=[None,64,64,4], dtype=tf.float32)

        # Layers are created in the same order as before so the automatically
        # assigned variable names stay unchanged.
        self.conv1 = self._conv(self.state, filters=32, kernel_size=[8,8], strides=[4,4])
        self.conv2 = self._conv(self.conv1, filters=64, kernel_size=[4,4], strides=[2,2])
        self.conv3 = self._conv(self.conv2, filters=64, kernel_size=[3,3], strides=[1,1])
        self.flat = tf.layers.flatten(self.conv3)

        # Linear output layer: one unconstrained Q-value per action.
        self.out = tf.layers.dense(self.flat, num_actions)
        # Index of the greedy action for every row of the batch.
        self.predict = tf.argmax(self.out, 1)

        # Actions actually taken, used to pick the matching Q-value per row.
        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.action, num_actions, dtype=tf.float32)
        self.Q = tf.reduce_sum(tf.multiply(self.out, self.actions_onehot), axis=1)

        # Bootstrapped targets computed outside the graph by the training loop.
        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.td_error = tf.square(self.targetQ - self.Q)

        # NOTE(review): the L2 regularizers attached to the conv layers are not
        # part of this loss; only the mean squared TD error is minimized.
        self.loss = tf.reduce_mean(self.td_error)
        self.train_step = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

    @staticmethod
    def _conv(inputs, filters, kernel_size, strides):
        """Build one VALID-padded ReLU conv layer with the shared init/regularizer setup."""
        return tf.layers.conv2d(inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
                                padding='VALID', activation=tf.nn.relu,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(uniform=False),
                                # 10e-5 is 1e-4, kept exactly as originally written.
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=10e-5),
                                bias_regularizer=tf.contrib.layers.l2_regularizer(scale=10e-5))

In [None]:
def converter(observation, compass_angle_scale=180):
    """Turn an environment observation dict into a single float32 network input.

    The image under ``observation['pov']`` is divided by 255 and the value
    under ``observation['compassAngle']`` is divided by ``compass_angle_scale``
    and broadcast into one extra trailing channel.

    Args:
        observation: mapping with keys ``'pov'`` (array whose last axis is the
            channel axis) and ``'compassAngle'``.
            NOTE(review): presumably pov is a uint8 HxWx3 frame and the angle is
            in degrees within [-180, 180] — confirm against the environment.
        compass_angle_scale: divisor applied to the compass angle.

    Returns:
        float32 array shaped like ``pov`` with one additional channel. The
        values equal what TensorFlow would see when the previous float64 result
        was fed to the float32 placeholder; storing float32 halves the memory
        held by the replay buffer.
    """
    pov = observation['pov'] / 255
    compass_scaled = observation['compassAngle'] / compass_angle_scale

    # Constant plane carrying the scaled compass reading at every pixel.
    compass_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=pov.dtype) * compass_scaled
    stacked = np.concatenate([pov, compass_channel], axis=-1)

    return stacked.astype(np.float32)

In [None]:
def _build_env_action(environment, action_index):
    """Translate a discrete action index (0, 1 or 2) into an environment action dict."""
    action = environment.action_space.noop()
    if action_index == 0:
        action['camera'] = [0, -5]
    elif action_index == 1:
        action['camera'] = [0, 5]
    elif action_index == 2:
        action['forward'] = 1

    # The agent always jumps and attacks, whichever action was selected.
    action['jump'] = 1
    action['attack'] = 1
    return action


def _train_on_batch(sess, network, transitions, gamma=0.99):
    """Run one optimizer step on (state, action, reward, next_state, done) tuples."""
    states, actions, rewards, next_states, dones = zip(*transitions)

    q_next = sess.run(network.out, feed_dict={network.state: np.stack(next_states)})
    # Terminal transitions must not bootstrap from the next state.
    not_done = 1.0 - np.asarray(dones, dtype=np.float32)
    target_q = np.asarray(rewards, dtype=np.float32) + gamma * np.max(q_next, axis=1) * not_done

    sess.run(network.train_step, feed_dict={network.state: np.stack(states),
                                            network.targetQ: target_q,
                                            network.action: np.asarray(actions)})


def main(annealing_episodes=100,
         model_path='/home/jimmy/AI/Coursework/WASP/DQFD_Minecraft-master/checkpoints'):
    """Train the DQN agent on the module-level `env` with an epsilon-greedy policy.

    Epsilon is lowered by a fixed amount at the start of every episode, from
    1.0 towards 0.1. Transitions are stored in a bounded replay buffer. Every
    512 environment steps one batch of 512 transitions is sampled and trained
    on, and every 500 steps the trainable variables are checkpointed.

    Args:
        annealing_episodes: number of episodes to run; also the number of
            episodes over which epsilon is annealed.
        model_path: directory that receives the checkpoints (created if missing).
    """
    start_eps = 1.0  # Start with completely random actions
    end_eps = 0.1    # End with less random actions
    eps = start_eps
    step_drop = (start_eps - end_eps) / max(annealing_episodes, 1)

    replay_capacity = 50000
    save_every = 500
    batch_size = 512

    network = dqn_network()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Make sure the checkpoint directory exists before the first save.
    os.makedirs(model_path, exist_ok=True)

    # maxlen drops the oldest transition automatically once the buffer is full.
    replay_buffer = deque(maxlen=replay_capacity)
    total_steps = 0
    reward_list = []

    # The context manager releases the session (and its GPU memory) on exit,
    # including when training is interrupted by an exception.
    with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())

        saver = tf.train.Saver(tf.trainable_variables(scope=None), max_to_keep=5)
        # To resume from an earlier run:
        #   ckpt = tf.train.get_checkpoint_state(model_path)
        #   saver.restore(sess, ckpt.model_checkpoint_path)

        for episode in range(annealing_episodes):
            # Reset environment and get first new observation
            state = converter(env.reset())
            total_reward = 0

            if eps > end_eps:
                eps -= step_drop

            # Step until the environment reports the episode as done.
            while True:
                total_steps += 1

                # Epsilon-greedy: random action with probability eps, else greedy.
                if np.random.rand() < eps:
                    action_index = np.random.randint(0, 3)
                else:
                    action_index = sess.run(network.predict, feed_dict={network.state: [state]})[0]

                obs1, reward, done, _ = env.step(_build_env_action(env, action_index))
                state1 = converter(obs1)

                replay_buffer.append((state, action_index, reward, state1, done))

                if total_steps % save_every == 0:
                    saver.save(sess, model_path + '/model-' + str(total_steps) + '.cptk')

                if total_steps % batch_size == 0 and len(replay_buffer) >= batch_size:
                    _train_on_batch(sess, network, random.sample(replay_buffer, batch_size))

                total_reward += reward
                state = state1

                if done:
                    break

            print("Total reward: " + str(total_reward))
            reward_list.append(total_reward)

            if len(reward_list) % 10 == 0:
                print(episode, np.mean(reward_list[-10:]), eps)

# Start training when this is executed as a script or as a notebook cell
# (in both cases __name__ is "__main__"); importing the module does not train.
if __name__ == "__main__":
    main()