In [1]:
# Simple env test.
import json
import select
import time
import logging
import os

import gym
import snake_gym
import minerl
import random

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import math
from collections import deque

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class dqfd_network():
    """Convolutional Q-network with its training op (TensorFlow 1.x graph mode).

    Constructing an instance adds to the default graph: the input
    placeholders, three convolutional layers over 64x64x3 frames, a dense head
    with one Q-value per discrete action, and an Adam step that minimises the
    mean squared TD error.

    Attributes:
        state: float32 placeholder of shape [None, 64, 64, 3] (input frames).
        conv1, conv2, conv3: outputs of the three convolutional layers.
        flat: flattened ``conv3`` output.
        out: Q-values, shape [None, num_actions].
        predict: index of the largest Q-value for each input row.
        is_expert: int32 placeholder of shape [None]; it is fed by callers but
            nothing in this graph reads it.
        action: int32 placeholder of shape [None] (action taken per row).
        actions_onehot: one-hot encoding of ``action``.
        Q: Q-value of the taken action for each row.
        targetQ: float32 placeholder of shape [None] (TD targets).
        td_error: squared difference between ``targetQ`` and ``Q``.
        loss: mean of ``td_error``.
        train_step: Adam optimiser op minimising ``loss``.
    """

    def __init__(self, num_actions=11, learning_rate=0.01):
        """Build the graph.

        Args:
            num_actions: size of the discrete action set (width of ``out``).
            learning_rate: step size passed to the Adam optimiser.
        """
        self.state = tf.placeholder(shape=[None,64,64,3], dtype=tf.float32)

        # The layers are created in the same order as before (conv, conv,
        # conv, dense) so the automatically generated variable names, and
        # therefore any checkpoints keyed on them, stay the same.
        self.conv1 = self._conv(self.state, filters=32, kernel_size=[8,8], strides=[4,4])
        self.conv2 = self._conv(self.conv1, filters=64, kernel_size=[4,4], strides=[2,2])
        self.conv3 = self._conv(self.conv2, filters=64, kernel_size=[3,3], strides=[1,1])
        self.flat = tf.layers.flatten(self.conv3)

        # Linear head: Q-values are regressed onto TD targets of the form
        # r + gamma * max Q', which are not confined to (0, 1) and do not sum
        # to one, so a softmax here would make the targets unreachable.
        self.out = tf.layers.dense(self.flat, num_actions)
        self.predict = tf.argmax(self.out, 1)

        self.is_expert = tf.placeholder(shape=[None], dtype=tf.int32)

        self.action = tf.placeholder(shape=[None], dtype=tf.int32)
        self.actions_onehot = tf.one_hot(self.action, num_actions, dtype=tf.float32)
        # Select the Q-value of the action that was actually taken.
        self.Q = tf.reduce_sum(tf.multiply(self.out, self.actions_onehot), axis=1)

        self.targetQ = tf.placeholder(shape=[None], dtype=tf.float32)
        self.td_error = tf.square(self.targetQ - self.Q)

        # NOTE(review): the conv layers register L2 regularisation losses, but
        # they are not added to the objective below, so they currently have no
        # effect on training. Confirm whether that is intended.
        self.loss = tf.reduce_mean(self.td_error)
        self.train_step = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

    @staticmethod
    def _conv(inputs, filters, kernel_size, strides):
        """Return a VALID-padded ReLU conv layer with the shared initialiser/regulariser settings."""
        return tf.layers.conv2d(inputs=inputs, filters=filters, kernel_size=kernel_size, strides=strides,
                                padding='VALID', activation=tf.nn.relu,
                                kernel_initializer=tf.contrib.layers.xavier_initializer(uniform=False),
                                # 10e-5 is 1e-4.
                                kernel_regularizer=tf.contrib.layers.l2_regularizer(scale=10e-5),
                                bias_regularizer=tf.contrib.layers.l2_regularizer(scale=10e-5))

In [3]:
def converter(observation, compass_angle_scale=180):
    """Append the scaled compass angle to the frame as an extra channel.

    Args:
        observation: mapping with a ``'pov'`` array (channels on the last
            axis) and a ``'compassAngle'`` value.
        compass_angle_scale: divisor applied to the compass angle before it is
            broadcast into the new channel.

    Returns:
        An array equal to ``pov / 255`` with one additional trailing channel
        whose every entry is ``compassAngle / compass_angle_scale``.
    """
    obs = observation['pov'] / 255
    compass_scaled = observation['compassAngle'] / compass_angle_scale

    # A constant plane with the same leading dimensions as the frame.
    compass_channel = np.ones(shape=list(obs.shape[:-1]) + [1], dtype=obs.dtype) * compass_scaled
    return np.concatenate([obs, compass_channel], axis=-1)

In [4]:
# Camera deltas for discrete actions 0-7. Each delta serves two consecutive
# indices: the even one moves the camera only, the odd one also attacks.
_CAMERA_DELTAS = ([0, -2.5], [0, 2.5], [-2.5, 0], [2.5, 0])


def _demo_action_index(action, i):
    """Map frame ``i`` of a demonstration action batch to a discrete action index.

    Args:
        action: mapping of action name to per-frame arrays; the keys read are
            ``'camera'``, ``'attack'``, ``'forward'`` and ``'jump'``.
        i: frame index within the batch.

    Returns:
        An int in 0..10, or ``None`` when the frame matches none of the
        discrete actions.
    """
    cam_first = action['camera'][i][0]
    cam_second = action['camera'][i][1]

    if (abs(cam_first) + abs(cam_second)) / 2.0 > 2.5:
        # Large camera movement: classify by the dominant component and sign.
        if cam_second < 0 and abs(cam_first) < abs(cam_second):
            base = 0
        elif cam_second > 0 and abs(cam_first) < abs(cam_second):
            base = 2
        elif cam_first < 0 and abs(cam_first) > abs(cam_second):
            base = 4
        elif cam_first > 0 and abs(cam_first) > abs(cam_second):
            base = 6
        else:
            # Both components have the same magnitude: no discrete match.
            return None
        return base if action['attack'][i] == 0 else base + 1

    if action['forward'][i] == 1:
        if action['attack'][i] == 1:
            return 8
        if action['jump'][i] == 1:
            return 9
        # Moving forward without attacking or jumping has no discrete match.
        return None

    return 10


def _build_env_action(env, action_index):
    """Translate a discrete action index (0..10) into an environment action dict."""
    action = env.action_space.noop()
    if action_index < 8:
        # Two-element camera delta; this mirrors the demonstration mapping,
        # where indices 0/1 correspond to a negative second camera component.
        action['camera'] = list(_CAMERA_DELTAS[action_index // 2])
        if action_index % 2 == 1:
            action['attack'] = 1
    elif action_index == 8:
        action['forward'] = 1
        action['attack'] = 1
    elif action_index == 9:
        action['forward'] = 1
        action['jump'] = 1
    else:
        action['attack'] = 1
    return action


def _pretrain_on_demos(sess, network, data, max_batches=100):
    """Run one training step per demonstration sequence, for at most ``max_batches`` sequences.

    Frames that ``_demo_action_index`` cannot map are dropped. A sequence in
    which every frame is dropped is skipped without a training step, but
    still counts toward ``max_batches``.
    """
    sequences = data.sarsd_iter(num_epochs=500, max_sequence_len=2000)
    for demo_step, (state, action, reward, next_state, done) in enumerate(sequences, start=1):
        s_batch = []
        a_batch = []
        d_batch = []
        r_batch = []
        s1_batch = []
        e_batch = []
        for i in range(state['pov'].shape[0]):
            action_index = _demo_action_index(action, i)
            if action_index is None:
                continue

            s_batch.append(state['pov'][i] / 255.0)
            a_batch.append(action_index)
            r_batch.append(reward[i])
            s1_batch.append(next_state['pov'][i] / 255.0)
            d_batch.append(done[i].astype(int))
            e_batch.append(1)

        # An empty batch cannot be fed to the network.
        if s_batch:
            Q1 = sess.run(network.out, feed_dict={network.state:s1_batch})
            # 1 for non-terminal transitions, 0 for terminal ones.
            end_multiplier = 1 - np.array(d_batch)

            # Add a fixed 0.8 bonus to the demonstrated action's entry in each
            # row before taking the row maximum (same arithmetic as before,
            # vectorised).
            Q1[np.arange(len(a_batch)), a_batch] += 0.8
            targetQ = np.array(r_batch) + 0.99 * np.max(Q1, axis=1) * end_multiplier

            print("train network")
            _ = sess.run(network.train_step, feed_dict={network.state:s_batch,
                                                        network.targetQ:targetQ,
                                                        network.action:a_batch,
                                                        network.is_expert:e_batch})

        if demo_step == max_batches:
            break


def _train_online(sess, network, env, num_episodes, e, endE, stepDrop):
    """Run epsilon-greedy episodes in ``env``, training on replay samples every 512 steps.

    Args:
        sess: session holding the initialised network variables.
        network: the ``dqfd_network`` instance to act with and train.
        env: environment providing ``reset``, ``step`` and ``action_space.noop``.
        num_episodes: number of episodes to run.
        e: initial exploration probability.
        endE: lower bound below which ``e`` is no longer decreased.
        stepDrop: amount subtracted from ``e`` at the start of each episode
            while ``e`` is above ``endE``.
    """
    batch_size = 512
    # Bounded replay buffer: the oldest transition is dropped once full.
    episodeBuffer = deque(maxlen=50000)
    total_steps = 0
    rList = []

    for i in range(num_episodes):
        # Reset environment and get first new observation
        obs = env.reset()
        s = obs['pov'] / 255.0

        rAll = 0

        if e > endE:
            e -= stepDrop

        while True:
            total_steps += 1

            # Choose an action greedily (with e chance of a random action).
            if np.random.rand(1) < e:
                action_index = np.random.randint(0,11)
            else:
                action_index = sess.run(network.predict, feed_dict={network.state:[s]})[0]

            obs1, r, d, _ = env.step(_build_env_action(env, action_index))
            s1 = obs1['pov'] / 255.0

            episodeBuffer.append((s,action_index,r,s1,d))

            # total_steps counts every appended transition, so the buffer
            # holds at least batch_size entries whenever this fires.
            if total_steps % batch_size == 0:
                trainBatch = random.sample(episodeBuffer, batch_size)

                s_batch = [t[0] for t in trainBatch]
                a_batch = [t[1] for t in trainBatch]
                r_batch = [t[2] for t in trainBatch]
                s1_batch = [t[3] for t in trainBatch]
                d_batch = np.array([t[4] for t in trainBatch]).astype(int)

                Q1 = sess.run(network.out, feed_dict={network.state:s1_batch})
                end_multiplier = -(d_batch - 1)
                targetQ = r_batch + 0.99 * np.max(Q1, axis=1) * end_multiplier

                print("train network")
                _ = sess.run(network.train_step, feed_dict={network.state:s_batch,
                                                            network.targetQ:targetQ,
                                                            network.action:a_batch})

            rAll += r
            s = s1

            if d == True:
                break

        print("rAll: " + str(rAll))
        rList.append(rAll)

        if len(rList) % 10 == 0:
            print(i, np.mean(rList[-10:]), e)


def main():
    """Pre-train the Q-network on Treechop demonstrations, then train it online.

    Builds the network and a session, runs ``_pretrain_on_demos`` over the
    demonstration data, then runs ``_train_online`` in a freshly created
    ``MineRLTreechop-v0`` environment. The environment and the session are
    closed on exit, including on error or interruption.
    """
    annealing_episodes = 100
    startE = 0.1
    endE = 0.1
    # With startE == endE the exploration rate stays constant.
    stepDrop = (startE - endE) / annealing_episodes

    network = dqfd_network()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    try:
        sess.run(tf.global_variables_initializer())

        # Checkpoint scaffolding; restoring and saving are currently disabled.
        variables = tf.trainable_variables(scope=None)
        saver = tf.train.Saver(variables, max_to_keep=5)

        model_path = '/media/kimbring2/Steam1/MineRL/model/MineRLNavigate-v0'
        ckpt = tf.train.get_checkpoint_state(model_path)
        #saver.restore(sess, ckpt.model_checkpoint_path)

        data_tree = minerl.data.make('MineRLTreechop-v0', data_dir='/media/kimbring2/6224AA7924AA5039/minerl_data')

        # Get expert data
        _pretrain_on_demos(sess, network, data_tree, max_batches=100)

        # Train DQN
        env = gym.make("MineRLTreechop-v0")
        try:
            _train_online(sess, network, env, annealing_episodes, startE, endE, stepDrop)
        finally:
            env.close()
    finally:
        sess.close()


if __name__ == "__main__":
    main()


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.conv2d instead.
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use keras.layers.dense instead.
Instructions for updating:
Use tf.cast instead.
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
train network
tr

Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
Process ForkPoolWorker-9:
Traceback (most recent call last):
Process ForkPoolWorker-4:
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/pytho

KeyboardInterrupt: 