# Complex Imitation Learning in MineRL
This tutorial contains an example of how to build an imitation-learning-based agent for the MineRLTreechop-v0 environment (the `env_name` used in the code below). For more information about that environment, see the [MineRL Docs](http://minerl.io/docs/environments/index.html#minerltreechop-v0).

For more imitation-learning algorithms, such as DAgger implemented in TensorFlow, see this GitHub repository: [Dagger](https://github.com/zsdonghao/Imitation-Learning-Dagger-Torcs).

Parts of this tutorial are based on code by [Arthur Juliani](https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724).

In [1]:
from __future__ import division

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
%matplotlib inline
import matplotlib.pyplot as plt
import math

# Python 2/3 compatibility: make the lazy integer range available as ``xrange``.
try:
    xrange = xrange
except NameError:
    # Python 3 has no ``xrange`` builtin; ``range`` is already lazy there.
    # Only NameError is caught so that unrelated errors (or Ctrl-C) are not hidden.
    xrange = range

# MineRL environment id: selects both the demonstration dataset used for
# training and the gym environment used for evaluation.
env_name = 'MineRLTreechop-v0'
#env_name = 'MineRLObtainIronPickaxe-v0'
# Directory passed to minerl.data.make() as the location of the dataset.
data_path = '/media/kimbring2/6224AA7924AA5039/minerl_data'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Setting up our Neural Network agent
This time we will be using a convolutional policy network that takes 64x64 RGB observations, passes them through three convolutional layers, and then produces, through three separate softmax heads, probabilities over camera movements (5 classes), forward/jump combinations (4 classes), and attack on/off (2 classes). To learn more about policy networks, see [Andrej Karpathy's blog on Policy Gradient networks](http://karpathy.github.io/2016/05/31/rl/).

In [2]:
# Weight initializer intended for policy and value output layers.
def normalized_columns_initializer(std=1.0):
    """Return an initializer that samples Gaussian columns rescaled to norm ``std``.

    The returned callable has the signature
    ``(shape, dtype=None, partition_info=None)``; ``dtype`` and
    ``partition_info`` are accepted but ignored. It draws a standard-normal
    float32 array of the requested shape, rescales it so that the L2 norm
    taken along axis 0 equals ``std``, and wraps the result in ``tf.constant``.
    """
    def _initializer(shape, dtype=None, partition_info=None):
        sample = np.random.randn(*shape).astype(np.float32)
        # L2 norm along axis 0, kept broadcastable against ``sample``.
        axis0_norm = np.sqrt(np.square(sample).sum(axis=0, keepdims=True))
        sample *= std / axis0_norm
        return tf.constant(sample)
    return _initializer

In [3]:
# Width of the flattened convolutional features. For 64x64 inputs the three
# VALID convolutions below yield a 4x4x64 map, i.e. 1024 values per frame.
H = 1024

# Drop whatever graph is currently the default before building the network.
tf.reset_default_graph()

# Batch of RGB frames, 64x64 pixels, 3 channels.
state = tf.placeholder(dtype=tf.float32, shape=[None, 64, 64, 3])


def _conv_relu(inputs, num_outputs, kernel, stride):
    """Bias-free VALID convolution + ReLU with variance-scaling initial weights."""
    return slim.conv2d(
        inputs=inputs,
        num_outputs=num_outputs,
        kernel_size=[kernel, kernel],
        stride=[stride, stride],
        padding='VALID',
        biases_initializer=None,
        activation_fn=tf.nn.relu,
        weights_initializer=slim.variance_scaling_initializer()
    )


# Shared feature extractor; its variables are created under the 'global' scope.
with tf.variable_scope('global'):
    conv1 = _conv_relu(state, 32, 8, 4)
    conv2 = _conv_relu(conv1, 64, 4, 2)
    conv3 = _conv_relu(conv2, 64, 3, 1)
# Flatten the last feature map so each head can apply a single matmul.
convFlat = slim.flatten(conv3)
#print("convFlat: " + str(convFlat))

#with tf.variable_scope('local_1'):
'''
    W_1 = tf.get_variable("W_1", shape=[H,17], initializer=tf.contrib.layers.xavier_initializer())
    score_1 = tf.matmul(convFlat, W_1)
    score_1_mean = tf.reduce_mean(score_1, 1)
    score_1 = score_1 - tf.expand_dims(score_1_mean, 1)
    probability_1 = tf.nn.softmax(score_1)
    real_action_1 = tf.placeholder(shape=[None,17], dtype=tf.int32)
    loss_1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=real_action_1, logits=score_1))
'''
def _softmax_head(index, num_actions):
    """Build one linear softmax head on top of ``convFlat``.

    Creates, inside variable scope ``local_<index>``:
      * the weight matrix ``W_<index>`` of shape [H, num_actions],
      * mean-centred logits and their softmax,
      * an int32 placeholder for the one-hot target actions,
      * the mean softmax cross-entropy loss plus a scalar summary ``loss_<index>``.

    Returns (weights, logits_mean, centred_logits, probability, target, loss).
    """
    with tf.variable_scope('local_%d' % index):
        weights = tf.get_variable("W_%d" % index, shape=[H, num_actions],
                                  initializer=tf.contrib.layers.xavier_initializer())
        raw_logits = tf.matmul(convFlat, weights)
        logits_mean = tf.reduce_mean(raw_logits, 1)
        # Subtract the per-sample mean logit; softmax is invariant to this shift.
        logits = raw_logits - tf.expand_dims(logits_mean, 1)
        probability = tf.nn.softmax(logits)
        target = tf.placeholder(shape=[None, num_actions], dtype=tf.int32)
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(labels=target, logits=logits))
        tf.summary.scalar('loss_%d' % index, loss)
    return weights, logits_mean, logits, probability, target, loss


# Head 2: 5 classes (fed with the camera labels in the training cell).
W_2, score_2_mean, score_2, probability_2, real_action_2, loss_2 = _softmax_head(2, 5)

# Head 3: 4 classes (fed with the jump/forward labels in the training cell).
W_3, score_3_mean, score_3, probability_3, real_action_3, loss_3 = _softmax_head(3, 4)

# Head 4: 2 classes (fed with the attack labels in the training cell).
W_4, score_4_mean, score_4, probability_4, real_action_4, loss_4 = _softmax_head(4, 2)

# One Adam optimizer is shared by the three per-head training ops.
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4)
# Trainable variables of the shared convolutional trunk.
global_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, 'global')

# Scale factor applied to gradients flowing into the shared trunk.
common_net_coeff = 1.0 / 11
# Every gradient tensor is clipped to this L2 norm before being applied.
grad_clip_norm = 40


def _head_gradients(loss, head_weights):
    """Return clipped (gradient, variable) pairs for one head's loss.

    The variable list is the shared trunk followed by the head's own weight
    matrix. Previously only the trunk variables were listed, so the head
    matrices were never updated and stayed at their random initial values.
    Trunk gradients are scaled by ``common_net_coeff``; the head gradient is
    not. Variables the loss does not depend on (gradient ``None``) are skipped.
    """
    num_shared = len(global_vars)
    grads_and_vars = optimizer.compute_gradients(
        loss, var_list=global_vars + [head_weights])

    processed = []
    # compute_gradients returns pairs in var_list order, so the first
    # ``num_shared`` entries belong to the trunk.
    for position, (grad, var) in enumerate(grads_and_vars):
        if grad is None:
            continue
        if position < num_shared:
            grad = grad * common_net_coeff
        processed.append((tf.clip_by_norm(grad, grad_clip_norm), var))
    return processed


gradients_2 = _head_gradients(loss_2, W_2)
gradients_3 = _head_gradients(loss_3, W_3)
gradients_4 = _head_gradients(loss_4, W_4)

# One training op per head; each updates the trunk and that head's weights.
apply_grads_2 = optimizer.apply_gradients(gradients_2)
apply_grads_3 = optimizer.apply_gradients(gradients_3)
apply_grads_4 = optimizer.apply_gradients(gradients_4)

# Single op that evaluates every scalar summary registered in the graph.
merged = tf.summary.merge_all()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.

Instructions for updating:
Use tf.cast instead.


# Train
The MineRL package provides a dataset of human gameplay for improving the efficiency of training. First, we train our network on this dataset; the pretrained network can then be used for Reinforcement Learning, which should reduce training time considerably.

For more information about that dataset, see this [MineRL Dataset Docs](http://minerl.io/docs/tutorials/data_sampling.html).

### Running the Agent and Environment

In [4]:
import minerl
# ---------------------------------------------------------------------------
# Label encoding: recorded human actions -> one-hot targets for the three
# softmax heads. Every helper takes a whole sequence (leading time axis).
# NOTE: the inventory input channels and the place/craft/smelt head (head 1)
# are disabled in this notebook, so only camera, jump/forward and attack are
# encoded here.
# ---------------------------------------------------------------------------
CAMERA_THRESHOLD = 5     # camera deltas with magnitude <= 5 count as "no turn"
SEQUENCE_LEN = 1000      # frames per demonstration sequence / training batch


def _one_hot(indices, depth):
    """Return an int32 array of shape (len(indices), depth) with a single 1 per row."""
    return np.eye(depth, dtype=np.int32)[np.asarray(indices, dtype=np.int64)]


def _encode_camera(camera):
    """One-hot (5 classes) the dominant camera motion of each frame.

    ``camera`` has one row per frame with two components.
      class 0 / 1: component 1 has the larger magnitude, exceeds the
                   threshold, and is negative / positive;
      class 2 / 3: component 0 has the larger magnitude, exceeds the
                   threshold, and is positive / negative;
      class 4:     everything else (small or tied movements).
    """
    camera = np.asarray(camera)
    comp0, comp1 = camera[:, 0], camera[:, 1]
    mag0, mag1 = np.abs(comp0), np.abs(comp1)
    comp1_dominant = (mag1 > mag0) & (mag1 > CAMERA_THRESHOLD)
    comp0_dominant = (mag1 < mag0) & (mag0 > CAMERA_THRESHOLD)

    classes = np.full(len(camera), 4, dtype=np.int64)
    classes[comp1_dominant & (comp1 < 0)] = 0
    classes[comp1_dominant & (comp1 > 0)] = 1
    classes[comp0_dominant & (comp0 > 0)] = 2
    classes[comp0_dominant & (comp0 < 0)] = 3
    return _one_hot(classes, 5)


def _encode_move(jump, forward):
    """One-hot (4 classes) of (jump, forward): (0,0)->0, (0,1)->1, (1,0)->2, (1,1)->3."""
    classes = 2 * (np.asarray(jump) == 1) + (np.asarray(forward) == 1)
    return _one_hot(classes, 4)


def _encode_attack(attack):
    """One-hot (2 classes): class 0 when attack == 0, class 1 otherwise."""
    return _one_hot(np.asarray(attack) != 0, 2)


data = minerl.data.make(env_name, data_path)

init = tf.global_variables_initializer()
restore = False
model_path = '/home/kimbring2/MineRL_Git/model/' + env_name
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=5)
    train_writer = tf.summary.FileWriter('/home/kimbring2/MineRL_Git/train_summary/' + env_name, sess.graph)

    if restore:
        ckpt = tf.train.get_checkpoint_state(model_path)
        # get_checkpoint_state returns None when the directory holds no checkpoint.
        if ckpt and ckpt.model_checkpoint_path:
            saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print("No checkpoint found in " + model_path + "; training from scratch")

    episode_count = 0
    try:
        for current_state, action, reward, next_state, done in data.sarsd_iter(
                num_epochs=5000, max_sequence_len=SEQUENCE_LEN):
            length = current_state['pov'].shape[0]
            # Only full-length sequences are used, so every batch has the same size.
            if length != SEQUENCE_LEN:
                continue

            episode_count = episode_count + 1

            feed_dict = {
                # Scale pixel values from [0, 255] to [-0.5, 0.5].
                state: current_state['pov'].astype(np.float32) / 255.0 - 0.5,
                real_action_2: _encode_camera(action['camera']),
                real_action_3: _encode_move(action['jump'], action['forward']),
                real_action_4: _encode_attack(action['attack']),
            }

            # Log all three losses before this batch's updates are applied.
            summary = sess.run(merged, feed_dict=feed_dict)
            train_writer.add_summary(summary, episode_count)

            # Train every head. Previously only apply_grads_3 ran, which left
            # the camera and attack heads untrained although evaluation uses
            # them. The ops share trunk variables and Adam state, so they are
            # run one after another rather than in a single sess.run call.
            for train_op in (apply_grads_2, apply_grads_3, apply_grads_4):
                sess.run(train_op, feed_dict=feed_dict)

            if episode_count % 10 == 0:
                saver.save(sess, model_path + '/model-' + str(episode_count) + '.cptk')
                print("Saved Model")
    finally:
        # Flush pending summaries even when training is interrupted (e.g. Ctrl-C).
        train_writer.close()

Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model


Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Process ForkPoolWorker-1:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, i

KeyboardInterrupt: 

# Test

In [4]:
import minerl
import gym

# Create the gym environment registered under env_name (importing minerl
# above is what makes the MineRL environment ids available to gym).
env = gym.make(env_name)
# Reset the environment and keep the first observation.
obs = env.reset()

In [None]:
import random

# Exploration rate for epsilon-greedy action selection; with 0.0 the agent
# always takes the most probable class of every head.
e = 0.0

# Decoding tables: head output index -> values written into the action dict.
# NOTE: the place/craft/smelt head (head 1) and the inventory input channels
# are disabled in this notebook, so only heads 2-4 are decoded.
CAMERA_ACTIONS = ((0, -5), (0, 5), (5, 0), (-5, 0), (0, 0))   # (camera[0], camera[1])
MOVE_ACTIONS = ((0, 0), (1, 0), (0, 1), (1, 1))               # (forward, jump)


def _epsilon_greedy(probabilities, epsilon):
    """Return argmax of ``probabilities`` with prob. 1 - epsilon, else a uniform random class.

    The random class is drawn from the valid index range only; the previous
    inclusive ``random.randint`` bounds could yield an index one past the
    last class, for which no action was defined.
    """
    if np.random.rand() >= epsilon:
        return int(np.argmax(probabilities))
    return random.randrange(np.asarray(probabilities).shape[-1])


init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=5)

    # Load the most recent checkpoint written by the training cell. Without
    # this step the agent would act with randomly initialised weights.
    path = '/home/kimbring2/MineRL_Git/model/' + env_name
    ckpt = tf.train.get_checkpoint_state(path)
    if ckpt and ckpt.model_checkpoint_path:
        print('Loading Model...')
        saver.restore(sess, ckpt.model_checkpoint_path)
    else:
        print('No checkpoint found in ' + path + '; running with untrained weights')

    env.init()
    obs = env.reset()
    net_reward = 0
    for i in range(0, 500000):
        # Same preprocessing as in training: pixels scaled to [-0.5, 0.5].
        pov = obs['pov'].astype(np.float32) / 255.0 - 0.5

        action = env.action_space.noop()
        action2_probability, action3_probability, action4_probability = sess.run(
            [probability_2, probability_3, probability_4],
            feed_dict={state: [pov]})

        # Head 4: attack off / on.
        action4_index = _epsilon_greedy(action4_probability, e)
        action['attack'] = 0 if action4_index == 0 else 1

        # Head 2: camera step of 5 along one component, or no turn.
        action2_index = _epsilon_greedy(action2_probability, e)
        action['camera'][0], action['camera'][1] = CAMERA_ACTIONS[action2_index]

        # Head 3: forward / jump combination.
        action3_index = _epsilon_greedy(action3_probability, e)
        action['forward'], action['jump'] = MOVE_ACTIONS[action3_index]

        # Movements the network does not control are always switched off.
        action['back'] = 0
        action['left'] = 0
        action['right'] = 0
        action['sprint'] = 0

        obs, reward, done, info = env.step(action)
        # Count the reward before checking ``done`` so that the reward of the
        # final step is included in the total.
        net_reward += reward

        if done:
            break
    print("Total reward: ", net_reward)

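The total reward printed above shows how well the imitation-trained policy performs. In MineRLTreechop-v0 the agent receives +1 for each log it collects, so compare this number against the reward obtained by random actions to judge how much the pretraining helps.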