# Complex Imitation Learning in MineRL
This tutorial contains an example of how to build an imitation-learning-based agent for the MineRLObtainIronPickaxe-v0 environment (the environment selected by `env_name` in the code below). For more information about the MineRL environments, see the [MineRL Docs](http://minerl.io/docs/environments/index.html).

For more imitation-learning algorithms, such as DAgger implemented in TensorFlow, see this GitHub repo: [Dagger](https://github.com/zsdonghao/Imitation-Learning-Dagger-Torcs).

Parts of this tutorial are based on code by [Arthur Juliani](https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724).

In [1]:
from __future__ import division

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
%matplotlib inline
import matplotlib.pyplot as plt
import math

# Python 2/3 compatibility shim: Python 3 has no builtin `xrange`, so fall back
# to `range`. Only NameError is caught so that unrelated failures stay visible.
try:
    xrange = xrange
except NameError:
    xrange = range

# MineRL environment whose human demonstrations are used for training.
#env_name = 'MineRLTreechop-v0'
env_name = 'MineRLObtainIronPickaxe-v0'
# Directory handed to minerl.data.make() as the dataset location.
data_path = '/media/kimbring2/6224AA7924AA5039/minerl_data'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Setting up our Neural Network agent
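This time we will be using a policy neural network with two parallel branches. Each branch takes the 64x64 RGB point-of-view image and the 18 inventory counts, passes the image through three small convolutional layers, embeds both inputs with fully connected layers, and concatenates the result. The first branch produces a probability distribution over 21 discrete attack/place/craft/smelt actions, and the second over 20 discrete camera/forward/jump actions. To learn more about policy networks, see [Andrej Karpathy's blog on Policy Gradient networks](http://karpathy.github.io/2016/05/31/rl/).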

In [2]:
# Builds the TF1 graph for two independent policy branches that share the same
# kind of inputs (POV image + inventory vector) but predict different action
# groups. NOTE: slim auto-names unscoped layers by creation order, so the
# statement order below determines the variable names stored in checkpoints.

# Width of the concatenated feature vector feeding each output layer:
# 256 (POV embedding) + 256 (inventory embedding).
H_1 = 512
H_2 = 512

# Start from an empty default graph so re-running this cell does not pile up
# duplicate ops/variables.
tf.reset_default_graph()

# ---- Branch 1: features for the 21-way action head -------------------------
# POV frames (batch, 64, 64, 3) and inventory vectors (batch, 18).
state_pov_1 = tf.placeholder(shape=[None,64,64,3], dtype=tf.float32)
state_item_1 = tf.placeholder(shape=[None,18], dtype=tf.float32)
# Three stacked 2x2, stride-1, VALID-padded convolutions with 3 output
# channels each, no bias term, ReLU activation.
conv1_pov_1 = slim.conv2d(inputs=state_pov_1, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv2_pov_1 = slim.conv2d(inputs=conv1_pov_1, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv3_pov_1 = slim.conv2d(inputs=conv2_pov_1, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
convFlat_pov_1 = slim.flatten(conv3_pov_1)
# Embed image features and inventory counts into 256 units each.
FC_pov_1 = slim.fully_connected(convFlat_pov_1, 256, scope='fc_pov_1')
FC_item_1 = slim.fully_connected(state_item_1, 256, scope='fc_item_1')

#print("FC_pov_1: " + str(FC_pov_1))
#print("FC_item_1: " + str(FC_item_1))

# Concatenate along the feature axis -> width 512 (= H_1).
convFlat_1 = tf.concat([FC_pov_1, FC_item_1], 1)
#print("convFlat_1: " + str(convFlat_1))

# ---- Branch 2: features for the 20-way action head -------------------------
# Same layer layout as branch 1 but with its own placeholders and weights.
# The two fully connected layers here have no explicit scope, so slim assigns
# their names automatically.
state_pov_2 = tf.placeholder(shape=[None,64,64,3], dtype=tf.float32)
state_item_2 = tf.placeholder(shape=[None,18], dtype=tf.float32)
conv1_pov_2 = slim.conv2d(inputs=state_pov_2, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv2_pov_2 = slim.conv2d(inputs=conv1_pov_2, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv3_pov_2 = slim.conv2d(inputs=conv2_pov_2, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
convFlat_pov_2 = slim.flatten(conv3_pov_2)
FC_pov_2 = slim.fully_connected(convFlat_pov_2, 256)
FC_item_2 = slim.fully_connected(state_item_2, 256)

convFlat_2 = tf.concat([FC_pov_2, FC_item_2], 1)

# ---- Head 1: 21 classes -----------------------------------------------------
# Bias-free linear layer from the 512-wide features to 21 logits.
W_1 = tf.get_variable("W_1", shape=[H_1,21],
            initializer=tf.contrib.layers.xavier_initializer())
score_1 = tf.matmul(convFlat_1, W_1)
probability_1 = tf.nn.softmax(score_1)
# One-hot demonstration labels, shape (batch, 21).
real_action_1 = tf.placeholder(shape=[None,21], dtype=tf.int32)
# Mean softmax cross-entropy between the demonstrated action and the logits.
loss_1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=real_action_1, 
                                                                logits=score_1))
#tf.summary.scalar('loss_1', loss_1)

# ---- Head 2: 20 classes -----------------------------------------------------
W_2 = tf.get_variable("W_2", shape=[H_2,20],
            initializer=tf.contrib.layers.xavier_initializer())
score_2 = tf.matmul(convFlat_2, W_2)
probability_2 = tf.nn.softmax(score_2)
# One-hot demonstration labels, shape (batch, 20).
real_action_2 = tf.placeholder(shape=[None,20], dtype=tf.int32)
loss_2 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=real_action_2, 
                                                                logits=score_2))

# Both heads are optimised jointly through the sum of their losses.
loss = loss_1 + loss_2
#tf.summary.scalar('loss_2', loss_2)
tf.summary.scalar('loss', loss)

#train_step_1 = tf.train.AdamOptimizer(0.0001).minimize(loss_1)
#train_step_2 = tf.train.AdamOptimizer(0.0001).minimize(loss_2)
# Single Adam optimiser (learning rate 1e-4) over the combined loss.
train_step = tf.train.AdamOptimizer(0.0001).minimize(loss)

# Merge every registered summary (currently only 'loss') into one op.
merged = tf.summary.merge_all()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



# Train
The MineRL package provides a dataset of human gameplay for improving the efficiency of training. First, we train our network on this dataset; the pretrained network can then be used as a starting point for reinforcement learning, which should reduce training time considerably.

For more information about that dataset, see this [MineRL Dataset Docs](http://minerl.io/docs/tutorials/data_sampling.html).

### Running the Agent and Environment

In [3]:
import minerl

# Inventory entries fed to the network, in the order expected by the 18-wide
# item input of the graph.
INVENTORY_KEYS = [
    'coal', 'cobblestone', 'crafting_table', 'dirt', 'furnace', 'iron_axe',
    'iron_ingot', 'iron_ore', 'iron_pickaxe', 'log', 'planks', 'stick',
    'stone', 'stone_axe', 'stone_pickaxe', 'torch', 'wooden_axe',
    'wooden_pickaxe',
]

NUM_ACTIONS_1 = 21      # width of real_action_1
NUM_ACTIONS_2 = 20      # width of real_action_2
CAMERA_THRESHOLD = 5    # camera deltas at or below this magnitude count as "no turn"

# Layout of head-1 classes: (action key, first class index, number of choices).
# Class 0 is "attack" and class 20 is "none of the above". Enum value v
# (1..count) maps to class first_index + v - 1, which is the layout the
# evaluation cell decodes (class 1 -> place 'dirt', class 7 -> craft 'torch',
# ...). Value 0 is skipped on purpose: the previous code mapped value 0 to the
# first class of each group, shifting every label by one relative to the
# decoder. NOTE(review): this assumes the dataset encodes 'none' as 0 -- confirm
# against the installed MineRL version.
ENUM_ACTION_LAYOUT = [
    ('place', 1, 6),
    ('craft', 7, 4),
    ('nearbyCraft', 11, 7),
    ('nearbySmelt', 18, 2),
]


def one_hot(index, size):
    """Return a list of `size` zeros with a single 1 at position `index`."""
    vector = [0] * size
    vector[index] = 1
    return vector


def inventory_vector(inventory, i):
    """Return frame `i`'s inventory counts as an array ordered by INVENTORY_KEYS."""
    return np.array([inventory[key][i] for key in INVENTORY_KEYS])


def encode_action_1(action, i):
    """One-hot encode frame `i`'s attack/place/craft/smelt action (21 classes).

    Priority follows the original chain: attack, place, craft, nearbyCraft,
    nearbySmelt, and finally the no-op class 20.
    """
    if action['attack'][i] == 1:
        return one_hot(0, NUM_ACTIONS_1)
    for key, first_index, count in ENUM_ACTION_LAYOUT:
        value = action[key][i]
        for choice in range(1, count + 1):
            if value == choice:
                return one_hot(first_index + choice - 1, NUM_ACTIONS_1)
    return one_hot(NUM_ACTIONS_1 - 1, NUM_ACTIONS_1)


def encode_action_2(action, i):
    """One-hot encode frame `i`'s camera/jump/forward action (20 classes).

    The class index is 4 * camera_group + 2 * jump + forward, where
    camera_group is:
      0: camera[1] negative and dominant, 1: camera[1] positive and dominant,
      2: camera[0] positive and dominant, 3: camera[0] negative and dominant,
      4: no camera component exceeds CAMERA_THRESHOLD (or the two tie).
    Unlike the original chain, a label is always produced, so a stale label
    from the previous frame can never leak into the batch.
    """
    cam_0 = action['camera'][i][0]
    cam_1 = action['camera'][i][1]
    if abs(cam_1) > abs(cam_0) and abs(cam_1) > CAMERA_THRESHOLD:
        group = 0 if cam_1 < 0 else 1
    elif abs(cam_0) > abs(cam_1) and abs(cam_0) > CAMERA_THRESHOLD:
        group = 2 if cam_0 > 0 else 3
    else:
        group = 4
    jump = 1 if action['jump'][i] == 1 else 0
    forward = 1 if action['forward'][i] == 1 else 0
    return one_hot(4 * group + 2 * jump + forward, NUM_ACTIONS_2)


data = minerl.data.make(env_name, data_path)

init = tf.global_variables_initializer()
restore = False
with tf.Session() as sess:
    rendering = False
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=5)
    train_writer = tf.summary.FileWriter('/home/kimbring2/MineRL/train_summary/' + env_name, sess.graph)
    model_path = '/home/kimbring2/MineRL/model/' + env_name

    if restore:
        ckpt = tf.train.get_checkpoint_state(model_path)
        # get_checkpoint_state returns None when the directory has no checkpoint.
        if ckpt is None:
            raise IOError('No checkpoint found in ' + model_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

    # Counts processed demonstration sequences (one optimiser step each).
    episode_count = 0
    try:
        for current_state, action, reward, next_state, done in data.sarsd_iter(num_epochs=500, max_sequence_len=200):
            length = current_state['pov'].shape[0]
            if length == 0:
                # Nothing to stack; np.stack would raise on an empty list.
                continue

            inventory = current_state['inventory']
            # Scale pixels from [0, 255] to [-0.5, 0.5].
            states_pov_list = [current_state['pov'][i].astype(np.float32) / 255.0 - 0.5
                               for i in range(length)]
            states_item_list = [inventory_vector(inventory, i) for i in range(length)]
            action1_list = [encode_action_1(action, i) for i in range(length)]
            action2_list = [encode_action_2(action, i) for i in range(length)]

            episode_count = episode_count + 1

            states_pov = np.stack(states_pov_list, 0)
            states_item = np.stack(states_item_list, 0)
            # Both branches receive the same observations.
            feed_dict = {state_pov_1: states_pov,
                         state_item_1: states_item,
                         state_pov_2: states_pov,
                         state_item_2: states_item,
                         real_action_1: np.stack(action1_list, 0),
                         real_action_2: np.stack(action2_list, 0)}

            if episode_count % 100 == 0:
                # Log the loss and checkpoint every 100 sequences. The summary
                # run already performs the training step, so it must not be
                # repeated (the original applied two updates on these batches).
                summary, _ = sess.run([merged, train_step], feed_dict=feed_dict)
                train_writer.add_summary(summary, episode_count)
                saver.save(sess, model_path + '/model-' + str(episode_count) + '.cptk')
                print("Saved Model")
            else:
                sess.run(train_step, feed_dict=feed_dict)
    finally:
        # Flush pending summaries even when training is interrupted.
        train_writer.close()

Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model


Process ForkPoolWorker-1:
Process ForkPoolWorker-2:
Process ForkPoolWorker-3:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 47, in starmapstar
    return list(itertools.starmap(args[0], args[1]))
  File "/home/kimbring2/minerl_env/lib/python3.6/site-packages/minerl/data/data_pipeline.py", line 416, in _load_data_pyfunc
    data_queue.put(batches)
  File "<string>", line 2, in put
  File "/usr/lib/python3.6/multiprocessing/managers.py", line 757, in _callmethod
    kind, result = conn.recv()
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3

KeyboardInterrupt: 

Here we run the neural network agent, and have it act in the MineRL environment.

In [None]:
# Disabled smoke test, kept inside a bare string literal so it never executes:
# it drives MineRLNavigateDense-v0 with one fixed action until the episode
# ends and prints the accumulated reward after every step.
'''
import minerl
import gym
env = gym.make('MineRLNavigateDense-v0')

obs  = env.reset()
done = False
net_reward = 0

while not done:
    action = env.action_space.noop()

    action['camera'] = [0, -10]
    action['back'] = 0
    action['forward'] = 1
    action['jump'] = 1
    action['attack'] = 1

    obs, reward, done, info = env.step(
        action)

    net_reward += reward
    print("Total reward: ", net_reward)
'''

In [None]:
#import minerl
#data = minerl.data.make('MineRLNavigateDense-v0', '/home/kimbring2/MineRL/data/')

# Test

In [None]:
import minerl
import gym

# Create the environment named by `env_name` so that it always matches the
# checkpoint directory the evaluation cell loads from (model/<env_name>).
env = gym.make(env_name)
#obs = env.reset()

In [None]:
import random

# Probability of replacing the greedy action with a uniformly random one.
e = 0.1

# Inventory entries fed to the network, in the order expected by the 18-wide
# item input of the graph.
INVENTORY_KEYS = [
    'coal', 'cobblestone', 'crafting_table', 'dirt', 'furnace', 'iron_axe',
    'iron_ingot', 'iron_ore', 'iron_pickaxe', 'log', 'planks', 'stick',
    'stone', 'stone_axe', 'stone_pickaxe', 'torch', 'wooden_axe',
    'wooden_pickaxe',
]


def _primary_action(**overrides):
    """Return the head-1 action fields, all idle except for `overrides`."""
    fields = {'attack': 0, 'place': 'none', 'craft': 'none',
              'nearbyCraft': 'none', 'nearbySmelt': 'none'}
    fields.update(overrides)
    return fields


# Head-1 class index -> action fields (21 entries):
# 0 attack, 1-6 place, 7-10 craft, 11-17 nearbyCraft, 18-19 nearbySmelt, 20 no-op.
ACTION_1_TABLE = (
    [_primary_action(attack=1)]
    + [_primary_action(place=item) for item in
       ('dirt', 'stone', 'cobblestone', 'crafting_table', 'furnace', 'torch')]
    + [_primary_action(craft=item) for item in
       ('torch', 'stick', 'planks', 'crafting_table')]
    + [_primary_action(nearbyCraft=item) for item in
       ('wooden_axe', 'wooden_pickaxe', 'stone_axe', 'stone_pickaxe',
        'iron_axe', 'iron_pickaxe', 'furnace')]
    + [_primary_action(nearbySmelt=item) for item in ('iron_ingot', 'coal')]
    + [_primary_action()]
)

# Head-2 camera groups: (camera[0], camera[1]) applied for class indices
# 0-3, 4-7, 8-11, 12-15 and 16-19 respectively.
CAMERA_MOVES = [(0, -10), (0, 10), (10, 0), (-10, 0), (0, 0)]


def decode_action_2(index):
    """Return (camera_0, camera_1, forward, jump) for a head-2 class index.

    The training labels are laid out as 4 * camera_group + 2 * jump + forward,
    so the offset inside a group is decoded as jump = offset // 2 and
    forward = offset % 2. (The previous if/elif chain had the two swapped, so
    the agent jumped where the demonstrations walked forward and vice versa.)
    """
    group, offset = divmod(index, 4)
    jump, forward = divmod(offset, 2)
    camera_0, camera_1 = CAMERA_MOVES[group]
    return camera_0, camera_1, forward, jump


def choose_index(probabilities, num_actions):
    """Epsilon-greedy choice: argmax with probability 1 - e, else uniform.

    random.randrange excludes its upper bound, so the sampled index is always
    a valid class (randint's inclusive bound previously produced out-of-range
    indices).
    """
    if np.random.rand() >= e:
        return int(np.argmax(probabilities))
    return random.randrange(num_actions)


init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=5)
    #train_writer = tf.summary.FileWriter('/home/kimbring2/MineRL/train_summary', sess.graph)

    print('Loading Model...')
    path = '/home/kimbring2/MineRL/model/' + env_name
    ckpt = tf.train.get_checkpoint_state(path)
    # get_checkpoint_state returns None when the directory has no checkpoint.
    if ckpt is None:
        raise IOError('No checkpoint found in ' + path)
    saver.restore(sess, ckpt.model_checkpoint_path)

    env.init()
    obs = env.reset()
    net_reward = 0
    done = False
    while not done:
        # Same preprocessing as in training: pixels scaled to [-0.5, 0.5].
        state_pov = obs['pov'].astype(np.float32) / 255.0 - 0.5
        inventory = obs['inventory']
        state_item = np.array([inventory[key] for key in INVENTORY_KEYS])

        # One forward pass yields both heads' distributions.
        action1_probability, action2_probability = sess.run(
            [probability_1, probability_2],
            feed_dict={state_pov_1: [state_pov],
                       state_item_1: [state_item],
                       state_pov_2: [state_pov],
                       state_item_2: [state_item]})

        action = env.action_space.noop()

        action1_index = choose_index(action1_probability, len(ACTION_1_TABLE))
        for key, value in ACTION_1_TABLE[action1_index].items():
            action[key] = value

        action2_index = choose_index(action2_probability, 4 * len(CAMERA_MOVES))
        camera_0, camera_1, forward, jump = decode_action_2(action2_index)
        action['camera'][0] = camera_0
        action['camera'][1] = camera_1
        action['forward'] = forward
        action['jump'] = jump

        # Movement keys the policy never uses are held idle.
        action['back'] = 0
        action['left'] = 0
        action['right'] = 0
        action['sprint'] = 0

        # Feed the new observation back into the next iteration; the original
        # stored it in an unused name, so the agent only ever saw the first frame.
        obs, reward, done, info = env.step(action)
        # Accumulate before leaving the loop so the final step's reward counts.
        net_reward += reward
    print("Total reward: ", net_reward)

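The total reward printed above shows how the imitation-trained network performs in the environment. Compare it against a random-action baseline to judge how much the human demonstrations helped; the pretrained weights can then be used as the starting point for reinforcement learning.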