# Simple Reinforcement Learning in MineRL
This tutorial contains a simple example of how to build a Reinforcement Learning based agent that can solve the MineRLNavigateDense-v0 environment. We use the network learned in the previous code [MineRL Imitation Learning](https://github.com/kimbring2/MineRL/blob/master/MineRL_IL_Recurrent.ipynb).

In [1]:
from __future__ import division

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
%matplotlib inline
import matplotlib.pyplot as plt
import math

try:
    xrange = xrange
except:
    xrange = range
    
#env_name = 'MineRLNavigateDense-v0'
env_name = 'MineRLObtainIronPickaxe-v0'
data_path = '/media/kimbring2/6224AA7924AA5039/minerl_data'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Setting up our Neural Network agent
This time we will be using a Convolutional Neural Network that takes observations, passes them through a single hidden layer, and then produces a probability of choosing a Jump and Camera movement. To learn more about this network, see [Convolutional Neural Networks for Visual Recognition Course](http://cs231n.stanford.edu/).

In [2]:
tf.reset_default_graph()
H = 1024
state = tf.placeholder(shape=[None,64,64,21], dtype=tf.float32)

In [3]:
conv1 = slim.conv2d( \
            inputs=state,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', 
                    biases_initializer=None,activation_fn=tf.nn.relu)
conv2 = slim.conv2d( \
            inputs=conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', 
                    biases_initializer=None,activation_fn=tf.nn.relu)
conv3 = slim.conv2d( \
            inputs=conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', 
                    biases_initializer=None,activation_fn=tf.nn.relu)
convFlat = slim.flatten(conv3)

score_1 = slim.fully_connected(convFlat, 19, activation_fn=None, biases_initializer=None)
probability_1 = tf.nn.softmax(score_1)
act1_stochastic = tf.multinomial(tf.log(probability_1), num_samples=1)
act1_stochastic = tf.reshape(act1_stochastic, shape=[-1])

score_2 = slim.fully_connected(convFlat, 6, activation_fn=None, biases_initializer=None)
probability_2 = tf.nn.softmax(score_2)
act2_stochastic = tf.multinomial(tf.log(probability_2), num_samples=1)
act2_stochastic = tf.reshape(act2_stochastic, shape=[-1])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.random.categorical instead.


In [4]:
conv1_old = slim.conv2d( \
            inputs=state,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', 
                    biases_initializer=None,activation_fn=tf.nn.relu, scope='conv1_old')
conv2_old = slim.conv2d( \
            inputs=conv1_old,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', 
                    biases_initializer=None,activation_fn=tf.nn.relu, scope='conv2_old')
conv3_old = slim.conv2d( \
            inputs=conv2_old,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', 
                    biases_initializer=None,activation_fn=tf.nn.relu, scope='conv3_old')
convFlat_old = slim.flatten(conv3_old)

score_1_old = slim.fully_connected(convFlat_old, 19, activation_fn=None, biases_initializer=None, scope='score_1_old')
probability_1_old = tf.nn.softmax(score_1_old)

score_2_old = slim.fully_connected(convFlat_old, 6, activation_fn=None, biases_initializer=None, scope='score_2_old')
probability_2_old = tf.nn.softmax(score_2_old)

In [5]:
conv1_value = slim.conv2d( \
                inputs=state,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID', 
                        biases_initializer=None,activation_fn=tf.nn.relu, scope='conv1_value')
conv2_value = slim.conv2d( \
                inputs=conv1_value,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID', 
                        biases_initializer=None,activation_fn=tf.nn.relu, scope='conv2_value')
conv3_value = slim.conv2d( \
                inputs=conv2_value,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID', 
                        biases_initializer=None,activation_fn=tf.nn.relu, scope='conv3_value')
convFlat_value = slim.flatten(conv3_value)
value_preds = slim.fully_connected(convFlat_value, 1, activation_fn=None, biases_initializer=None, scope='fc_value')

In [6]:
clip_value = 0.2
c_1 = 1
c_2 = 0.01

action1_holder = tf.placeholder(dtype=tf.int32, shape=[None], name='action1_holder')
action2_holder = tf.placeholder(dtype=tf.int32, shape=[None], name='action2_holder')
rewards_holder = tf.placeholder(dtype=tf.float32, shape=[None], name='rewards_holder')
v_preds_next_holder = tf.placeholder(dtype=tf.float32, shape=[None], name='v_preds_next_holder')
gaes_holder = tf.placeholder(dtype=tf.float32, shape=[None], name='gaes_holder')

act1_probs = probability_1
act2_probs = probability_2

act1_probs_old = probability_1_old
act2_probs_old = probability_2_old

# probabilities of actions which agent took with policy
act1_probs = act1_probs * tf.one_hot(indices=action1_holder, depth=act1_probs.shape[1])
act2_probs = act2_probs * tf.one_hot(indices=action2_holder, depth=act2_probs.shape[1])

act1_probs = tf.reduce_sum(act1_probs, axis=1)
act2_probs = tf.reduce_sum(act2_probs, axis=1)

# probabilities of actions which agent took with old policy
act1_probs_old = act1_probs_old * tf.one_hot(indices=action1_holder, depth=act1_probs_old.shape[1])
act2_probs_old = act2_probs_old * tf.one_hot(indices=action2_holder, depth=act2_probs_old.shape[1])

act1_probs_old = tf.reduce_sum(act1_probs_old, axis=1)
act2_probs_old = tf.reduce_sum(act2_probs_old, axis=1)
##########################################################################################################
ratios1 = tf.exp(tf.log(act1_probs) - tf.log(act1_probs_old))
clipped1_ratios = tf.clip_by_value(ratios1, clip_value_min = 1 - clip_value, clip_value_max = 1 + clip_value)
loss1_clip = tf.minimum(tf.multiply(gaes_holder, ratios1), tf.multiply(gaes_holder, clipped1_ratios))
loss1_clip = tf.reduce_mean(loss1_clip)

ratios2= tf.exp(tf.log(act2_probs) - tf.log(act2_probs_old))
clipped2_ratios = tf.clip_by_value(ratios2, clip_value_min = 1 - clip_value, clip_value_max = 1 + clip_value)
loss2_clip = tf.minimum(tf.multiply(gaes_holder, ratios2), tf.multiply(gaes_holder, clipped2_ratios))
loss2_clip = tf.reduce_mean(loss2_clip)

##########################################################################################################
v_preds = value_preds
loss_vf = tf.squared_difference(rewards_holder + 0.95 * v_preds_next_holder, v_preds)
loss_vf = tf.reduce_mean(loss_vf)
##########################################################################################################
entropy1 = -tf.reduce_sum(probability_1 * tf.log(tf.clip_by_value(probability_1, 1e-10, 1.0)), axis=1)
entropy1 = tf.reduce_mean(entropy1, axis=0)  # mean of entropy of pi(obs)

entropy2 = -tf.reduce_sum(probability_2 * tf.log(tf.clip_by_value(probability_2, 1e-10, 1.0)), axis=1)
entropy2 = tf.reduce_mean(entropy2, axis=0)  # mean of entropy of pi(obs)

##########################################################################################################
loss1 = loss1_clip - c_1 * loss_vf + c_2 * entropy1
loss1 = -loss1  # minimize -loss == maximize loss

loss2 = loss2_clip - c_1 * loss_vf + c_2 * entropy2
loss2 = -loss2  # minimize -loss == maximize loss

loss = loss1 + loss2
##########################################################################################################
variables = slim.get_variables_to_restore()
pi_trainable = [v for v in variables if v.name.split('/')[0] not in ['conv1_value', 'conv2_value', 'conv3_value', 'fc_value',
                                                                     'conv1_old', 'conv2_old', 'conv3_old', 
                                                                     'score_1_old', 'score_2_old']]
optimizer = tf.train.AdamOptimizer(learning_rate=0.00001, epsilon=1e-5)
train_op = optimizer.minimize(loss, var_list=pi_trainable)

Instructions for updating:
Use tf.cast instead.


# Train

In [7]:
import minerl
import gym
import os
import random
import copy
from env_wrappers import ContinuingTimeLimitMonitor

env = gym.make(env_name)

e = 0.01
update_frequency = 5

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for i in range(0, 5000):
        total_reward = []
        total_length = []
    
        env.init()
        obs = env.reset()
        
        net_reward = 0
        observations = []
        actions1 = []
        actions2 = []
        rewards = []
        v_preds = []
        reward = 0
        for j in range(0, 4500):
            #print("current_state['inventory']: " + str(current_state['inventory']))
            pov = obs['pov'].astype(np.float32) / 255.0 - 0.5
            inventory = obs['inventory']
            #print("compass: " + str(compass))
        
            coal = inventory['coal']
            cobblestone = inventory['cobblestone']
            crafting_table = inventory['crafting_table']
            dirt = inventory['dirt']
            furnace = inventory['furnace']
            iron_axe = inventory['iron_axe']
            iron_ingot = inventory['iron_ingot']
            iron_ore = inventory['iron_ore']
            iron_pickaxe = inventory['iron_pickaxe']
            log = inventory['log']
            planks = inventory['planks']
            stick = inventory['stick']
            stone = inventory['stone']
            stone_axe = inventory['stone_axe']
            stone_pickaxe = inventory['stone_pickaxe']
            torch = inventory['torch']
            wooden_axe = inventory['wooden_axe']
            wooden_pickaxe = inventory['wooden_pickaxe']
          
            coal_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*coal
            cobblestone_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*cobblestone
            crafting_table_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*crafting_table
            dirt_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*dirt
            furnace_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*furnace
            iron_axe_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*iron_axe
            iron_ingot_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*iron_ingot
            iron_ore_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*iron_ore
            iron_pickaxe_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*iron_pickaxe
            log_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*log
            planks_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*planks
            stick_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*stick
            stone_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*stone
            stone_axe_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*stone_axe
            stone_pickaxe_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*stone_pickaxe
            torch_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*torch
            wooden_axe_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*wooden_axe
            wooden_pickaxe_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*wooden_pickaxe
            
            state_channel = np.concatenate([pov, coal_channel, cobblestone_channel, crafting_table_channel, dirt_channel, furnace_channel, 
                                            iron_axe_channel, iron_ingot_channel, iron_ore_channel, iron_pickaxe_channel, log_channel, 
                                            planks_channel, stick_channel, stone_channel, stone_axe_channel, stone_pickaxe_channel,
                                            torch_channel, wooden_axe_channel, wooden_pickaxe_channel], axis=-1)
        
            action = env.action_space.noop()
            action1_index, action2_index, v_pred = sess.run([act1_stochastic, act2_stochastic, value_preds], 
                                                             feed_dict={state:[state_channel]})
            
            #print("action1_index: " + str(action1_index))
            #print("action2_index: " + str(action2_index))
            #print("")
            if (action1_index == 0):
                action['place'] = 1; action['craft'] = 0;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 1):
                action['place'] = 2; action['craft'] = 0;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 2):
                action['place'] = 3; action['craft'] = 0;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 3):
                action['place'] = 4; action['craft'] = 0;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 4):
                action['place'] = 5; action['craft'] = 0;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 5):
                action['place'] = 0; action['craft'] = 1; 
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 6):
                action['place'] = 0; action['craft'] = 2;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 7):
                action['place'] = 0; action['craft'] = 3;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 8):
                action['place'] = 0; action['craft'] = 4;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            elif (action1_index == 9):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 1; action['nearbySmelt'] = 0
            elif (action1_index == 10):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 2; action['nearbySmelt'] = 0
            elif (action1_index == 11):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 3; action['nearbySmelt'] = 0
            elif (action1_index == 12):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 4; action['nearbySmelt'] = 0
            elif (action1_index == 13):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 5; action['nearbySmelt'] = 0
            elif (action1_index == 14):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 6; action['nearbySmelt'] = 0
            elif (action1_index == 15):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 7; action['nearbySmelt'] = 0
            elif (action1_index == 16):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 1
            elif (action1_index == 17):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 2
            elif (action1_index == 18):
                action['place'] = 0; action['craft'] = 0;
                action['nearbyCraft'] = 0; action['nearbySmelt'] = 0
            
            
            if (action2_index == 0):
                action['camera'][0] = 0; action['camera'][1] = -1; action['forward'] = 0; action['jump'] = 0; 
                action['attack'] = 1
            elif (action2_index == 1):
                action['camera'][0] = 0; action['camera'][1] = 1; action['forward'] = 0; action['jump'] = 0;
                action['attack'] = 1
            elif (action2_index == 2):
                action['camera'][0] = 1; action['camera'][1] = 0; action['forward'] = 0; action['jump'] = 0;  
                action['attack'] = 1
            elif (action2_index == 3):
                action['camera'][0] = -1; action['camera'][1] = 0; action['forward'] = 0; action['jump'] = 0; 
                action['attack'] = 1
            elif (action2_index == 4):
                action['camera'][0] = 0; action['camera'][1] = 0; action['forward'] = 0; action['jump'] = 0; 
                action['attack'] = 1
            elif (action2_index == 5):
                action['camera'][0] = 0; action['camera'][1] = 0; action['forward'] = 1; action['jump'] = 1; 
                action['attack'] = 0
            
                    
            action['back'] = 0
            action['left'] = 0
            action['right'] = 0

            v_preds.append(v_pred)
            observations.append(state_channel)
            actions1.append(action1_index)
            actions2.append(action2_index)
            rewards.append(reward)
            
            
            if (action2_index == 4):
                obs1, reward, done, info = env.step(action)
                net_reward += reward
                
                for k in range(0, 10):
                    if (done == False):
                        obs1, reward, done, info = env.step(action)
                        net_reward += reward
                    else:
                        break
            else:
                obs1, reward, done, info = env.step(action)
            
            reward = reward - 0.0005
            
            pov1 = obs1['pov'].astype(np.float32) / 255.0 - 0.5
            inventory1 = obs1['inventory']
            
            coal1 = inventory1['coal']
            cobblestone1 = inventory1['cobblestone']
            crafting_table1 = inventory1['crafting_table']
            dirt1 = inventory1['dirt']
            furnace1 = inventory1['furnace']
            iron_axe1 = inventory1['iron_axe']
            iron_ingot1 = inventory1['iron_ingot']
            iron_ore1 = inventory1['iron_ore']
            iron_pickaxe1 = inventory1['iron_pickaxe']
            log1 = inventory1['log']
            planks1 = inventory1['planks']
            stick1 = inventory1['stick']
            stone1 = inventory1['stone']
            stone_axe1 = inventory1['stone_axe']
            stone_pickaxe1 = inventory1['stone_pickaxe']
            torch1 = inventory1['torch']
            wooden_axe1 = inventory1['wooden_axe']
            wooden_pickaxe1 = inventory1['wooden_pickaxe']
                
            coal_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*coal1
            cobblestone_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*cobblestone1
            crafting_table_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*crafting_table1
            dirt_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*dirt1
            furnace_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*furnace1
            iron_axe_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*iron_axe1
            iron_ingot_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*iron_ingot1
            iron_ore_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*iron_ore1
            iron_pickaxe_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*iron_pickaxe1
            log_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*log1
            planks_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*planks1
            stick_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*stick1
            stone_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*stone1
            stone_axe_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*stone_axe1
            stone_pickaxe_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*stone_pickaxe1
            torch_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*torch1
            wooden_axe_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*wooden_axe1
            wooden_pickaxe_channel1 = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32)*wooden_pickaxe1
            
            state_channel1 = np.concatenate([pov1, coal_channel1, cobblestone_channel1, crafting_table_channel1, dirt_channel1, 
                                             furnace_channel1, iron_axe_channel1, iron_ingot_channel1, iron_ore_channel1, 
                                             iron_pickaxe_channel1, log_channel1, planks_channel1, stick_channel1, stone_channel1, 
                                             stone_axe_channel1, stone_pickaxe_channel1, torch_channel1, wooden_axe_channel1, 
                                             wooden_pickaxe_channel1], axis=-1)
            
            obs = obs1
            net_reward += reward
            
            if ( (done == True) | (j == 4499) ):
                v_preds_next = v_preds[1:] + [0]
                
                print("Total reward: ", net_reward)
                print("j: ", j)
                print("Inventory1: ", inventory1)
            
                total_reward.append(net_reward)
                #total_length.append(j)
                e = e * 0.999
                break
            
        gamma_ppo = 0.95
        deltas = [r_t + gamma_ppo * v_next - v for r_t, v_next, v in zip(rewards, v_preds_next, v_preds)]
        
        # calculate generative advantage estimator(lambda = 1), see ppo paper eq(11)
        gaes = copy.deepcopy(deltas)
        for t in reversed(range(len(gaes) - 1)):  # is T-1, where T is time step which run policy
            gaes[t] = gaes[t] + gamma_ppo * gaes[t + 1]
            
        gaes = np.array(gaes).astype(dtype=np.float32)
        gaes = (gaes - gaes.mean()) / gaes.std()
        gaes = np.squeeze(gaes)
        
        trainable_variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
        pi_trainable = [v for v in trainable_variables if v.name.split('/')[0] not in ['conv1_value','conv2_value','conv3_value','fc_value', 
                                                                                       'conv1_old','conv2_old','conv3_old', 
                                                                                       'score_1_old','score_2_old']]
        old_pi_trainable = [v for v in trainable_variables if v.name.split('/')[0] in ['conv1_old','conv2_old','conv3_old', 
                                                                                       'score_1_old','score_2_old']]
                            
        assign_ops = []
        for v_old, v in zip(old_pi_trainable, pi_trainable):
            assign_ops.append(tf.assign(v_old, v))
            
        sess.run(assign_ops)
        
        actions1 = np.array(actions1).astype(dtype=np.int32)
        actions2 = np.array(actions2).astype(dtype=np.int32)
        
        actions1 = np.squeeze(actions1)
        actions2 = np.squeeze(actions2)
        
        inp = [observations, actions1, actions2, rewards, v_preds_next, gaes]
        
        # train
        for epoch in range(8):
            sample_indices = np.random.randint(low=0, high=np.array(observations).shape[0], size=512)  # indices are in [low, high)
            sampled_inp = [np.take(a=a, indices=sample_indices, axis=0) for a in inp]  # sample training data
            _, loss1_val, loss2_val = sess.run([train_op, loss1, loss2], feed_dict={state: sampled_inp[0],
                                                                                    action1_holder: sampled_inp[1],
                                                                                    action2_holder: sampled_inp[2],
                                                                                    rewards_holder: sampled_inp[3],
                                                                                    v_preds_next_holder: sampled_inp[4],
                                                                                    gaes_holder: sampled_inp[5]})
            #print("loss1_val: " + str(loss1_val))
            #print("loss2_val: " + str(loss2_val))

Total reward:  -0.45600000000000035
j:  911
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -1.0634999999999384
j:  2126
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.3745000000000003
j:  748
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total

Total reward:  -1.0989999999999345
j:  2197
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 2, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  40.954999999995536
j:  2089
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 4, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 8, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.23000000000000018
j:  459
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total

Total reward:  82.99499999999676
j:  2009
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 2, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 16, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -1.0199999999999432
j:  2039
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -1.1239999999999317
j:  2247
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total

Total reward:  -1.094999999999935
j:  2189
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.11450000000000009
j:  228
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.2720000000000002
j:  543
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total 

Total reward:  -0.7854999999999689
j:  1570
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 9, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  28.85400000000115
j:  2291
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -1.0249999999999426
j:  2049
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total 

Total reward:  85.40149999999794
j:  1196
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 3, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 1, 'planks': 0, 'stick': 16, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.47050000000000036
j:  940
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 19, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.5179999999999984
j:  1035
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Tota

Total reward:  -0.9499999999999508
j:  1899
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 3, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -1.1099999999999333
j:  2219
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.3700000000000003
j:  739
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total

Total reward:  -0.8819999999999583
j:  1763
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.23850000000000018
j:  476
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  41.01999999999602
j:  1959
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 12, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 8, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total

Total reward:  -0.12550000000000008
j:  250
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.11950000000000009
j:  238
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.7899999999999684
j:  1579
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 5, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Tota

Total reward:  -0.6999999999999783
j:  1399
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 8, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  26.900000000001903
j:  2199
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -1.031499999999942
j:  2062
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total 

Total reward:  -0.8349999999999634
j:  1669
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 1, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  17.013500000000413
j:  1972
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -1.0999999999999344
j:  2199
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total

Total reward:  -0.8399999999999629
j:  1679
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 4, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  44.99749999999766
j:  2004
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.4265000000000003
j:  852
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total r

Total reward:  17.00500000000085
j:  1989
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  40.99049999999656
j:  2018
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 8, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  40.914999999995175
j:  2169
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 8, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total rew

Total reward:  -0.2640000000000002
j:  527
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  40.98999999999589
j:  2019
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 8, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.21650000000000016
j:  432
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total r

Total reward:  -1.0749999999999371
j:  2149
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  41.054999999995886
j:  1889
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 8, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  41.078999999996505
j:  1841
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 8, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total 

Total reward:  -0.2715000000000002
j:  542
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.22100000000000017
j:  441
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -0.7769999999999698
j:  1553
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total

Total reward:  -1.0284999999999422
j:  2056
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  21.053000000001074
j:  1893
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -1.0629999999999384
j:  2125
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total

TypeError: a bytes-like object is required, not 'NoneType'