# Simple Reinforcement Learning in MineRL
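This tutorial contains a simple example of how to build a Reinforcement Learning based agent for a MineRL environment. It was originally written for MineRLNavigateDense-v0; the code below is configured for MineRLObtainIronPickaxe-v0 (the MineRLNavigateDense-v0 setting is left commented out). Training starts from the network weights learned in the previous notebook, [MineRL Imitation Learning](https://github.com/kimbring2/MineRL/blob/master/MineRL_IL_Recurrent.ipynb).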

In [1]:
from __future__ import division

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
%matplotlib inline
import matplotlib.pyplot as plt
import math

# Python 2/3 compatibility shim: `xrange` only exists on Python 2, so fall back
# to `range` on Python 3. Only NameError is caught so that unrelated failures
# are not silently hidden.
try:
    xrange = xrange
except NameError:
    xrange = range

# Gym environment id used by the training cell; it is also appended to the
# checkpoint directory path there.
#env_name = 'MineRLNavigateDense-v0'
env_name = 'MineRLObtainIronPickaxe-v0'
# NOTE(review): machine-specific absolute path; presumably the MineRL dataset
# root. Nothing in the visible cells reads it - confirm before relying on it.
data_path = '/media/kimbring2/6224AA7924AA5039/minerl_data'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Default discount factor applied to future rewards.
gamma = 0.99

def discount_rewards(r, discount_factor=None):
    """Compute the discounted return for every step of a 1D reward sequence.

    For each index t the result is r[t] + d * r[t+1] + d**2 * r[t+2] + ...,
    accumulated in a single backwards pass.

    Args:
        r: 1D sequence of rewards (NumPy array or anything `np.asarray` accepts).
        discount_factor: discount d to use. Defaults to the module-level
            `gamma`, looked up at call time.

    Returns:
        A NumPy array with the same shape as `r`. Floating-point inputs keep
        their dtype; integer and object inputs are returned as float64 so the
        fractional part of the discounted sums is not truncated.
    """
    if discount_factor is None:
        discount_factor = gamma

    r = np.asarray(r)
    # np.zeros_like(r) would inherit an integer dtype and silently truncate
    # every discounted value, so choose a floating dtype explicitly.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros(r.shape, dtype=out_dtype)

    running_add = 0.0
    # Walk backwards so each step reuses the discounted sum of everything after it.
    for t in reversed(range(r.size)):
        running_add = running_add * discount_factor + r[t]
        discounted_r[t] = running_add
    return discounted_r

# Setting up our Neural Network agent
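This time we will be using a Convolutional Neural Network that takes observations, passes them through three convolutional layers, and then produces four softmax probability distributions, one per group of actions: a 17-way head for place/craft/nearbyCraft/nearbySmelt, a 5-way head for camera movement, a 4-way head for forward/jump, and a 2-way head for attack. To learn more about this kind of network, see [Convolutional Neural Networks for Visual Recognition Course](http://cs231n.stanford.edu/).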

In [3]:
# NOTE(review): H is not referenced anywhere in this cell; kept in case another
# cell reads it.
H = 1024

tf.reset_default_graph()

# Network input: a batch of 64x64 observations with 21 channels. The training
# cell builds each observation by concatenating the 'pov' frame with 18
# constant-valued inventory planes.
state = tf.placeholder(shape=[None,64,64,21], dtype=tf.float32)

# Three bias-free ReLU convolutions with VALID padding. The layers are created
# in a fixed order on purpose: slim derives variable names from creation order,
# and the training cell restores these variables from a checkpoint.
conv1 = slim.conv2d(
            inputs=state,num_outputs=32,kernel_size=[8,8],stride=[4,4],padding='VALID',
                    biases_initializer=None,activation_fn=tf.nn.relu)
conv2 = slim.conv2d(
            inputs=conv1,num_outputs=64,kernel_size=[4,4],stride=[2,2],padding='VALID',
                    biases_initializer=None,activation_fn=tf.nn.relu)
conv3 = slim.conv2d(
            inputs=conv2,num_outputs=64,kernel_size=[3,3],stride=[1,1],padding='VALID',
                    biases_initializer=None,activation_fn=tf.nn.relu)
convFlat = slim.flatten(conv3)

# Four independent linear heads on the shared features. Each yields logits
# (score_N) and the matching softmax distribution (probability_N) with
# 17, 5, 4 and 2 choices respectively.
score_1 = slim.fully_connected(convFlat, 17, activation_fn=None, biases_initializer=None)
probability_1 = tf.nn.softmax(score_1)

score_2 = slim.fully_connected(convFlat, 5, activation_fn=None, biases_initializer=None)
probability_2 = tf.nn.softmax(score_2)

score_3 = slim.fully_connected(convFlat, 4, activation_fn=None, biases_initializer=None)
probability_3 = tf.nn.softmax(score_3)

score_4 = slim.fully_connected(convFlat, 2, activation_fn=None, biases_initializer=None)
probability_4 = tf.nn.softmax(score_4)

# Training inputs: one (discounted) reward and one chosen action index per head
# for every row of `state`.
reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
action1_holder = tf.placeholder(shape=[None],dtype=tf.int32)
action2_holder = tf.placeholder(shape=[None],dtype=tf.int32)
action3_holder = tf.placeholder(shape=[None],dtype=tf.int32)
action4_holder = tf.placeholder(shape=[None],dtype=tf.int32)

# Pick the entry of the chosen action in every row by indexing the flattened
# [batch * num_actions] tensor at row * num_actions + action. The action index
# must therefore be smaller than the head size, otherwise the lookup lands in
# another row or past the end of the tensor.
indexes_1 = tf.range(0, tf.shape(probability_1)[0]) * tf.shape(probability_1)[1] + action1_holder
responsible_outputs_1 = tf.gather(tf.reshape(probability_1, [-1]), indexes_1)

indexes_2 = tf.range(0, tf.shape(probability_2)[0]) * tf.shape(probability_2)[1] + action2_holder
responsible_outputs_2 = tf.gather(tf.reshape(probability_2, [-1]), indexes_2)

indexes_3 = tf.range(0, tf.shape(probability_3)[0]) * tf.shape(probability_3)[1] + action3_holder
responsible_outputs_3 = tf.gather(tf.reshape(probability_3, [-1]), indexes_3)

indexes_4 = tf.range(0, tf.shape(probability_4)[0]) * tf.shape(probability_4)[1] + action4_holder
responsible_outputs_4 = tf.gather(tf.reshape(probability_4, [-1]), indexes_4)

# Log-probability of each chosen action, taken from log_softmax of the logits.
# This equals log(responsible_outputs_N) mathematically, but stays finite when
# a probability underflows to 0, where tf.log would give -inf and NaN gradients.
log_responsible_1 = tf.gather(tf.reshape(tf.nn.log_softmax(score_1), [-1]), indexes_1)
log_responsible_2 = tf.gather(tf.reshape(tf.nn.log_softmax(score_2), [-1]), indexes_2)
log_responsible_3 = tf.gather(tf.reshape(tf.nn.log_softmax(score_3), [-1]), indexes_3)
log_responsible_4 = tf.gather(tf.reshape(tf.nn.log_softmax(score_4), [-1]), indexes_4)

# Policy-gradient loss per head: -mean(log pi(a|s) * reward); heads are summed.
loss1 = -tf.reduce_mean(log_responsible_1 * reward_holder)
loss2 = -tf.reduce_mean(log_responsible_2 * reward_holder)
loss3 = -tf.reduce_mean(log_responsible_3 * reward_holder)
loss4 = -tf.reduce_mean(log_responsible_4 * reward_holder)
loss = loss1 + loss2 + loss3 + loss4

# One placeholder per trainable variable. The training cell accumulates
# gradients in NumPy across episodes and feeds the sums in through these.
tvars = tf.trainable_variables()
gradient_holders = []
for idx, var in enumerate(tvars):
    placeholder = tf.placeholder(tf.float32, name=str(idx) + '_holder')
    gradient_holders.append(placeholder)

# Gradients of the combined loss with respect to every trainable variable.
gradients = tf.gradients(loss, tvars)

# Applies externally supplied (accumulated) gradients rather than minimizing
# `loss` directly.
optimizer = tf.train.AdamOptimizer(learning_rate=0.0005)
update_batch = optimizer.apply_gradients(zip(gradient_holders,tvars))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# Train

In [None]:
import minerl
import gym
import os
import random
from env_wrappers import ContinuingTimeLimitMonitor

# Inventory entries appended to the frame as constant-valued planes. The order
# fixes the channel layout fed to the network, so it must not change.
INVENTORY_KEYS = ['coal', 'cobblestone', 'crafting_table', 'dirt', 'furnace', 'iron_axe',
                  'iron_ingot', 'iron_ore', 'iron_pickaxe', 'log', 'planks', 'stick', 'stone',
                  'stone_axe', 'stone_pickaxe', 'torch', 'wooden_axe', 'wooden_pickaxe']

# Head 1 index -> (place, craft, nearbyCraft, nearbySmelt, force_attack_off).
# Entry 10 previously set place to the string 'none' while every other entry
# uses the integer 0; it now uses 0 as well.
# NOTE(review): assumes this MineRL version takes integer enum codes - confirm.
CRAFT_ACTIONS = [
    (1, 0, 0, 0, False), (2, 0, 0, 0, False), (3, 0, 0, 0, False),
    (4, 0, 0, 0, False), (5, 0, 0, 0, False),
    (0, 1, 0, 0, False), (0, 2, 0, 0, False), (0, 3, 0, 0, False), (0, 4, 0, 0, False),
    (0, 0, 1, 0, False), (0, 0, 2, 0, False), (0, 0, 3, 0, False), (0, 0, 4, 0, False),
    (0, 0, 5, 0, False), (0, 0, 6, 0, False), (0, 0, 7, 0, False),
    (0, 0, 0, 1, False),
    # Entries 17 and 18 also switch attack off. With the 17-way head defined in
    # the network cell they cannot currently be selected.
    (0, 0, 0, 2, True), (0, 0, 0, 0, True),
]
# Head 2 index -> (camera[0], camera[1]).
CAMERA_ACTIONS = [(0, -5), (0, 5), (5, 0), (-5, 0), (0, 0)]
# Head 3 index -> (forward, jump).
MOVE_ACTIONS = [(0, 0), (1, 0), (0, 1), (1, 1)]

NUM_EPISODES = 5000
MAX_STEPS = 3000  # hard cap on steps per episode


def _build_state(observation):
    """Turn an environment observation into the network input tensor.

    The 'pov' frame is scaled to [-0.5, 0.5] and one constant plane per entry
    of INVENTORY_KEYS, filled with that item's inventory count, is appended
    along the channel axis.
    """
    pov = observation['pov'].astype(np.float32) / 255.0 - 0.5
    inventory = observation['inventory']
    plane_shape = list(pov.shape[:-1]) + [1]
    planes = [np.ones(shape=plane_shape, dtype=np.float32) * inventory[key]
              for key in INVENTORY_KEYS]
    return np.concatenate([pov] + planes, axis=-1)


def _select_action(probabilities, epsilon):
    """Epsilon-greedy choice over one head's probability vector.

    With probability `epsilon` a uniformly random index valid for this head is
    returned, otherwise the arg-max. The random branch is bounded by the head
    size: random.randint is inclusive on both ends, so the earlier hard-coded
    bounds could return indices the network has no output for.
    """
    if np.random.rand() >= epsilon:
        return int(np.argmax(probabilities))
    return random.randrange(probabilities.shape[-1])


def _history_column(history, position, dtype):
    """Collect one field of every recorded step into a typed 1D array."""
    return np.array([step[position] for step in history], dtype=dtype)


env = gym.make(env_name)
#env = ContinuingTimeLimitMonitor(env, os.path.join('/home/kimbring2/MineRL/', 'monitor'), mode='evaluation', 
#                                 video_callable=lambda episode_id: True)

e = 0.01               # exploration rate, decayed after every episode
update_frequency = 5   # apply the accumulated gradients every this many episodes

# Per-episode statistics, kept across episodes (they used to be reset at the
# start of each episode and therefore never held more than one value).
total_reward = []
total_length = []

init = tf.global_variables_initializer()
try:
    with tf.Session() as sess:
        sess.run(init)
        saver = tf.train.Saver(max_to_keep=5)

        print('Loading Model...') 
        path = '/home/kimbring2/MineRL_Git/model/' + env_name
        ckpt = tf.train.get_checkpoint_state(path)
        if ckpt is None or not ckpt.model_checkpoint_path:
            # Fail with a clear message instead of an AttributeError on None.
            raise IOError('No checkpoint found in ' + path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Zero-initialised accumulator with one array per trainable variable.
        gradBuffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]

        for i in range(NUM_EPISODES):
            env.init()
            obs = env.reset()
            state_channel = _build_state(obs)
            net_reward = 0
            # One entry per step:
            # [state, action1, action2, action3, action4, reward, next_state]
            ep_history = []
            for j in range(MAX_STEPS):
                action = env.action_space.noop()
                (action1_probability, action2_probability,
                 action3_probability, action4_probability) = sess.run(
                    [probability_1, probability_2, probability_3, probability_4],
                    feed_dict={state: [state_channel]})

                # Head 4: attack on/off.
                action4_index = _select_action(action4_probability, e)
                action['attack'] = 0 if action4_index == 0 else 1

                # Head 1: place / craft / nearbyCraft / nearbySmelt.
                action1_index = _select_action(action1_probability, e)
                if action1_index < len(CRAFT_ACTIONS):
                    place, craft, nearby_craft, nearby_smelt, attack_off = CRAFT_ACTIONS[action1_index]
                    if attack_off:
                        action['attack'] = 0
                    action['place'] = place
                    action['craft'] = craft
                    action['nearbyCraft'] = nearby_craft
                    action['nearbySmelt'] = nearby_smelt

                # Head 2: camera movement.
                action2_index = _select_action(action2_probability, e)
                if action2_index < len(CAMERA_ACTIONS):
                    action['camera'][0], action['camera'][1] = CAMERA_ACTIONS[action2_index]

                # Head 3: forward / jump.
                action3_index = _select_action(action3_probability, e)
                if action3_index < len(MOVE_ACTIONS):
                    action['forward'], action['jump'] = MOVE_ACTIONS[action3_index]

                # These movement keys are never used by this agent.
                action['back'] = 0
                action['left'] = 0
                action['right'] = 0

                obs1, reward, done, info = env.step(action)
                inventory1 = obs1['inventory']
                # Built from the NEW observation; the old frame used to be
                # reused here by mistake.
                state_channel1 = _build_state(obs1)

                ep_history.append([state_channel, action1_index, action2_index, action3_index,
                                   action4_index, reward, state_channel1])
                obs = obs1
                # The next step's input is exactly the tensor just built.
                state_channel = state_channel1
                net_reward += reward

                if done or j == MAX_STEPS - 1:
                    # Only the reported episode total is changed here; the
                    # per-step rewards used for the gradient are left as is.
                    if (net_reward == 0):
                        net_reward = -5

                    print("Total reward: ", net_reward)
                    print("j: ", j)
                    print("Inventory1: ", inventory1)

                    #Update the network.
                    # Columns are extracted one by one instead of calling
                    # np.array on the mixed list, which is ragged (arrays and
                    # scalars) and rejected by recent NumPy versions.
                    discounted = discount_rewards(_history_column(ep_history, 5, np.float64))
                    feed_dict = {reward_holder: discounted,
                                 action1_holder: _history_column(ep_history, 1, np.int32),
                                 action2_holder: _history_column(ep_history, 2, np.int32),
                                 action3_holder: _history_column(ep_history, 3, np.int32),
                                 action4_holder: _history_column(ep_history, 4, np.int32),
                                 state: np.stack([step[0] for step in ep_history], 0)}
                    grads = sess.run(gradients, feed_dict=feed_dict)
                    for idx, grad in enumerate(grads):
                        gradBuffer[idx] += grad

                    if i % update_frequency == 0 and i != 0:
                        feed_dict = dict(zip(gradient_holders, gradBuffer))
                        _ = sess.run(update_batch, feed_dict=feed_dict)
                        for ix, grad in enumerate(gradBuffer):
                            gradBuffer[ix] = grad * 0

                    #model_path = '/home/kimbring2/MineRL_Git/model/' + env_name
                    #saver.save(sess, model_path + '/model-' + str(i) + '.cptk')
                    #print("Saved Model")
                    #print("")

                    total_reward.append(net_reward)
                    total_length.append(j)
                    e = e * 0.999
                    break
finally:
    # Release the environment even if training is interrupted or fails.
    env.close()

Loading Model...
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /home/kimbring2/MineRL/model/MineRLObtainIronPickaxe-v0/model-4900.cptk
Total reward:  -5
j:  694
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -5
j:  2999
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 2, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -5
j:  2999
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron

Total reward:  14.0
j:  2999
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 1, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -5
j:  2999
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 0, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -5
j:  2999
Inventory1:  {'coal': 0, 'cobblestone': 0, 'crafting_table': 0, 'dirt': 1, 'furnace': 0, 'iron_axe': 0, 'iron_ingot': 0, 'iron_ore': 0, 'iron_pickaxe': 0, 'log': 0, 'planks': 0, 'stick': 0, 'stone': 0, 'stone_axe': 0, 'stone_pickaxe': 0, 'torch': 0, 'wooden_axe': 0, 'wooden_pickaxe': 0}
Total reward:  -5
j:  2999
Inventory1:  {'coal': 0, '