# Simple Reinforcement Learning in MineRL
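This tutorial is a simple example of how to build a reinforcement-learning agent for MineRL. It was originally written for the MineRLNavigateDense-v0 environment; the code below is configured for MineRLObtainIronPickaxeDense-v0 (the MineRLNavigateDense-v0 setting is left commented out). It builds on the network trained in the previous notebook, [MineRL Imitation Learning](https://github.com/kimbring2/MineRL/blob/master/MineRL_IL_Recurrent.ipynb).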

In [1]:
from __future__ import division

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
%matplotlib inline
import matplotlib.pyplot as plt
import math

# Python 2/3 compatibility: `xrange` only exists on Python 2, so fall back to
# `range` when the name is missing. Only NameError is caught so that unrelated
# failures (e.g. KeyboardInterrupt) are not silently swallowed.
try:
    xrange = xrange
except NameError:
    xrange = range

# Name of the gym/MineRL environment to train on. The navigation task is kept
# as a commented-out alternative.
#env_name = 'MineRLNavigateDense-v0'
env_name = 'MineRLObtainIronPickaxeDense-v0'
# Location of the MineRL dataset. NOTE(review): this is a machine-specific
# absolute path -- adjust it before running on another machine.
data_path = '/media/kimbring2/6224AA7924AA5039/minerl_data'

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Discount factor applied per time step when accumulating future rewards.
gamma = 0.99

def discount_rewards(r):
    """Compute the discounted return for every step of an episode.

    For each index ``t`` the result is
    ``r[t] + gamma * r[t+1] + gamma**2 * r[t+2] + ...`` using the
    module-level ``gamma``.

    Args:
        r: 1D sequence of per-step rewards (NumPy array or anything
            ``np.asarray`` accepts). Integer and object arrays are allowed.

    Returns:
        1D NumPy array with the same length as ``r``. The dtype is the dtype
        of ``r`` when that is already a floating type, otherwise float64, so
        that integer rewards are not truncated when the discounted sums are
        stored.
    """
    r = np.asarray(r)
    # np.zeros_like(r) would inherit an integer dtype from integer rewards and
    # silently truncate every discounted value; force a floating buffer.
    out_dtype = r.dtype if np.issubdtype(r.dtype, np.floating) else np.float64
    discounted_r = np.zeros(r.shape, dtype=out_dtype)
    running_add = 0.0
    # Walk backwards so each step reuses the already-discounted tail.
    for t in reversed(range(r.size)):
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r

# Setting up our Neural Network agent
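This time we will be using a Convolutional Neural Network. It takes two observations — the 64×64 RGB point-of-view image and an 18-element inventory vector — passes the image through three small convolutional layers, maps the image features and the inventory through 256-unit fully connected layers, and concatenates the results. Two such encoders feed two softmax heads: one produces probabilities over 21 attack/place/craft actions, the other over 20 combinations of camera movement, moving forward, and jumping. To learn more about this kind of network, see the [Convolutional Neural Networks for Visual Recognition Course](http://cs231n.stanford.edu/).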

In [3]:
# Input width of each policy head. It has to match the width of the
# concatenated encoder output below (256 image features + 256 inventory
# features).
H_1 = 512
H_2 = 512

tf.reset_default_graph()

# ---- Encoder feeding policy head 1 ------------------------------------------
# Inputs: a batch of 64x64x3 images and a batch of 18-element item vectors.
state_pov_1 = tf.placeholder(shape=[None,64,64,3], dtype=tf.float32)
state_item_1 = tf.placeholder(shape=[None,18], dtype=tf.float32)
# Three bias-free 2x2 convolutions (stride 1, VALID padding, ReLU), 3 channels each.
conv1_pov_1 = slim.conv2d(inputs=state_pov_1, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv2_pov_1 = slim.conv2d(inputs=conv1_pov_1, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv3_pov_1 = slim.conv2d(inputs=conv2_pov_1, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
convFlat_pov_1 = slim.flatten(conv3_pov_1)
FC_pov_1 = slim.fully_connected(convFlat_pov_1, 256, scope='fc_pov_1')
FC_item_1 = slim.fully_connected(state_item_1, 256, scope='fc_item_1')

#print("FC_pov_1: " + str(FC_pov_1))
#print("FC_item_1: " + str(FC_item_1))

# Image and item features side by side: [batch, 512].
convFlat_1 = tf.concat([FC_pov_1, FC_item_1], 1)
#print("convFlat_1: " + str(convFlat_1))

# ---- Encoder feeding policy head 2 ------------------------------------------
# Same layout as the first encoder but with its own placeholders and its own
# (separately created, default-scoped) variables.
state_pov_2 = tf.placeholder(shape=[None,64,64,3], dtype=tf.float32)
state_item_2 = tf.placeholder(shape=[None,18], dtype=tf.float32)
conv1_pov_2 = slim.conv2d(inputs=state_pov_2, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv2_pov_2 = slim.conv2d(inputs=conv1_pov_2, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv3_pov_2 = slim.conv2d(inputs=conv2_pov_2, num_outputs=3, kernel_size=[2,2], stride=[1,1], padding='VALID', 
                    biases_initializer=None, activation_fn=tf.nn.relu)
convFlat_pov_2 = slim.flatten(conv3_pov_2)
FC_pov_2 = slim.fully_connected(convFlat_pov_2, 256)
FC_item_2 = slim.fully_connected(state_item_2, 256)

convFlat_2 = tf.concat([FC_pov_2, FC_item_2], 1)

# ---- Policy heads -----------------------------------------------------------
# Head 1: linear layer + softmax over 21 discrete actions.
W_1 = tf.get_variable("W_1", shape=[H_1,21],
            initializer=tf.contrib.layers.xavier_initializer())
score_1 = tf.matmul(convFlat_1, W_1)
probability_1 = tf.nn.softmax(score_1)
# Log-probabilities computed directly from the logits; used by the loss so
# that a probability underflowing to 0 cannot produce log(0) = -inf / NaN.
log_probability_1 = tf.nn.log_softmax(score_1)

# Head 2: linear layer + softmax over 20 discrete actions.
W_2 = tf.get_variable("W_2", shape=[H_2,20],
            initializer=tf.contrib.layers.xavier_initializer())
score_2 = tf.matmul(convFlat_2, W_2)
probability_2 = tf.nn.softmax(score_2)
log_probability_2 = tf.nn.log_softmax(score_2)

# ---- Policy-gradient loss ---------------------------------------------------
# Per-sample weight for the log-probability and the action index taken in each head.
reward_holder = tf.placeholder(shape=[None],dtype=tf.float32)
action1_holder = tf.placeholder(shape=[None],dtype=tf.int32)
action2_holder = tf.placeholder(shape=[None],dtype=tf.int32)
        
# Flat index of (row, chosen action) into the row-major flattened [batch, n_actions]
# matrix: row * n_actions + action. Action indices must lie in [0, n_actions).
indexes_1 = tf.range(0, tf.shape(probability_1)[0]) * tf.shape(probability_1)[1] + action1_holder
responsible_outputs_1 = tf.gather(tf.reshape(probability_1, [-1]), indexes_1)
responsible_log_outputs_1 = tf.gather(tf.reshape(log_probability_1, [-1]), indexes_1)

indexes_2 = tf.range(0, tf.shape(probability_2)[0]) * tf.shape(probability_2)[1] + action2_holder
responsible_outputs_2 = tf.gather(tf.reshape(probability_2, [-1]), indexes_2)
responsible_log_outputs_2 = tf.gather(tf.reshape(log_probability_2, [-1]), indexes_2)

# Mathematically identical to -mean(log(responsible_outputs) * reward), but
# evaluated through log_softmax for numerical stability.
loss1 = -tf.reduce_mean(responsible_log_outputs_1 * reward_holder)
loss2 = -tf.reduce_mean(responsible_log_outputs_2 * reward_holder)
loss = loss1 + loss2
    
# ---- Gradient plumbing ------------------------------------------------------
# One placeholder per trainable variable: gradients are evaluated with
# `gradients`, handled outside the graph, and fed back through these
# placeholders to `update_batch`.
tvars = tf.trainable_variables()
gradient_holders = []
for idx, var in enumerate(tvars):
    placeholder = tf.placeholder(tf.float32, name=str(idx) + '_holder')
    gradient_holders.append(placeholder)
        
gradients = tf.gradients(loss, tvars)
        
optimizer = tf.train.AdamOptimizer(learning_rate=0.00001)
update_batch = optimizer.apply_gradients(zip(gradient_holders,tvars))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


# Train

In [None]:
import minerl
import gym
import os
import random
from env_wrappers import ContinuingTimeLimitMonitor

env = gym.make(env_name)
#env = ContinuingTimeLimitMonitor(env, os.path.join('/home/kimbring2/MineRL/', 'monitor'), mode='evaluation', 
#                                 video_callable=lambda episode_id: True)

# Probability of picking a uniformly random action instead of the greedy one.
# It is multiplied by 0.99 at the end of every episode.
e = 0.1
# Only referenced by the disabled batched-update condition near the end of the loop.
update_frequency = 100

# Directory prefix for checkpoints. NOTE(review): machine-specific absolute path.
model_path = '/home/kimbring2/MineRL_Git/model/' + env_name

# Inventory entries, in the order in which they are packed into the
# 18-element item vector fed to the network.
_INVENTORY_KEYS = ('coal', 'cobblestone', 'crafting_table', 'dirt', 'furnace', 'iron_axe',
                   'iron_ingot', 'iron_ore', 'iron_pickaxe', 'log', 'planks', 'stick',
                   'stone', 'stone_axe', 'stone_pickaxe', 'torch', 'wooden_axe',
                   'wooden_pickaxe')


def _item_action(**overrides):
    """Return the five item-related action fields, all idle except `overrides`."""
    fields = {'attack': 0, 'place': 'none', 'craft': 'none',
              'nearbyCraft': 'none', 'nearbySmelt': 'none'}
    fields.update(overrides)
    return fields


# Head-1 action table: index -> item-related action fields (21 entries).
#   0      attack
#   1-6    place a block
#   7-10   craft
#   11-17  nearbyCraft
#   18-19  nearbySmelt
#   20     none of the above
_ITEM_ACTIONS = (
    [_item_action(attack=1)]
    + [_item_action(place=item) for item in
       ('dirt', 'stone', 'cobblestone', 'crafting_table', 'furnace', 'torch')]
    + [_item_action(craft=item) for item in
       ('torch', 'stick', 'planks', 'crafting_table')]
    + [_item_action(nearbyCraft=item) for item in
       ('wooden_axe', 'wooden_pickaxe', 'stone_axe', 'stone_pickaxe',
        'iron_axe', 'iron_pickaxe', 'furnace')]
    + [_item_action(nearbySmelt=item) for item in ('iron_ingot', 'coal')]
    + [_item_action()]
)

# Head-2 action table: index -> (camera[0], camera[1], forward, jump) (20 entries).
# Five camera settings, each combined with the four forward/jump combinations.
_MOVE_ACTIONS = [(cam0, cam1, forward, jump)
                 for cam0, cam1 in ((0, -5), (0, 5), (5, 0), (-5, 0), (0, 0))
                 for forward in (0, 1)
                 for jump in (0, 1)]


def _observation_to_state(observation):
    """Turn an environment observation into the (image, item vector) network inputs.

    The 'pov' entry is cast to float32, divided by 255 and shifted by -0.5;
    the 'inventory' entries are packed in `_INVENTORY_KEYS` order.
    """
    pov = observation['pov'].astype(np.float32) / 255.0 - 0.5
    inventory = observation['inventory']
    items = np.array([inventory[name] for name in _INVENTORY_KEYS])
    return pov, items


def _select_action_index(probabilities, epsilon):
    """Epsilon-greedy choice over the outputs of one softmax head.

    With probability `epsilon` a uniformly random *valid* index is returned,
    otherwise the argmax. `random.randrange(n)` is used because
    `random.randint(0, n)` includes `n`, which is one past the last output
    and would corrupt the flat gather index used by the loss.
    """
    if np.random.rand() >= epsilon:
        return np.argmax(probabilities)
    return random.randrange(np.shape(probabilities)[-1])


init = tf.global_variables_initializer()
with tf.Session() as sess:
# Launch the graph
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=5)
    
    #print('Loading Model...') 
    #path = '/home/kimbring2/MineRL_Git/model/' + env_name
    #ckpt = tf.train.get_checkpoint_state(path)
    #saver.restore(sess, ckpt.model_checkpoint_path)
    
    # Per-episode statistics, kept across episodes (they used to be reset at
    # the start of every episode and therefore never accumulated anything).
    total_reward = []
    total_length = []
    
    for i in range(5000):
        # Zero-filled accumulator with one entry per trainable variable.
        gradBuffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]
    
        env.init()
        obs = env.reset()
        net_reward = 0
        ep_history = []
        while True:
            state_pov, state_item = _observation_to_state(obs)
            
            # Both heads are evaluated on the current observation in one run.
            action1_probability, action2_probability = sess.run(
                [probability_1, probability_2],
                feed_dict={state_pov_1:[state_pov],
                           state_item_1:[state_item],
                           state_pov_2:[state_pov],
                           state_item_2:[state_item]})
            
            action1_index = _select_action_index(action1_probability, e)
            # Scripted override: keep attacking until 5 logs are in the inventory.
            if obs['inventory']['log'] < 5:
                action1_index = 0
            
            action2_index = _select_action_index(action2_probability, e)
            
            action = env.action_space.noop()
            for field, value in _ITEM_ACTIONS[action1_index].items():
                action[field] = value
            
            cam0, cam1, forward, jump = _MOVE_ACTIONS[action2_index]
            action['camera'][0] = cam0
            action['camera'][1] = cam1
            action['forward'] = forward
            action['jump'] = jump
        
            #if (planks < 5):
            #    action['forward'] = 1
        
            action['back'] = 0
            action['left'] = 0
            action['right'] = 0

            obs, reward, done, info = env.step(action)
            
            # One transition: the state the actions were chosen in, both
            # action indices, and the reward received for them.
            ep_history.append([state_pov, state_item, action1_index, action2_index, reward])
            net_reward += reward
        
            if not done:
                continue
            
            print("Total reward: ", net_reward)
            print("e: ", e)
            
            #Update the network.
            pov_batch, item_batch, action1_batch, action2_batch, reward_batch = zip(*ep_history)
            pov_batch = np.stack(pov_batch, 0)
            item_batch = np.stack(item_batch, 0)
            discounted = discount_rewards(np.array(reward_batch, dtype=np.float64))
        
            # Head 2 is fed the same (current) states as head 1: its actions
            # were sampled from the current observation, so the gradient of
            # log pi(a|s) has to be evaluated on that observation, not on the
            # one that followed the action.
            feed_dict={reward_holder:discounted,
                       action1_holder:np.array(action1_batch, dtype=np.int32),
                       action2_holder:np.array(action2_batch, dtype=np.int32),
                       state_pov_1:pov_batch,
                       state_item_1:item_batch,
                       state_pov_2:pov_batch,
                       state_item_2:item_batch
                      }
            grads = sess.run(gradients, feed_dict=feed_dict)
            for idx, grad in enumerate(grads):
                gradBuffer[idx] += grad

            #if i % update_frequency == 0 and i != 0:
            feed_dict = dict(zip(gradient_holders,gradBuffer))
            _ = sess.run(update_batch, feed_dict=feed_dict)
            for ix, grad in enumerate(gradBuffer):
                gradBuffer[ix] = np.zeros_like(grad)
            
            saver.save(sess, model_path + '/model-' + str(i) + '.cptk')
            print("Saved Model")
            print("")
            
            total_reward.append(net_reward)
            #total_length.append(j)
            e = e * 0.99
            break

Total reward:  0.0
e:  0.1
Saved Model

Total reward:  0.0
e:  0.099
Saved Model

