# Simple Reinforcement Learning in MineRL
This tutorial contains a simple example of how to build a Reinforcement Learning agent for the MineRL environments MineRLNavigateDense-v0 and MineRLTreechop-v0 (the environment is selected with `env_name` in the first code cell). Training starts from the network weights learned in the previous notebook, [MineRL Imitation Learning](https://github.com/kimbring2/MineRL/blob/master/MineRL_IL_Recurrent.ipynb).

In [2]:
from __future__ import division

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
%matplotlib inline
import matplotlib.pyplot as plt
import math

# Python 2/3 compatibility shim: Python 3 has no built-in `xrange`, so fall
# back to `range`. Only the missing-name case is caught so that unrelated
# errors are not silently swallowed.
try:
    xrange = xrange
except NameError:
    xrange = range

# Environment to train on; the network shape and the action set in the cells
# below are both selected from this value.
#env_name = 'MineRLNavigateDense-v0'
env_name = 'MineRLTreechop-v0'
# Location of the MineRL dataset on the author's machine.
# NOTE(review): not referenced by the cells visible here - confirm it is still needed.
data_path = '/media/kimbring2/6224AA7924AA5039/minerl_data'

In [3]:
# Default discount factor applied to future rewards.
gamma = 0.99

def discount_rewards(r, discount=None):
    """Compute the discounted return for every step of one episode.

    Args:
        r: 1D sequence (list or array) of per-step rewards.
        discount: discount factor in use; when None the module-level
            `gamma` is used.

    Returns:
        A float64 array with the same length as `r`, where element t is
        r[t] + discount * r[t+1] + discount**2 * r[t+2] + ...
    """
    if discount is None:
        discount = gamma
    # Always work in floating point: with an integer input the original
    # `np.zeros_like(r)` buffer silently truncated the discounted values.
    rewards = np.asarray(r, dtype=np.float64)
    discounted_r = np.zeros(rewards.shape, dtype=np.float64)
    running_add = 0.0
    # Walk backwards so each step reuses the return of the step after it.
    for t in reversed(range(rewards.size)):
        running_add = running_add * discount + rewards[t]
        discounted_r[t] = running_add
    return discounted_r

# Setting up our Neural Network agent
This time we will be using a Convolutional Neural Network that takes image observations, passes them through three convolutional layers followed by a single fully connected layer, and then produces a probability for each discrete action (a combination of forward, jump and camera movement). To learn more about this kind of network, see the [Convolutional Neural Networks for Visual Recognition Course](http://cs231n.stanford.edu/).

In [4]:
# Length of the flattened conv3 output. With a 64x64 input and VALID padding
# the three conv layers produce 15x15, 6x6 and 4x4 feature maps, and
# 4 * 4 * 64 channels = 1024.
H = 1024

# Input channels and number of discrete actions for each supported
# environment. NavigateDense uses a fourth input channel that carries the
# compass angle next to the RGB image.
ENV_SPECS = {
    'MineRLTreechop-v0': {'channels': 3, 'actions': 6},
    'MineRLNavigateDense-v0': {'channels': 4, 'actions': 4},
}
if env_name not in ENV_SPECS:
    # Fail here with a clear message instead of a NameError on `state` below.
    raise ValueError("Unsupported env_name: " + str(env_name))
env_spec = ENV_SPECS[env_name]

tf.reset_default_graph()

# Batch of observations, shape [batch, 64, 64, channels].
state = tf.placeholder(shape=[None, 64, 64, env_spec['channels']], dtype=tf.float32)

# Convolutional feature extractor. Layers are created in this fixed order so
# the automatically generated variable names stay the same as before and
# existing checkpoints can still be restored.
conv1 = slim.conv2d(
    inputs=state, num_outputs=32, kernel_size=[8, 8], stride=[4, 4],
    padding='VALID', biases_initializer=None, activation_fn=tf.nn.relu)
conv2 = slim.conv2d(
    inputs=conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2],
    padding='VALID', biases_initializer=None, activation_fn=tf.nn.relu)
conv3 = slim.conv2d(
    inputs=conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1],
    padding='VALID', biases_initializer=None, activation_fn=tf.nn.relu)

convFlat = slim.flatten(conv3)

# Linear policy head: one logit per discrete action, turned into
# probabilities with a softmax.
W = tf.get_variable("W", shape=[H, env_spec['actions']],
                    initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(convFlat, W)
probability = tf.nn.softmax(score)

# Per-step discounted returns and the index of the action that was taken.
reward_holder = tf.placeholder(shape=[None], dtype=tf.float32)
action_holder = tf.placeholder(shape=[None], dtype=tf.int32)

# Pick, for every row of the batch, the probability of the action taken by
# indexing into the flattened [batch * actions] probability vector.
indexes = tf.range(0, tf.shape(probability)[0]) * tf.shape(probability)[1] + action_holder
responsible_outputs = tf.gather(tf.reshape(probability, [-1]), indexes)

# Policy-gradient loss. The probability is clipped away from zero so that a
# saturated softmax cannot produce log(0) and poison the weights with NaNs.
loss = -tf.reduce_mean(
    tf.log(tf.clip_by_value(responsible_outputs, 1e-10, 1.0)) * reward_holder)

# Gradients are computed in one run call and applied in another, so each
# trainable variable gets a placeholder through which its gradient is fed back.
tvars = tf.trainable_variables()
gradient_holders = []
for idx, var in enumerate(tvars):
    placeholder = tf.placeholder(tf.float32, name=str(idx) + '_holder')
    gradient_holders.append(placeholder)

gradients = tf.gradients(loss, tvars)

optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
update_batch = optimizer.apply_gradients(zip(gradient_holders, tvars))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use keras.layers.flatten instead.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [4]:
# Reference table of the discrete action labels for each environment.
# The labels listed here are 1-based; the training loop below branches on a
# 0-based `action_index` that follows the same order (label - 1).
'''
    (noop) : 'MineRLNavigateDense-v0'
    {'jump': 0, 'camera': [0, 0]} = 3
    {'jump': 1, 'camera': [0, 0]} = 4
    {'jump': 0, 'camera': [0, -10]} = 1
    {'jump': 0, 'camera': [0, 10]} = 2
            
    (noop) : 'MineRLTreechop-v0'
    {'forward': 0, 'jump': 0, 'camera': [0, 0]} = 5
    {'forward': 1, 'jump': 1, 'camera': [0, 0]} = 6
    {'forward': 1, 'jump': 0, 'camera': [0, -10]} = 1
    {'forward': 1, 'jump': 0, 'camera': [0, 10]} = 2
    {'forward': 1, 'jump': 0, 'camera': [-10, 0]} = 3
    {'forward': 1, 'jump': 0, 'camera': [10, 0]} = 4
''' 

"\n    (noop) : 'MineRLNavigateDense-v0'\n    {'jump': 0, 'camera': [0, 0]} = 3\n    {'jump': 1, 'camera': [0, 0]} = 4\n    {'jump': 0, 'camera': [0, -10]} = 1\n    {'jump': 0, 'camera': [0, 10]} = 2\n            \n    (noop) : 'MineRLTreechop-v0'\n    {'forward': 0, 'jump': 0, 'camera': [0, 0]} = 5\n    {'forward': 1, 'jump': 1, 'camera': [0, 0]} = 6\n    {'forward': 1, 'jump': 0, 'camera': [0, -10]} = 1\n    {'forward': 1, 'jump': 0, 'camera': [0, 10]} = 2\n    {'forward': 1, 'jump': 0, 'camera': [-10, 0]} = 3\n    {'forward': 1, 'jump': 0, 'camera': [10, 0]} = 4\n"

# Train

In [None]:
import minerl
import gym
import os
import random
from env_wrappers import ContinuingTimeLimitMonitor

env = gym.make(env_name)
# Optional: wrap the environment to record evaluation videos.
#env = ContinuingTimeLimitMonitor(env, os.path.join('/home/kimbring2/MineRL/', 'monitor'), mode='evaluation',
#                                 video_callable=lambda episode_id: True)

# Discrete action set per environment. The list position is the policy's
# 0-based output index; each entry lists the keys overridden on a no-op action.
ACTION_TABLE = {
    'MineRLNavigateDense-v0': [
        {'camera': [0, -10], 'jump': 0, 'forward': 1, 'sprint': 1},
        {'camera': [0, 10], 'jump': 0, 'forward': 1, 'sprint': 1},
        {'camera': [0, 0], 'jump': 0, 'forward': 1, 'sprint': 1},
        {'camera': [0, 0], 'jump': 1, 'forward': 1, 'sprint': 1},
    ],
    'MineRLTreechop-v0': [
        {'camera': [0, -10], 'jump': 0, 'forward': 1, 'attack': 1, 'sprint': 0},
        {'camera': [0, 10], 'jump': 0, 'forward': 1, 'attack': 1, 'sprint': 0},
        {'camera': [-10, 0], 'jump': 0, 'forward': 1, 'attack': 1, 'sprint': 0},
        {'camera': [10, 0], 'jump': 0, 'forward': 1, 'attack': 1, 'sprint': 0},
        {'camera': [0, 0], 'jump': 0, 'forward': 0, 'attack': 1, 'sprint': 0},
        {'camera': [0, 0], 'jump': 1, 'forward': 1, 'attack': 1, 'sprint': 0},
    ],
}


def _encode_state(obs):
    """Turn a raw observation dict into the network input for `env_name`."""
    # Scale the image from [0, 255] to [-0.5, 0.5].
    pov = obs['pov'].astype(np.float32) / 255.0 - 0.5
    if env_name == 'MineRLTreechop-v0':
        return pov
    if env_name == 'MineRLNavigateDense-v0':
        # Broadcast the compass angle into an extra image channel, divided by 180.
        compass_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32) * obs['compassAngle']
        compass_channel /= 180.0
        return np.concatenate([pov, compass_channel], axis=-1).astype(np.float32)
    raise ValueError("Unsupported env_name: " + str(env_name))


def _make_action(action_index):
    """Build the environment action dict for a 0-based discrete action index."""
    action = env.action_space.noop()
    for key, value in ACTION_TABLE[env_name][action_index].items():
        # Copy list values so the shared table is never mutated through the action.
        action[key] = list(value) if isinstance(value, list) else value
    action['back'] = 0
    action['left'] = 0
    action['right'] = 0
    return action


# Exploration rate (probability of a random action); decayed after every episode.
e = 0.05
# Episodes per update for the batched-update variant; currently the network is
# updated after every episode, so this value is not used.
update_frequency = 100

model_path = '/home/kimbring2/MineRL_Git/model/' + env_name

init = tf.global_variables_initializer()
with tf.Session() as sess:
    # Launch the graph
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=5)

    print('Loading Model...')
    ckpt = tf.train.get_checkpoint_state(model_path)
    if ckpt is None or not ckpt.model_checkpoint_path:
        # Report the missing checkpoint explicitly instead of failing with an
        # AttributeError on None.
        raise IOError('No checkpoint found in ' + model_path)
    saver.restore(sess, ckpt.model_checkpoint_path)

    # One zeroed accumulator per trainable variable, in the same order as `gradients`.
    gradBuffer = [np.zeros_like(value) for value in sess.run(tf.trainable_variables())]
    # Episode returns collected across the whole run.
    total_reward = []

    for i in range(5000):
        env.init()
        obs = env.reset()
        net_reward = 0
        states, actions, rewards = [], [], []

        done = False
        while not done:
            state_concat = _encode_state(obs)

            action_probability = sess.run(probability, feed_dict={state: [state_concat]})
            if np.random.rand() >= e:
                action_index = int(np.argmax(action_probability))
            else:
                # randrange excludes the upper bound, so the index is always a
                # valid column of `probability` (randint(0, n) could return n).
                action_index = random.randrange(action_probability.shape[-1])

            obs, reward, done, info = env.step(_make_action(action_index))

            # Store exactly the tensor that was fed to the network so the
            # update sees the same input (including the compass channel).
            states.append(state_concat)
            actions.append(action_index)
            rewards.append(reward)
            net_reward += reward

        print("Total reward: ", net_reward)
        print("e: ", e)

        # Update the network with the gradients of this episode.
        feed_dict = {reward_holder: discount_rewards(np.asarray(rewards, dtype=np.float32)),
                     action_holder: np.asarray(actions, dtype=np.int32),
                     state: np.stack(states, 0)}
        grads = sess.run(gradients, feed_dict=feed_dict)
        for idx, grad in enumerate(grads):
            gradBuffer[idx] += grad

        _ = sess.run(update_batch, feed_dict=dict(zip(gradient_holders, gradBuffer)))
        for buffered in gradBuffer:
            buffered.fill(0)

        saver.save(sess, model_path + '/model-' + str(i) + '.cptk')
        print("Saved Model")
        print("")

        total_reward.append(net_reward)
        e = e * 0.999

Loading Model...
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from /home/kimbring2/MineRL_Git/model/MineRLTreechop-v0/model-114300.cptk
Total reward:  12.0
e:  0.05
Saved Model

Total reward:  7.0
e:  0.04995
Saved Model

Total reward:  17.0
e:  0.04990005
Saved Model

Total reward:  21.0
e:  0.04985014995
Saved Model

Total reward:  7.0
e:  0.04980029980005
Saved Model

Total reward:  15.0
e:  0.04975049950024995
Instructions for updating:
Use standard file APIs to delete files with this prefix.
Saved Model

Total reward:  17.0
e:  0.0497007490007497
Saved Model

Total reward:  19.0
e:  0.04965104825174895
Saved Model

Total reward:  10.0
e:  0.049601397203497204
Saved Model

Total reward:  8.0
e:  0.04955179580629371
Saved Model

Total reward:  15.0
e:  0.04950224401048742
Saved Model

Total reward:  10.0
e:  0.04945274176647693
Saved Model

Total reward:  24.0
e:  0.04940328902471045
Saved Model

Total re