# Simple Imitation Learning in MineRL
This tutorial contains a simple example of how to build an imitation-learning-based agent that can solve the MineRLNavigateDense-v0 environment. For more information about that environment, see the [MineRL Env Docs](http://minerl.io/docs/environments/index.html#minerlnavigatedense-v0). The code below also supports MineRLTreechop-v0, which is the environment selected by default through `env_name` in the first code cell.

Parts of this tutorial are based on code by Arthur Juliani: [Policy Gradient](https://medium.com/@awjuliani/super-simple-reinforcement-learning-tutorial-part-2-ded33892c724) and [Actor Critic](https://github.com/awjuliani/DeepRL-Agents/blob/master/A3C-Doom.ipynb).

In [5]:
from __future__ import division

import numpy as np
import tensorflow as tf
import tensorflow.contrib.slim as slim
%matplotlib inline
import matplotlib.pyplot as plt
import math

# Python 2/3 compatibility shim: Python 3 has no ``xrange``, so looking the
# name up raises NameError and we alias it to ``range`` instead.
try:
    xrange = xrange
except NameError:
    xrange = range

# Environment used by every cell below. Switch to the commented value to run
# the same notebook on the navigation task.
#env_name = 'MineRLNavigateDense-v0'
env_name = 'MineRLTreechop-v0'
# Directory handed to minerl.data.make() as the location of the human dataset.
data_path = '/media/kimbring2/6224AA7924AA5039/minerl_data'

# Setting up our Neural Network agent
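This time we will be using a Convolutional Neural Network that takes observations, passes them through three convolutional layers, a single fully connected hidden layer and a Recurrent Neural Network (LSTM), and produces a probability distribution over a small set of discrete actions (combinations of jump, forward and camera movement).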

In [10]:
# Width shared by the fully connected layer, the LSTM state and the output weights.
H = 256
batch_size_value = 1

# Drop any graph built by a previous run of this cell before defining a new one.
tf.reset_default_graph()

# The two supported environments differ only in the number of input channels
# (Navigate adds a compass channel to the RGB frame) and in the number of
# discrete action classes the policy chooses between.
if env_name == 'MineRLTreechop-v0':
    state_channels, num_actions = 3, 6
elif env_name == 'MineRLNavigateDense-v0':
    state_channels, num_actions = 4, 4
else:
    # Fail here with a clear message instead of a NameError further down.
    raise ValueError("Unsupported env_name: " + str(env_name))

# Batch of 64x64 frames; the leading dimension is the number of frames fed at once.
state = tf.placeholder(shape=[None, 64, 64, state_channels], dtype=tf.float32)

# Convolutional feature extractor.
conv1 = slim.conv2d(inputs=state, num_outputs=32, kernel_size=[8, 8], stride=[4, 4], padding='VALID',
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv2 = slim.conv2d(inputs=conv1, num_outputs=64, kernel_size=[4, 4], stride=[2, 2], padding='VALID',
                    biases_initializer=None, activation_fn=tf.nn.relu)
conv3 = slim.conv2d(inputs=conv2, num_outputs=64, kernel_size=[3, 3], stride=[1, 1], padding='VALID',
                    biases_initializer=None, activation_fn=tf.nn.relu)

hidden = slim.fully_connected(slim.flatten(conv3), H, activation_fn=tf.nn.elu)

# Recurrent network for temporal dependencies
lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(H, state_is_tuple=True)
# Zero-valued initial LSTM state, to be fed through c_in / h_in.
c_init = np.zeros((1, lstm_cell.state_size.c), np.float32)
h_init = np.zeros((1, lstm_cell.state_size.h), np.float32)
state_init_self = [c_init, h_init]
c_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.c])
h_in = tf.placeholder(tf.float32, [1, lstm_cell.state_size.h])
state_in_self = (c_in, h_in)
# Add a leading axis of size 1 so all frames fed in `state` form one sequence
# for the RNN (time_major=False -> [batch=1, time, features]).
rnn_in = tf.expand_dims(hidden, [0])
# Sequence length = number of frames fed in `state`.
step_size = tf.shape(state)[:1]
state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
lstm_outputs, lstm_state = tf.nn.dynamic_rnn(lstm_cell, rnn_in, initial_state=state_in,
                                             sequence_length=step_size, time_major=False)
lstm_c, lstm_h = lstm_state
# Final LSTM state; feed it back through c_in / h_in to continue a sequence.
state_out_self = (lstm_c[:1, :], lstm_h[:1, :])
# One row of LSTM output per input frame.
rnn_out = tf.reshape(lstm_outputs, [-1, H])

# Linear read-out from the LSTM output to one logit per action class.
W = tf.get_variable("W", shape=[H, num_actions],
                    initializer=tf.contrib.layers.xavier_initializer())
score = tf.matmul(rnn_out, W)
probability = tf.nn.softmax(score)
# One-hot encoded human action for every frame in `state`.
real_action = tf.placeholder(shape=[None, num_actions], dtype=tf.int32)

# Behaviour-cloning objective: cross-entropy between the human action and the policy.
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=real_action,
                                                              logits=score))
tf.summary.scalar('loss', loss)
train_step = tf.train.AdamOptimizer(0.0001).minimize(loss)

merged = tf.summary.merge_all()

rnn_out: Tensor("Reshape:0", shape=(?, 256), dtype=float32)


In [11]:
# Reference table of the discrete action classes used for each environment.
# Each line maps a combination of MineRL action fields to a 1-based class
# number. The training cell encodes class k as a one-hot vector whose 1 sits at
# index k-1, and the test cell maps argmax index k-1 back to these fields.
# NOTE(review): the "(noop)" header presumably means that fields not listed
# keep their no-op value - confirm.
'''
    (noop) : 'MineRLNavigateDense-v0'
    {'jump': 0, 'camera': [0, 0]} = 3
    {'jump': 1, 'camera': [0, 0]} = 4
    {'jump': 0, 'camera': [0, -10]} = 1
    {'jump': 0, 'camera': [0, 10]} = 2
            
    (noop) : 'MineRLTreechop-v0'
    {'forward': 0, 'jump': 0, 'camera': [0, 0]} = 5
    {'forward': 1, 'jump': 1, 'camera': [0, 0]} = 6
    {'forward': 1, 'jump': 0, 'camera': [0, -10]} = 1
    {'forward': 1, 'jump': 0, 'camera': [0, 10]} = 2
    {'forward': 1, 'jump': 0, 'camera': [-10, 0]} = 3
    {'forward': 1, 'jump': 0, 'camera': [10, 0]} = 4
''' 

"\n    (noop) : 'MineRLNavigateDense-v0'\n    {'jump': 0, 'camera': [0, 0]} = 3\n    {'jump': 1, 'camera': [0, 0]} = 4\n    {'jump': 0, 'camera': [0, -10]} = 1\n    {'jump': 0, 'camera': [0, 10]} = 2\n            \n    (noop) : 'MineRLTreechop-v0'\n    {'forward': 0, 'jump': 0, 'camera': [0, 0]} = 5\n    {'forward': 1, 'jump': 1, 'camera': [0, 0]} = 6\n    {'forward': 1, 'jump': 0, 'camera': [0, -10]} = 1\n    {'forward': 1, 'jump': 0, 'camera': [0, 10]} = 2\n    {'forward': 1, 'jump': 0, 'camera': [-10, 0]} = 3\n    {'forward': 1, 'jump': 0, 'camera': [10, 0]} = 4\n"

# Train
The MineRL package provides a dataset of human gameplay that can be used to improve training efficiency. First, we train our network on this dataset; the pretrained network can then be used as the starting point for Reinforcement Learning, which should reduce training time considerably.

For more information about that dataset, see this [MineRL Dataset Docs](http://minerl.io/docs/tutorials/data_sampling.html).

In [None]:
import minerl
# Behaviour cloning: fit the policy network to the human demonstrations.
data = minerl.data.make(env_name, data_path)

# Directory that receives the periodic checkpoints (and is read when restoring).
model_path = '/home/kimbring2/MineRL/model/' + env_name


def _one_hot(index, size):
    """Return a list of ``size`` zeros with a single 1 at position ``index``."""
    return [1 if k == index else 0 for k in range(size)]


def _encode_state(observation, i):
    """Build the network input for frame ``i`` of a sampled observation batch.

    The POV image is scaled to [-0.5, 0.5]. For MineRLNavigateDense-v0 the
    compass angle, divided by 180, is appended as a constant extra channel.
    """
    pov = observation['pov'][i].astype(np.float32) / 255.0 - 0.5
    if env_name == 'MineRLTreechop-v0':
        return pov

    compass = observation['compassAngle'][i]
    compass_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32) * compass
    compass_channel /= 180.0
    return np.concatenate([pov, compass_channel], axis=-1)


def _encode_action(action, i):
    """Return the one-hot class label of frame ``i``, or None if it has no class.

    Camera movement takes priority over jump/forward. For MineRLTreechop-v0 only
    "stand still" (forward=0, jump=0) and "jump forward" (forward=1, jump=1)
    have a class when the camera is not moving; every other combination
    returns None so the caller can drop the frame instead of reusing the label
    of the previous frame.
    """
    camera = action['camera'][i]
    jump = action['jump'][i]

    if env_name == 'MineRLNavigateDense-v0':
        if camera[1] < 0:
            return _one_hot(0, 4)
        if camera[1] > 0:
            return _one_hot(1, 4)
        return _one_hot(2, 4) if jump == 0 else _one_hot(3, 4)

    # MineRLTreechop-v0
    if camera[1] < 0:
        return _one_hot(0, 6)
    if camera[1] > 0:
        return _one_hot(1, 6)
    if camera[0] < 0:
        return _one_hot(2, 6)
    if camera[0] > 0:
        return _one_hot(3, 6)

    forward = action['forward'][i]
    if jump == 0 and forward == 0:
        return _one_hot(4, 6)
    if jump == 1 and forward == 1:
        return _one_hot(5, 6)
    return None


init = tf.global_variables_initializer()
restore = False
with tf.Session() as sess:
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=5)
    train_writer = tf.summary.FileWriter('/home/kimbring2/MineRL/train_summary/' + env_name, sess.graph)

    try:
        if restore:
            ckpt = tf.train.get_checkpoint_state(model_path)
            if ckpt is None or not ckpt.model_checkpoint_path:
                raise IOError("No checkpoint found in " + model_path)
            saver.restore(sess, ckpt.model_checkpoint_path)

        episode_count = 0
        for current_state, action, reward, next_state, done in data.sarsd_iter(num_epochs=50, max_sequence_len=200):
            length = current_state['pov'].shape[0]

            action_list = []
            states_list = []
            for i in range(length):
                action_ = _encode_action(action, i)
                if action_ is None:
                    # Frame does not belong to any action class: skip it.
                    continue
                states_list.append(_encode_state(current_state, i))
                action_list.append(action_)

            if not states_list:
                # Nothing usable in this sample; np.stack would fail on an empty list.
                continue

            episode_count += 1

            # Every sampled sequence starts from the zero LSTM state.
            rnn_state = state_init_self
            feed_dict = {state: np.stack(states_list, 0),
                         real_action: np.stack(action_list, 0),
                         state_in_self[0]: rnn_state[0],
                         state_in_self[1]: rnn_state[1]}

            if episode_count % 100 == 0:
                # Take exactly one optimisation step, logging the loss summary
                # in the same run, then checkpoint the model.
                summary, _ = sess.run([merged, train_step], feed_dict=feed_dict)
                train_writer.add_summary(summary, episode_count)
                saver.save(sess, model_path + '/model-' + str(episode_count) + '.cptk')
                print("Saved Model")
            else:
                sess.run(train_step, feed_dict=feed_dict)
    finally:
        # Flush pending summaries to disk even if training is interrupted.
        train_writer.close()

Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Save

Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Saved Model
Save

# Test
After training the network, we should check whether it has learned the skills of the human players. The MineRL package also provides a game platform (Gym environments) for that.

See the [MineRL Agent Docs](http://minerl.io/docs/tutorials/first_agent.html).

In [None]:
import minerl
import gym
import os
from env_wrappers import ObtainPoVWrapper, ContinuingTimeLimitMonitor


# Roll out the trained policy for one episode in the live environment.
env = gym.make(env_name)
# Optional wrappers from env_wrappers, left disabled:
#env = ObtainPoVWrapper(env)
#env = ContinuingTimeLimitMonitor(env, os.path.join('/home/kimbring2/MineRL/', 'monitor'), mode='evaluation',
#                                 video_callable=lambda episode_id: True)

# Action fields applied on top of the no-op action for each policy output
# index, per environment. The index order matches the one-hot labels built in
# the training cell.
_ACTION_TABLES = {
    'MineRLNavigateDense-v0': [
        {'camera': [0, -10], 'jump': 0, 'forward': 1, 'sprint': 1},
        {'camera': [0, 10], 'jump': 0, 'forward': 1, 'sprint': 1},
        {'camera': [0, 0], 'jump': 0, 'forward': 1, 'sprint': 1},
        {'camera': [0, 0], 'jump': 1, 'forward': 1, 'sprint': 1},
    ],
    'MineRLTreechop-v0': [
        {'camera': [0, -10], 'jump': 0, 'forward': 1, 'attack': 1, 'sprint': 0},
        {'camera': [0, 10], 'jump': 0, 'forward': 1, 'attack': 1, 'sprint': 0},
        {'camera': [-10, 0], 'jump': 0, 'forward': 1, 'attack': 1, 'sprint': 0},
        {'camera': [10, 0], 'jump': 0, 'forward': 1, 'attack': 1, 'sprint': 0},
        {'camera': [0, 0], 'jump': 0, 'forward': 0, 'attack': 1, 'sprint': 0},
        {'camera': [0, 0], 'jump': 1, 'forward': 1, 'attack': 1, 'sprint': 0},
    ],
}

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    saver = tf.train.Saver(max_to_keep=5)

    print('Loading Model...')
    path = '/home/kimbring2/MineRL/model/' + env_name
    ckpt = tf.train.get_checkpoint_state(path)
    if ckpt is None or not ckpt.model_checkpoint_path:
        raise IOError('No checkpoint found in ' + path)
    saver.restore(sess, ckpt.model_checkpoint_path)

    action_table = _ACTION_TABLES[env_name]

    env.init()
    obs = env.reset()
    net_reward = 0
    # LSTM state (c, h); starts at zero and is carried from step to step.
    state_temp = (np.zeros([1, H], np.float32), np.zeros([1, H], np.float32))
    while True:
        # Same preprocessing as in training: POV scaled to [-0.5, 0.5], plus a
        # compass channel for the navigation task.
        pov = obs['pov'].astype(np.float32) / 255.0 - 0.5
        if env_name == 'MineRLNavigateDense-v0':
            compass = obs['compassAngle']
            compass_channel = np.ones(shape=list(pov.shape[:-1]) + [1], dtype=np.float32) * compass
            compass_channel /= 180.0
            state_concat = np.concatenate([pov, compass_channel], axis=-1)
        else:
            state_concat = pov

        # Feed the previous LSTM state through the c_in / h_in placeholders and
        # fetch the updated state together with the action probabilities.
        action_probability, state_temp = sess.run(
            [probability, state_out_self],
            feed_dict={state: [state_concat],
                       c_in: state_temp[0],
                       h_in: state_temp[1]})

        # Greedy policy: take the most probable class and expand it to a full
        # MineRL action dictionary.
        chosen = action_table[int(np.argmax(action_probability))]
        action = env.action_space.noop()
        for key, value in chosen.items():
            # Copy list values so the lookup table can never be mutated downstream.
            action[key] = list(value) if isinstance(value, list) else value

        action['back'] = 0
        action['left'] = 0
        action['right'] = 0

        obs, reward, done, info = env.step(action)

        # Count the reward before testing `done` so the final step is included.
        net_reward += reward
        print("Total reward: ", net_reward)

        if done:
            #env.close()
            break