In [1]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
import glob
import random
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
import gym
import minerl
import os
from IPython.display import clear_output

# Cap TensorFlow's memory on the first GPU (memory_limit=4000).
# The guard keeps the notebook importable on machines where no GPU is
# visible; without it, gpus[0] raises IndexError on an empty list.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])

# Root directory for checkpoints and TensorBoard logs.
workspace_path = '/media/kimbring2/Steam/minerl_2021'

# Summary writer used by the training loop to log the loss.
writer = tf.summary.create_file_writer(workspace_path + "/tensorboard")



In [2]:
class ActorCritic(tf.keras.Model):
  """Combined actor-critic network.

  Pipeline: three Conv2D layers -> reshape to a (16, 32) sequence -> LSTM(128)
  -> flatten -> Dense(num_hidden_units) -> actor head (softmax over
  ``num_actions``) and critic head (single value).

  The reshape hard-codes a 4x4x32 convolution output, which is what the three
  "valid" convolutions produce for a 64x64 input.
  """
  def __init__(
      self, 
      num_actions: int, 
      num_hidden_units: int):
    """Initialize.

    Args:
      num_actions: number of discrete actions (width of the actor head).
      num_hidden_units: width of the shared dense layer after the LSTM.
    """
    super().__init__()

    self.num_actions = num_actions
    # Kept so get_config() can report it; it was previously never stored,
    # which made get_config() fail with AttributeError.
    self.num_hidden_units = num_hidden_units
    
    self.conv_1 = layers.Conv2D(16, 8, 4, padding="valid", activation="relu", kernel_regularizer='l2')
    self.conv_2 = layers.Conv2D(32, 4, 2, padding="valid", activation="relu", kernel_regularizer='l2')
    self.conv_3 = layers.Conv2D(32, 3, 1, padding="valid", activation="relu", kernel_regularizer='l2')
    
    self.lstm = layers.LSTM(128, return_sequences=True, return_state=True, kernel_regularizer='l2')
    
    self.common = layers.Dense(num_hidden_units, activation="relu", kernel_regularizer='l2')
    self.actor = layers.Dense(num_actions, kernel_regularizer='l2')
    self.critic = layers.Dense(1, kernel_regularizer='l2')

  def get_config(self):
    """Return the constructor arguments needed to rebuild this model."""
    try:
      config = dict(super().get_config())
    except NotImplementedError:
      # NOTE(review): some TensorFlow releases raise NotImplementedError from
      # Model.get_config() on subclassed models; fall back to an empty base.
      config = {}
    config.update({
        'num_actions': self.num_actions,
        'num_hidden_units': self.num_hidden_units
    })
    return config
    
  def call(self, inputs: tf.Tensor, memory_state: tf.Tensor, carry_state: tf.Tensor,
           training=None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
    """Run one forward pass.

    Args:
      inputs: image batch fed to the convolution stack.
      memory_state: first element of the LSTM initial state.
      carry_state: second element of the LSTM initial state.
      training: forwarded to the LSTM layer.

    Returns:
      (action probabilities, critic value, final memory state, final carry state).
    """
    conv_1 = self.conv_1(inputs)
    conv_2 = self.conv_2(conv_1)
    conv_3 = self.conv_3(conv_2)
    # The 4x4 spatial grid becomes a 16-step sequence of 32-dim features.
    conv_3_reshaped = layers.Reshape((4*4,32))(conv_3)
    
    initial_state = (memory_state, carry_state)
    lstm_output, final_memory_state, final_carry_state  = self.lstm(conv_3_reshaped, initial_state=initial_state, 
                                                                    training=training)
    
    X_input = layers.Flatten()(lstm_output)
    x = self.common(X_input)
    
    return tf.keras.layers.Softmax()(self.actor(x)), self.critic(x), final_memory_state, final_carry_state

In [3]:
# `!export ...` runs in a throwaway subshell, so the variable never reaches the
# kernel process. Setting os.environ makes it visible to code in this notebook.
os.environ['MINERL_DATA_ROOT'] = '/media/kimbring2/6224AA7924AA5039/minerl_data/data'

In [4]:
import gym
import minerl
import numpy as np

# Demonstration data pipeline for the Treechop task. MINERL_DATA_ROOT is used
# when it is set; otherwise the original local path is the fallback.
tree_data = minerl.data.make(
    'MineRLTreechop-v0',
    data_dir=os.environ.get('MINERL_DATA_ROOT', "/media/kimbring2/6224AA7924AA5039/minerl_data/data"))

class TrajetoryDataset(tf.data.Dataset):
    """Factory for a dataset that yields one random demonstration trajectory.

    ``TrajetoryDataset(...)`` never builds an instance of this class:
    ``__new__`` returns the ``tf.data.Dataset`` produced by ``from_generator``.
    Each generator run yields a single ``(observations, actions)`` pair built
    from one trajectory chosen at random from the module-level ``tree_data``.
    """

    @staticmethod
    def _inventory_channel(observation, region_size=8):
        """Encode the inventory as one extra image channel.

        Each inventory entry fills one ``region_size`` x ``region_size`` tile,
        laid out row by row, with the value ``1 - 1 / (count + 1)`` clipped to
        [0, 1]. Observations without an 'inventory' key get an all-zero channel.

        Raises:
            ValueError: if a tile is larger than the frame, or the inventory
                has more entries than there are tiles.
        """
        height = observation['pov'].shape[0]
        width = observation['pov'].shape[1]
        # The channel is float32 because tile values are fractions in [0, 1).
        # It used to be allocated in the frame's own dtype, which truncates
        # every value to 0 when the frame is stored as integers.
        # NOTE(review): MineRL pov frames are presumably uint8 - confirm.
        channel = np.zeros((height, width, 1), dtype=np.float32)
        if 'inventory' not in observation:
            return channel

        if min(height, width) < region_size:
            raise ValueError("'region_size' is too large.")
        tiles_per_row = width // region_size

        for slot, key in enumerate(observation['inventory'].keys()):
            item_scaled = np.clip(1 - 1 / (observation['inventory'][key] + 1), 0, 1)  # Inversed
            left = (slot % tiles_per_row) * region_size
            top = (slot // tiles_per_row) * region_size
            if top + region_size > height:
                raise ValueError("Too many elements on 'inventory'. Please decrease 'region_size' of each component.")
            channel[top:(top + region_size), left:(left + region_size), :] = item_scaled

        return channel

    @staticmethod
    def _action_to_index(action):
        """Map a demonstration action dict to a discrete index in 0..19.

        Even indices mean "no attack", odd indices the same move with attack.
        Camera moves (0-7) win over movement keys when the mean absolute
        camera delta exceeds 2.5; otherwise the first pressed key among
        forward, jump, back, left, right, sneak decides (8-19).

        Returns:
            The action index, or None when the frame should be skipped (no
            movement key and no large camera move, or a camera move whose two
            components have equal magnitude).
        """
        camera_0 = action['camera'][0]
        camera_1 = action['camera'][1]
        attack_offset = 0 if action['attack'] == 0 else 1

        if (abs(camera_0) + abs(camera_1)) / 2.0 > 2.5:
            if abs(camera_0) < abs(camera_1):
                base = 0 if camera_1 < 0 else 2
            elif abs(camera_0) > abs(camera_1):
                base = 4 if camera_0 < 0 else 6
            else:
                # Equal magnitudes matched none of the original branches, so
                # the label was left unbound or stale from the previous frame.
                # Skip the frame rather than invent a label.
                return None
            return base + attack_offset

        for base, key in ((8, 'forward'), (10, 'jump'), (12, 'back'),
                          (14, 'left'), (16, 'right'), (18, 'sneak')):
            if action[key] == 1:
                return base + attack_offset

        # Nothing pressed (or attack alone): not a labelled action.
        return None

    def _generator(num_trajectorys):
        """Yield ``(observations, actions)`` for one randomly chosen trajectory.

        ``num_trajectorys`` is accepted because ``from_generator`` passes it
        through ``args``, but it is not used: one trajectory is yielded per run.
        """
        trajectory_name = random.choice(tree_data.get_trajectory_names())
        print("trajectory_name: ", trajectory_name)

        trajectory = tree_data.load_data(trajectory_name, skip_interval=0, include_metadata=False)

        all_actions = []
        all_obs = []
        for dataset_observation, dataset_action, _reward, _next_state, _done in trajectory:
            # Label first so skipped frames cost no image processing.
            action_index = TrajetoryDataset._action_to_index(dataset_action)
            if action_index is None:
                continue

            inventory_channel = TrajetoryDataset._inventory_channel(dataset_observation)
            observation = np.concatenate((dataset_observation['pov'] / 255.0, inventory_channel), axis=2)

            all_obs.append(observation)
            all_actions.append(np.array([action_index]))

        print("len(all_obs): ", len(all_obs))
        print("")
        yield (all_obs, all_actions)

    def __new__(cls, num_trajectorys=3):
        """Return a ``from_generator`` dataset of (float32 frames, int32 actions)."""
        return tf.data.Dataset.from_generator(
            cls._generator,
            output_types=(tf.dtypes.float32, tf.dtypes.int32),
            args=(num_trajectorys,)
        )

# A one-element source dataset is interleaved through TrajetoryDataset, then
# batched one at a time and prefetched with an auto-tuned buffer.
trajectory_stream = tf.data.Dataset.range(1).interleave(
    TrajetoryDataset,
    num_parallel_calls=tf.data.experimental.AUTOTUNE)
dataset = trajectory_stream.batch(1).prefetch(tf.data.experimental.AUTOTUNE)

Instructions for updating:
Use output_signature instead


In [5]:
# Width of the actor head and of the shared dense layer, respectively.
num_actions = 20
num_hidden_units = 512

# Alternative ways to restore an earlier model, left disabled:
#model = tf.keras.models.load_model('MineRL_SL_Model')
model = ActorCritic(num_actions=num_actions, num_hidden_units=num_hidden_units)
#model.load_weights("model/MineRL_SL_Model")

In [6]:
# Optimizer and loss used by the supervised update step.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
cce_loss = tf.keras.losses.CategoricalCrossentropy()

@tf.function
def supervised_replay(replay_obs_list, replay_act_list, memory_state, carry_state):
    """Run one supervised update on a chunk of consecutive frames.

    The model is applied frame by frame, carrying the LSTM state forward. The
    loss is categorical cross-entropy between the predicted action
    probabilities and the one-hot demonstrated actions, plus 1e-5 times the
    summed regularization losses of the model.

    Args:
        replay_obs_list: observations for the chunk, indexed along axis 0.
        replay_act_list: demonstrated action indices for the chunk.
        memory_state: LSTM memory state to start from.
        carry_state: LSTM carry state to start from.

    Returns:
        (total_loss, memory_state, carry_state) with the states after the
        last frame of the chunk.
    """
    observations = tf.concat(replay_obs_list, 0)
    demonstrated_actions = tf.concat(replay_act_list, 0)
    memory_state = tf.concat(memory_state, 0)
    carry_state = tf.concat(carry_state, 0)

    batch_size = observations.shape[0]
    tf.print("batch_size: ", batch_size)

    with tf.GradientTape() as tape:
        # One row of action probabilities is collected per frame.
        policy_rows = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        for frame_index in tf.range(0, batch_size):
            frame = tf.expand_dims(observations[frame_index, :, :, :], 0)
            outputs = model(frame, memory_state, carry_state, training=True)
            memory_state = outputs[2]
            carry_state = outputs[3]

            policy_rows = policy_rows.write(frame_index, outputs[0][0])

        act_probs = policy_rows.stack()

        tf.print("replay_act_array: ", demonstrated_actions)
        tf.print("tf.argmax(act_probs, 1): ", tf.argmax(act_probs, 1))

        targets_onehot = tf.reshape(
            tf.one_hot(demonstrated_actions, num_actions),
            (batch_size, num_actions))
        act_loss = cce_loss(targets_onehot, act_probs)

        total_loss = act_loss + 1e-5 * tf.reduce_sum(model.losses)

    trainable = model.trainable_variables
    grads = tape.gradient(total_loss, trainable)
    optimizer.apply_gradients(zip(grads, trainable))

    return total_loss, memory_state, carry_state

In [None]:
def supervised_train(dataset, training_episode):
    """Train on every trajectory the dataset yields, in chunks of 32 frames.

    For each batch the LSTM state starts at zero and is carried across the
    chunks of that trajectory. A trailing chunk shorter than 32 frames is
    dropped. The last chunk's loss is written to TensorBoard, and weights are
    saved whenever ``training_episode`` is a multiple of 100.

    Args:
        dataset: iterable of ``(observations, actions)`` batches with a
            leading batch axis of size 1.
        training_episode: step index for the summary and the checkpoint name.
    """
    step_length = 32
    for batch in dataset:
        episode_size = batch[0].shape[1]
        print("episode_size: ", episode_size)
    
        replay_obs_list = batch[0][0]
        replay_act_list = batch[1][0]
     
        memory_state = np.zeros([1,128], dtype=np.float32)
        carry_state =  np.zeros([1,128], dtype=np.float32)
        # Stays None when the trajectory is too short for a single full chunk.
        # Without this, the summary write below hit an unbound local variable.
        total_loss = None
        for episode_index in range(0, episode_size, step_length):
            obs = replay_obs_list[episode_index:episode_index+step_length,:,:,:]
            act = replay_act_list[episode_index:episode_index+step_length,:]
            
            # Only full-length chunks are trained on.
            if len(obs) != step_length:
                break
            
            total_loss, memory_state, carry_state = supervised_replay(obs, act, memory_state, carry_state)
        
            print("total_loss: ", total_loss)
            print("")
            
        if total_loss is not None:
            with writer.as_default():
                tf.summary.scalar("total_loss", total_loss, step=training_episode)
                writer.flush()

        if training_episode % 100 == 0:
            model.save_weights(workspace_path + '/model/supervised_model_' + str(training_episode))
            
        clear_output(wait=True)
        
# Each iteration consumes the dataset once; the counter doubles as the
# TensorBoard step and the checkpoint suffix.
for training_episode in range(2000000):
    supervised_train(dataset, training_episode)

trajectory_name:  v3_remorseful_current_savage-1_4442-6943


100%|██████████| 2461/2461 [00:00<00:00, 8776.71it/s]


len(all_obs):  988

episode_size:  988
batch_size:  32
replay_act_array:  [[8]
 [8]
 [8]
 ...
 [8]
 [8]
 [8]]
tf.argmax(act_probs, 1):  [8 8 8 ... 8 8 8]
total_loss:  tf.Tensor(0.124798596, shape=(), dtype=float32)

batch_size:  32
replay_act_array:  [[8]
 [8]
 [8]
 ...
 [3]
 [3]
 [3]]
tf.argmax(act_probs, 1):  [8 8 8 ... 8 8 8]
total_loss:  tf.Tensor(1.4398816, shape=(), dtype=float32)

batch_size:  32
replay_act_array:  [[3]
 [3]
 [5]
 ...
 [8]
 [8]
 [8]]
tf.argmax(act_probs, 1):  [8 8 8 ... 8 8 8]
total_loss:  tf.Tensor(1.9246454, shape=(), dtype=float32)

batch_size:  32
replay_act_array:  [[8]
 [8]
 [8]
 ...
 [15]
 [15]
 [9]]
tf.argmax(act_probs, 1):  [8 8 8 ... 8 8 8]
total_loss:  tf.Tensor(1.3502458, shape=(), dtype=float32)

batch_size:  32
replay_act_array:  [[9]
 [7]
 [7]
 ...
 [3]
 [3]
 [3]]
tf.argmax(act_probs, 1):  [8 8 8 ... 6 8 8]
total_loss:  tf.Tensor(2.4858916, shape=(), dtype=float32)

batch_size:  32
replay_act_array:  [[2]
 [8]
 [8]
 ...
 [8]
 [8]
 [8]]
tf.argmax(a

In [None]:
def render(obs):
    """Show an RGB frame in the OpenCV window named 'obs'."""
    # imshow receives the frame after RGB -> BGR conversion.
    bgr_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
    cv2.imshow('obs', bgr_frame)
    # waitKey(1) with a 1 ms delay follows each imshow call.
    cv2.waitKey(1)

In [None]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
import glob
import random
import cv2

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
import tensorflow_probability as tfp

tfd = tfp.distributions

import gym
import minerl

def _observation_to_state(observation, region_size=8):
    """Turn an environment observation into the float32 model input.

    The frame is scaled by 1/255 and one extra channel is appended. When the
    observation has an 'inventory' entry, each item fills one
    ``region_size`` x ``region_size`` tile (row by row) with
    ``1 - 1 / (count + 1)`` clipped to [0, 1]; otherwise the channel is zero.

    Raises:
        ValueError: if a tile is larger than the frame, or the inventory has
            more entries than there are tiles.
    """
    pov = observation['pov']
    height, width = pov.shape[0], pov.shape[1]
    # The channel is float32 because tile values are fractions in [0, 1).
    # Allocating it in the frame's own dtype truncates them to 0 when the
    # frame is stored as integers.
    # NOTE(review): MineRL pov frames are presumably uint8 - confirm.
    inventory_channel = np.zeros((height, width, 1), dtype=np.float32)
    if 'inventory' in observation:
        if min(height, width) < region_size:
            raise ValueError("'region_size' is too large.")
        tiles_per_row = width // region_size

        for slot, key in enumerate(observation['inventory'].keys()):
            item_scaled = np.clip(1 - 1 / (observation['inventory'][key] + 1), 0, 1)  # Inversed
            left = (slot % tiles_per_row) * region_size
            top = (slot // tiles_per_row) * region_size
            if top + region_size > height:
                raise ValueError("Too many elements on 'inventory'. Please decrease 'region_size' of each component.")
            inventory_channel[top:(top + region_size), left:(left + region_size), :] = item_scaled

    return tf.constant(np.concatenate((pov / 255.0, inventory_channel), axis=2), dtype=tf.float32)


# Entry k is the move for action indices 2k (no attack) and 2k + 1 (attack).
_BASE_ACTIONS = (
    ('camera', [0, -5]),
    ('camera', [0, 5]),
    ('camera', [-5, 0]),
    ('camera', [5, 0]),
    ('forward', 1),
    ('jump', 1),
    ('back', 1),
    ('left', 1),
    ('right', 1),
    ('sneak', 1),
)


def _index_to_action(env, action_index):
    """Build the environment action dict for a discrete index in 0..19.

    Odd indices add attack to the move of the preceding even index. Index 7
    previously never set attack, which made it identical to index 6.
    """
    action = env.action_space.noop()
    key, value = _BASE_ACTIONS[action_index // 2]
    action[key] = value
    action['attack'] = action_index % 2
    return action


model.load_weights(workspace_path + "/model/supervised_model_12000")

# Create the environment
env = gym.make('MineRLTreechop-v0')

seed = 980
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)

try:
    for i_episode in range(0, 10000):
        state = _observation_to_state(env.reset())

        # The LSTM state is reset at the start of every episode.
        memory_state = tf.zeros([1,128], dtype=np.float32)
        carry_state = tf.zeros([1,128], dtype=np.float32)
        reward_sum = 0
        step = 0
        done = False
        while not done:
            step += 1

            action_probs, _, memory_state, carry_state = model(
                tf.expand_dims(state, 0), memory_state, carry_state, training=False)

            # The action is sampled from the policy, not taken greedily.
            action_dist = tfd.Categorical(probs=action_probs)
            action_index = int(action_dist.sample()[0])

            observation_1, reward, done, info = env.step(_index_to_action(env, action_index))
            render(observation_1['pov'])

            state = _observation_to_state(observation_1)
            reward_sum += reward

        print("Total reward: {:.2f},  Total step: {:.2f}".format(reward_sum, step))
finally:
    # The environment is closed even when the rollout is interrupted or fails.
    env.close()

In [None]:
import minerl
import gym
import cv2
# Environment whose observations carry the compass angle read by the loop below.
env = gym.make('MineRLNavigateDense-v0')

# Initial observation plus the episode bookkeeping the control loop updates.
obs = env.reset()
done, net_reward = False, 0


def render(obs):
    """Show an RGB frame in the OpenCV window named 'obs'."""
    # imshow receives the frame after RGB -> BGR conversion.
    bgr_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
    cv2.imshow('obs', bgr_frame)
    # waitKey(1) with a 1 ms delay follows each imshow call.
    cv2.waitKey(1)

    
# Scripted controller: turn by 0.03 times the compass angle while holding
# forward, jump and attack, until the episode ends.
try:
    while not done:
        action = env.action_space.noop()

        action['camera'] = [0, 0.03*obs["compass"]["angle"]]
        action['back'] = 0
        action['forward'] = 1
        action['jump'] = 1
        action['attack'] = 1

        obs, reward, done, info = env.step(action)
        render(obs['pov'])

        net_reward += reward
        print("Total reward: ", net_reward)
finally:
    # The environment was never closed; release it even if the loop is
    # interrupted or raises.
    env.close()