In [1]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
import glob
import random
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
import gym
import minerl
import os
import cv2
import tqdm
import tensorflow_probability as tfp

# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions
from IPython.display import clear_output

# Disabled alternative: cap the first GPU at 4000 MB instead of hiding it.
#gpus = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_virtual_device_configuration(gpus[0],
#            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
# Hide every CUDA device from this process so TensorFlow runs on the CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Root directory: TensorBoard summaries and model checkpoints are written beneath it.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
workspace_path = '/home/kimbring2/minecraft_ai'

# Summary writer used to log the per-episode reward during training.
writer = tf.summary.create_file_writer(workspace_path + "/tensorboard")

# Environment the reinforcement-learning cells interact with.
env = gym.make('MineRLNavigateDense-v0')

2021-10-17 04:42:15.665275: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-10-17 04:42:17.659360: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-17 04:42:17.660030: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-10-17 04:42:17.682747: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-10-17 04:42:17.682772: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: kimbring2-GF75-Thin-10UEK
2021-10-17 04:42:17.682776: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: kimbring2-GF75-Thin-10UEK
2021-10-17 04:42:17.682837: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 460.91.3
2021-

In [2]:
class ActorCritic(tf.keras.Model):
  """Combined actor-critic network.

  Three convolutions encode the input image, an LSTM runs over the 4x4
  grid of convolutional features (treated as a 16-step sequence), and a
  shared dense layer feeds two heads: a softmax policy over `num_actions`
  and a scalar state-value estimate.
  """
  def __init__(
      self, 
      num_actions: int, 
      num_hidden_units: int):
    """Initialize.

    Args:
      num_actions: size of the discrete action set (width of the actor head).
      num_hidden_units: width of the dense layer shared by both heads.
    """
    super().__init__()

    self.num_actions = num_actions
    # Kept so that get_config() can report it.
    self.num_hidden_units = num_hidden_units
    
    self.conv_1 = layers.Conv2D(16, 8, 4, padding="valid", activation="relu", kernel_regularizer='l2')
    self.conv_2 = layers.Conv2D(32, 4, 2, padding="valid", activation="relu", kernel_regularizer='l2')
    self.conv_3 = layers.Conv2D(32, 3, 1, padding="valid", activation="relu", kernel_regularizer='l2')
    
    self.lstm = layers.LSTM(128, return_sequences=True, return_state=True, kernel_regularizer='l2')
    
    self.common = layers.Dense(num_hidden_units, activation="relu", kernel_regularizer='l2')
    self.actor = layers.Dense(num_actions, kernel_regularizer='l2')
    self.critic = layers.Dense(1, kernel_regularizer='l2')

  def get_config(self):
    """Return the constructor arguments merged with the base config, if any."""
    try:
      config = dict(super().get_config())
    except NotImplementedError:
      # NOTE(review): some TensorFlow releases do not implement get_config()
      # for subclassed models; fall back to an empty base config there.
      config = {}
    config.update({
        'num_actions': self.num_actions,
        'num_hidden_units': self.num_hidden_units
    })
    return config
    
  def call(self, inputs: tf.Tensor, memory_state: tf.Tensor, carry_state: tf.Tensor,
           training=None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
    """Run one forward pass.

    Args:
      inputs: batch of images; the reshape below requires the third
        convolution to yield a 4x4x32 feature map per image.
      memory_state: first LSTM state tensor (callers in this notebook pass [1, 128]).
      carry_state: second LSTM state tensor (callers in this notebook pass [1, 128]).
      training: forwarded to the LSTM; optional.

    Returns:
      (action probabilities, state value, new memory state, new carry state).
      The first element is already passed through a softmax, i.e. it holds
      probabilities rather than logits.
    """
    conv_1 = self.conv_1(inputs)
    conv_2 = self.conv_2(conv_1)
    conv_3 = self.conv_3(conv_2)
    # Flatten the 4x4 spatial grid into a 16-step sequence of 32-d features.
    conv_3_reshaped = layers.Reshape((4*4,32))(conv_3)
    
    initial_state = (memory_state, carry_state)
    lstm_output, final_memory_state, final_carry_state  = self.lstm(conv_3_reshaped, initial_state=initial_state, 
                                                                    training=training)
    
    X_input = layers.Flatten()(lstm_output)
    x = self.common(X_input)
    
    return tf.keras.layers.Softmax()(self.actor(x)), self.critic(x), final_memory_state, final_carry_state

In [3]:
# Size of the discrete action set and width of the shared dense layer.
num_hidden_units = 512
num_actions = 20

# Single network instance shared by the training and evaluation cells.
model = ActorCritic(num_actions=num_actions, num_hidden_units=num_hidden_units)

In [4]:
def discount_rewards(reward, dones, gamma=0.99):
    """Compute normalised gamma-discounted returns over a rollout.

    Args:
        reward: sequence of per-step rewards (list or array, int or float).
        dones: sequence of per-step termination flags; a truthy flag at
            step i stops later rewards from leaking back past step i.
        gamma: discount rate.

    Returns:
        Float array shaped like `reward`. The returns are standardised to
        zero mean and unit variance unless they are all identical (or the
        input is empty), in which case they are returned unscaled.
    """
    rewards = np.asarray(reward)
    # Always accumulate in floating point: an integer buffer would truncate
    # the discounted values and make the in-place normalisation below fail.
    float_dtype = np.result_type(rewards.dtype, np.float32)
    discounted_r = np.zeros(rewards.shape, dtype=float_dtype)

    running_add = 0
    for i in reversed(range(0, len(reward))):
        running_add = running_add * gamma * (1 - dones[i]) + reward[i]
        discounted_r[i] = running_add

    # Nothing to normalise for an empty rollout.
    if discounted_r.size == 0:
        return discounted_r

    if np.std(discounted_r) != 0:
        discounted_r -= np.mean(discounted_r) # centre the returns
        discounted_r /= np.std(discounted_r) # scale to unit variance

    return discounted_r


def take_vector_elements(vectors, indices):
    """Pick one entry per row: result[k] = vectors[k, indices[k]]."""
    row_ids = tf.range(tf.shape(vectors)[0])
    # Pair every row number with the column requested for that row.
    row_col_pairs = tf.stack([row_ids, indices], axis=1)
    return tf.gather_nd(vectors, row_col_pairs)


def render(obs):
    """Show one RGB frame in the OpenCV window named 'obs'."""
    # OpenCV displays BGR, so swap the channel order first.
    bgr_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
    cv2.imshow('obs', bgr_frame)
    # Give the GUI event loop 1 ms so the window actually repaints.
    cv2.waitKey(1)

In [5]:
# Optimizer applied to every gradient step of the actor-critic network.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
# Regression loss used for the critic (value) head.
mse_loss = tf.keras.losses.MeanSquaredError()

@tf.function
def get_loss(input_array, action_list, memory_state, carry_state, discounted_r):
    """Replay a rollout through the model and return the actor-critic loss.

    The frames are fed one at a time, in order, so the LSTM state is
    threaded through the rollout starting from the supplied state.

    Args:
        input_array: rank-4 tensor of stacked frames; the first axis is the
            time step and must have a statically known length.
        action_list: integer tensor with the action index taken at each step.
        memory_state: LSTM memory state at the start of the rollout.
        carry_state: LSTM carry state at the start of the rollout.
        discounted_r: per-step discounted returns (cast to float32 here).

    Returns:
        (total_loss, memory_state, carry_state) where the two states are the
        LSTM states after the last step and
        total_loss = actor_loss + 0.5 * critic_loss.
    """
    batch_size = input_array.shape[0]
    
    # The model's first output is already softmax-ed, so these are probabilities.
    action_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    for i in tf.range(0, batch_size):
        model_input = tf.expand_dims(input_array[i,:,:,:], 0)
        prediction = model(model_input, memory_state, carry_state, training=False)
        memory_state = prediction[2]
        carry_state = prediction[3]

        action_probs = action_probs.write(i, prediction[0][0])
        values = values.write(i, tf.squeeze(prediction[1]))
            
    action_probs = action_probs.stack()
    values = values.stack()

    # Probability the policy assigned to the action that was actually taken.
    selected_probs = take_vector_elements(action_probs, action_list)

    discounted_r = tf.cast(discounted_r, 'float32')
    advantages = discounted_r - values
            
    # Floor the probabilities before the log so a zero probability cannot
    # turn the loss (and every gradient) into -inf / NaN.
    selected_log_probs = tf.math.log(tf.clip_by_value(selected_probs, 1e-10, 1.0))
        
    # Policy gradient term; the advantage is a constant w.r.t. the actor.
    actor_loss = -tf.math.reduce_mean(selected_log_probs * tf.stop_gradient(advantages)) 
            
    critic_loss = mse_loss(values, discounted_r)
        
    # NOTE(review): the layers declare kernel_regularizer='l2', but those
    # penalties (model.losses) are not added here, so they have no effect.
    total_loss = actor_loss + 0.5 * critic_loss

    return total_loss, memory_state, carry_state


def reinforcement_replay(input_list, action_list, memory_state, carry_state, reward_list, done_list):
    """Run one gradient update on a collected rollout.

    Args:
        input_list: list of per-step model inputs, each with a leading
            batch axis of size 1; they are concatenated along that axis.
        action_list: list of the integer action indices that were taken.
        memory_state: LSTM memory state at the start of the rollout.
        carry_state: LSTM carry state at the start of the rollout.
        reward_list: list of per-step rewards.
        done_list: list of per-step termination flags.

    Returns:
        (total_loss, memory_state, carry_state), the loss of this update and
        the LSTM states after replaying the whole rollout.
    """
    input_array = tf.concat(input_list, 0)
    memory_state = tf.concat(memory_state, 0)
    carry_state = tf.concat(carry_state, 0)
    
    # The actions are plain Python ints: build the vector directly instead of
    # concatenating scalars. int32 matches the tf.range() indices they are
    # stacked with in take_vector_elements().
    action_array = tf.convert_to_tensor(action_list, dtype=tf.int32)

    discounted_r = tf.convert_to_tensor(discount_rewards(reward_list, done_list))
    with tf.GradientTape() as tape:
        total_loss, memory_state, carry_state = get_loss(input_array, action_array, memory_state, 
                                                         carry_state, discounted_r)

    print("total_loss: ", total_loss)
        
    grads = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    return total_loss, memory_state, carry_state

In [None]:
# Movement primitives the policy can choose, as (action key, value) pairs.
# Policy index 2*k selects primitive k without attacking and 2*k + 1 selects
# primitive k while attacking, giving 20 discrete actions in total.
_MOVEMENT_PRIMITIVES = (
    ('camera', (0, -5)),
    ('camera', (0, 5)),
    ('camera', (-5, 0)),
    ('camera', (5, 0)),
    ('forward', 1),
    ('jump', 1),
    ('back', 1),
    ('left', 1),
    ('right', 1),
    ('sneak', 1),
)

# Number of transitions collected between two gradient updates.
_ROLLOUT_LENGTH = 64


def _action_from_index(action_space, action_index):
    """Translate a discrete policy index into an environment action dict."""
    action = action_space.noop()
    primitive, attack = divmod(action_index, 2)
    # Indices outside the table leave the no-op action untouched.
    if 0 <= primitive < len(_MOVEMENT_PRIMITIVES):
        key, value = _MOVEMENT_PRIMITIVES[primitive]
        # Camera deltas are stored as tuples; hand the env a fresh list.
        action[key] = list(value) if isinstance(value, tuple) else value
        action['attack'] = attack
    return action


def reinforcement_train(training_episode):
    """Play one episode, updating the model after every 64 collected steps.

    Each step renders the frame, builds a 4-channel input (RGB scaled to
    [0, 1] plus a constant plane holding compass angle / 360), samples an
    action from the policy and steps the environment. When the episode
    ends the total reward is printed and logged to TensorBoard.

    Args:
        training_episode: episode number, used as the TensorBoard step.
    """
    total_reward = 0
    obs = env.reset()
    
    input_list, action_list, reward_list, done_list = [], [], [], []
    
    memory_state = tf.zeros([1,128], dtype=np.float32)
    carry_state = tf.zeros([1,128], dtype=np.float32)
    
    # LSTM state at the start of the rollout currently being collected.
    initial_memory_state = memory_state
    initial_carry_state = carry_state
    while True:
        render(obs['pov'])
        
        pov_array = obs['pov'] / 255.0
        
        # Broadcast the normalised compass angle into its own image plane.
        compassAngle_array = np.ones((64,64,1)) * (obs['compass']['angle'] / 360.0)
        
        input_array = np.concatenate((pov_array, compassAngle_array), 2)
        input_array = np.expand_dims(input_array, 0)
        
        prediction = model(input_array, memory_state, carry_state, training=False)
        act_pi = prediction[0]
        memory_state = prediction[2]
        carry_state = prediction[3]
        
        action_dist = tfd.Categorical(probs=act_pi)
        action_index = int(action_dist.sample()[0])
        
        action = _action_from_index(env.action_space, action_index)
        
        obs, reward, done, info = env.step(action)
        
        total_reward += reward
        
        input_list.append(input_array)
        action_list.append(action_index)
        reward_list.append(reward)
        done_list.append(done)
        
        if done:
            # NOTE(review): transitions collected since the last update are
            # dropped here, so the tail of each episode is never trained on.
            print("total_reward: ", total_reward)
            
            with writer.as_default():
                tf.summary.scalar("total_reward", total_reward, step=training_episode)
                writer.flush()
                
            break
        
        if len(input_list) == _ROLLOUT_LENGTH:   
            total_loss, memory_state, carry_state = reinforcement_replay(input_list, action_list,
                                                                         initial_memory_state, 
                                                                         initial_carry_state,
                                                                         reward_list, done_list)

            input_list, action_list, reward_list, done_list = [], [], [], []
            
            # The next rollout starts from the state the replay ended in.
            initial_memory_state = memory_state
            initial_carry_state = carry_state
            
        clear_output(wait=True)

            
# Total number of episodes to train for.
max_episodes = 200000
with tqdm.trange(max_episodes) as progress_bar:
  for i in progress_bar:
    reinforcement_train(i)

    # Only every 100th episode writes a checkpoint and clears the cell output.
    if i % 100 != 0:
        continue

    model.save_weights(f"{workspace_path}/model/{i}")
    clear_output(wait=True)

In [None]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
import glob
import random
import cv2

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
import tensorflow_probability as tfp

tfd = tfp.distributions

import gym
import minerl

# Movement primitives the policy can choose, as (action key, value) pairs.
# Policy index 2*k selects primitive k without attacking and 2*k + 1 selects
# primitive k while attacking.
_EVAL_MOVES = (
    ('camera', (0, -5)),
    ('camera', (0, 5)),
    ('camera', (-5, 0)),
    ('camera', (5, 0)),
    ('forward', 1),
    ('jump', 1),
    ('back', 1),
    ('left', 1),
    ('right', 1),
    ('sneak', 1),
)


def _eval_action(action_space, action_index):
    """Translate a discrete policy index into an environment action dict."""
    action = action_space.noop()
    move, attack = divmod(action_index, 2)
    # Indices outside the table leave the no-op action untouched.
    if 0 <= move < len(_EVAL_MOVES):
        key, value = _EVAL_MOVES[move]
        # Camera deltas are stored as tuples; hand the env a fresh list.
        action[key] = list(value) if isinstance(value, tuple) else value
        action['attack'] = attack
    return action


def _inventory_channel(observation, region_size=8):
    """Encode the inventory as one extra image plane the size of the pov.

    Each inventory entry fills one region_size x region_size tile (tiles are
    laid out row by row) with 1 - 1 / (count + 1). The plane is all zeros
    when the observation carries no 'inventory' key.
    """
    height, width = observation['pov'].shape[:2]
    # Float plane: the scaled counts lie in [0, 1) and would all truncate to
    # zero if stored with an integer image dtype.
    channel = np.zeros((height, width, 1), dtype=np.float32)
    if 'inventory' not in observation:
        return channel

    if min(height, width) < region_size:
        raise ValueError("'region_size' is too large.")

    tiles_per_row = width // region_size
    for key_idx, key in enumerate(observation['inventory'].keys()):
        item_scaled = np.clip(1 - 1 / (observation['inventory'][key] + 1),  # Inversed
                              0, 1)
        tile_row, tile_col = divmod(key_idx, tiles_per_row)
        height_low = tile_row * region_size
        width_low = tile_col * region_size

        if height_low + region_size > height:
            raise ValueError("Too many elements on 'inventory'. Please decrease 'region_size' of each component.")

        channel[height_low:(height_low + region_size), width_low:(width_low + region_size), :] = item_scaled

    return channel


def _encode_state(observation):
    """Build the model input: pov scaled to [0, 1] plus the inventory plane."""
    stacked = np.concatenate((observation['pov'] / 255.0, _inventory_channel(observation)), axis=2)
    return tf.constant(stacked, dtype=tf.float32)


model.load_weights(workspace_path + "/model/supervised_model_12000")

# Create the environment
env = gym.make('MineRLTreechop-v0')

seed = 980
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)

reward_sum = 0
try:
    for i_episode in range(0, 10000):
        observation = env.reset()
        state = _encode_state(observation)

        # Every episode starts from a zeroed LSTM state.
        memory_state = tf.zeros([1,128], dtype=np.float32)
        carry_state = tf.zeros([1,128], dtype=np.float32)
        reward_sum = 0
        step = 0
        while True:
            step += 1

            action_probs, _, memory_state, carry_state = model(tf.expand_dims(state, 0), memory_state,
                                                               carry_state, training=False)

            # Sample from the policy rather than taking the arg-max action.
            action_dist = tfd.Categorical(probs=action_probs)
            action_index = int(action_dist.sample()[0])

            action = _eval_action(env.action_space, action_index)

            observation, reward, done, info = env.step(action)
            render(observation['pov'])

            state = _encode_state(observation)
            reward_sum += reward

            if done:
                print("Total reward: {:.2f},  Total step: {:.2f}".format(reward_sum, step))
                break
finally:
    # Release the environment even when a rollout raises.
    env.close()