In [None]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
import glob
import random
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
import gym
import minerl
import os
import cv2
import tqdm
import tensorflow_probability as tfp

tfd = tfp.distributions
from IPython.display import clear_output

# Optional per-GPU memory cap, currently disabled:
#gpus = tf.config.experimental.list_physical_devices('GPU')
#tf.config.experimental.set_virtual_device_configuration(gpus[0],
#            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
# Hide every CUDA device from this process; the log captured below reports
# "no CUDA-capable device is detected" as a consequence.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Root directory under which TensorBoard logs and model weights are written.
workspace_path = '/home/kimbring2/minecraft_ai'

# Summary writer used by the training loop to log the per-episode reward.
writer = tf.summary.create_file_writer(workspace_path + "/tensorboard")

# Gym environment shared by the training cells below.
env = gym.make('MineRLNavigateDense-v0')

2021-10-18 04:52:21.570215: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-10-18 04:52:22.968496: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-18 04:52:22.968999: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-10-18 04:52:22.998811: E tensorflow/stream_executor/cuda/cuda_driver.cc:328] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-10-18 04:52:22.998877: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: kimbring2-GF75-Thin-10UEK
2021-10-18 04:52:22.998892: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: kimbring2-GF75-Thin-10UEK
2021-10-18 04:52:22.999031: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 460.91.3
2021-

In [None]:
class ActorCritic(tf.keras.Model):
  """Combined actor-critic network.

  Three convolutions encode the input image, the resulting 4x4 feature map is
  read as a 16-step sequence by an LSTM, and a shared dense layer feeds two
  heads: unnormalised action logits (actor) and a scalar state value (critic).
  """
  def __init__(
      self, 
      num_actions: int, 
      num_hidden_units: int):
    """Initialize.

    Args:
      num_actions: number of logits produced by the actor head.
      num_hidden_units: width of the dense layer shared by both heads.
    """
    super().__init__()

    self.num_actions = num_actions
    # Kept so that get_config() can report it (it used to raise AttributeError).
    self.num_hidden_units = num_hidden_units
    
    self.conv_1 = layers.Conv2D(16, 8, 4, padding="valid", activation="relu", kernel_regularizer='l2')
    self.conv_2 = layers.Conv2D(32, 4, 2, padding="valid", activation="relu", kernel_regularizer='l2')
    self.conv_3 = layers.Conv2D(32, 3, 1, padding="valid", activation="relu", kernel_regularizer='l2')
    
    self.lstm = layers.LSTM(128, return_sequences=True, return_state=True, kernel_regularizer='l2')
    
    self.common = layers.Dense(num_hidden_units, activation="relu", kernel_regularizer='l2')
    self.actor = layers.Dense(num_actions, kernel_regularizer='l2')
    self.critic = layers.Dense(1, kernel_regularizer='l2')

  def get_config(self):
    """Return the constructor arguments merged into the base Keras config."""
    try:
      config = dict(super().get_config())
    except NotImplementedError:
      # Some Keras releases do not implement get_config() for subclassed
      # models; fall back to reporting only our own arguments.
      config = {}
    config.update({
        'num_actions': self.num_actions,
        'num_hidden_units': self.num_hidden_units
    })
    return config
    
  def call(self, inputs: tf.Tensor, memory_state: tf.Tensor, carry_state: tf.Tensor,
           training=False) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
    """Run one forward pass.

    Args:
      inputs: image batch; the reshape below requires the convolutional stack
        to produce a 4x4x32 feature map (which a 64x64 input does).
      memory_state: LSTM hidden state used as the initial state, (batch, 128).
      carry_state: LSTM cell state used as the initial state, (batch, 128).
      training: forwarded to the LSTM; defaults to inference mode.

    Returns:
      (action logits, state value, final LSTM hidden state, final LSTM cell
      state). The final states are the ones produced by this call so that the
      caller can feed them into the next step.
    """
    conv_1 = self.conv_1(inputs)
    conv_2 = self.conv_2(conv_1)
    conv_3 = self.conv_3(conv_2)
    # Treat the 4x4 spatial grid as a sequence of 16 feature vectors.
    conv_3_reshaped = layers.Reshape((4*4,32))(conv_3)
    
    initial_state = (memory_state, carry_state)
    lstm_output, final_memory_state, final_carry_state = self.lstm(
        conv_3_reshaped, initial_state=initial_state, training=training)
    X_input = layers.Flatten()(lstm_output)
    x = self.common(X_input)
    
    # Return the updated recurrent state; returning the inputs here would make
    # the network memoryless across environment steps.
    return self.actor(x), self.critic(x), final_memory_state, final_carry_state

In [None]:
# Number of discrete actions (size of the actor head's output).
num_actions = 3
# Width of the dense layer shared by the actor and critic heads.
num_hidden_units = 512

# Single global model instance used by the training and evaluation cells.
model = ActorCritic(num_actions, num_hidden_units)

In [None]:
def discount_rewards(reward, dones, gamma=0.99):
    """Compute normalised gamma-discounted returns.

    Args:
        reward: sequence of per-step rewards (ints or floats).
        dones: sequence of per-step terminal flags; a truthy flag at step i
            stops later rewards from leaking into the return of step i.
        gamma: discount rate.

    Returns:
        A float64 numpy array of the same length as `reward` holding the
        discounted returns, standardised to zero mean and unit variance
        unless their standard deviation is zero (or the input is empty).
    """
    # Always accumulate in floating point: with integer rewards the returns
    # would otherwise be truncated and the in-place normalisation would fail.
    rewards = np.asarray(reward, dtype=np.float64)
    discounted_r = np.zeros_like(rewards)
    if rewards.size == 0:
        return discounted_r

    running_add = 0.0
    for i in reversed(range(len(rewards))):
        if dones[i]:
            # Episode boundary: nothing after this step belongs to it.
            running_add = 0.0
        running_add = running_add * gamma + rewards[i]
        discounted_r[i] = running_add

    std = np.std(discounted_r)
    if std != 0:
        discounted_r -= np.mean(discounted_r)  # centre the returns
        discounted_r /= std                    # scale to unit variance

    return discounted_r


def take_vector_elements(vectors, indices):
    """Pick one element per row: result[k] == vectors[k, indices[k]]."""
    row_ids = tf.range(tf.shape(vectors)[0])
    # Pair every row number with the column requested for that row.
    coordinates = tf.stack([row_ids, indices], axis=1)
    return tf.gather_nd(vectors, coordinates)


def render(obs):
    """Display an RGB frame in the OpenCV window named 'obs'."""
    # OpenCV expects BGR channel order.
    frame_bgr = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
    cv2.imshow('obs', frame_bgr)
    # Give the GUI event loop 1 ms so the window actually refreshes.
    cv2.waitKey(1)

In [None]:
# NOTE(review): mse_loss is not referenced by any cell visible in this
# notebook; the critic loss in get_loss is computed by hand.
mse_loss = tf.keras.losses.MeanSquaredError()
# Adam optimizer (learning rate 1e-4) shared by all policy updates.
optimizer = tf.keras.optimizers.Adam(0.0001)

@tf.function
def get_loss(state_array, action_list, memory_state, carry_state, discounted_r):
    """Compute the actor-critic loss over one chunk of consecutive steps.

    The states are fed through the global `model` one at a time so that the
    recurrent state is threaded from each step to the next.

    Args:
        state_array: batch of consecutive states, first axis is time.
        action_list: integer action index taken at each step, shape [steps].
        memory_state: LSTM hidden state at the start of the chunk.
        carry_state: LSTM cell state at the start of the chunk.
        discounted_r: discounted return for each step, shape [steps].

    Returns:
        (total_loss, memory_state, carry_state) where the two states are the
        ones returned by the model after the last step of the chunk.
    """
    batch_size = state_array.shape[0]
    
    policies = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
    for i in tf.range(0, batch_size):
        model_input = tf.expand_dims(state_array[i,:,:,:], 0)
        
        prediction = model(model_input, memory_state, carry_state, training=True)
        policy = prediction[0]
        value = prediction[1]
        memory_state = prediction[2]
        carry_state = prediction[3]

        policies = policies.write(i, policy[0])
        values = values.write(i, tf.squeeze(value))
            
    policies = policies.stack()   # [steps, num_actions] action logits
    values = values.stack()       # [steps] state values
    
    # Normalise over the action axis. log_softmax is used instead of
    # log(softmax(.)) to avoid -inf/NaN when a probability underflows.
    policies_log_softmax = tf.nn.log_softmax(policies)
    policies_softmax = tf.nn.softmax(policies)
    
    # log pi(a_t | s_t): pick the log-probability of the action actually taken.
    # (Applying softmax to the already-selected logits would normalise across
    # time steps, which is not a probability over actions.)
    logits_selected_logs = take_vector_elements(policies_log_softmax, action_list)

    discounted_r = tf.cast(discounted_r, 'float32')
    
    advantages = discounted_r - values
    
    # The advantage is treated as a constant for the policy gradient.
    actor_loss = -tf.math.reduce_mean(logits_selected_logs * tf.stop_gradient(advantages)) 
    actor_loss = tf.cast(actor_loss, 'float32')
    
    critic_loss = tf.reduce_mean(tf.square(values - discounted_r))
    critic_loss = tf.cast(critic_loss, 'float32')
    
    # Policy entropy averaged over every (step, action) entry; it is subtracted
    # below so that minimising the loss encourages exploration.
    entropy_loss = -tf.math.reduce_mean(policies_softmax * policies_log_softmax)
    
    total_loss = actor_loss + 0.5 * critic_loss - 0.01 * entropy_loss

    return total_loss, memory_state, carry_state


def reinforcement_replay(state_list, action_list, memory_state, carry_state, reward_list, done_list):
    """Update the global model from one recorded rollout.

    The rollout is processed in consecutive chunks of at most 64 steps; one
    optimizer step is taken per chunk and the recurrent state returned by
    `get_loss` is carried from each chunk into the next.

    Args:
        state_list: list of state arrays, each with a leading batch axis of 1.
        action_list: list of integer action indices, one per step.
        memory_state: LSTM hidden state at the first step of the rollout.
        carry_state: LSTM cell state at the first step of the rollout.
        reward_list: list of per-step rewards.
        done_list: list of per-step terminal flags.
    """
    if len(state_list) == 0:
        # Nothing was recorded, so there is nothing to learn from.
        return

    state_array = tf.concat(state_list, 0)
    # Build a flat int32 index vector; unlike tf.concat this also works when
    # the list holds plain Python scalars outside eager mode.
    action_array = tf.reshape(tf.cast(tf.convert_to_tensor(action_list), tf.int32), [-1])
    
    memory_state = tf.concat(memory_state, 0)
    carry_state = tf.concat(carry_state, 0)
    
    discounted_r = tf.convert_to_tensor(discount_rewards(reward_list, done_list))
    
    divide_size = 64
    batch_size = state_array.shape[0]
    # The last chunk may be shorter than divide_size; it is handled by the
    # same code path as the full chunks.
    for start in range(0, batch_size, divide_size):
        end = min(start + divide_size, batch_size)
        with tf.GradientTape() as tape:
            total_loss, memory_state, carry_state = get_loss(state_array[start:end,:,:,:],
                                                             action_array[start:end],
                                                             memory_state, carry_state,
                                                             discounted_r[start:end])
        
        # Differentiate outside the tape context so the backward pass itself
        # is not recorded.
        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

In [None]:
def _observation_to_state(obs):
    """Encode an observation as the network input with a leading batch axis.

    The scaled 'pov' image gets one extra channel filled with the scaled
    compass angle.
    """
    pov_array = obs['pov'] / 255.0
    compassAngle_array = np.ones((64,64,1)) * (obs['compass']['angle'] / 360.0)
    state_array = np.concatenate((pov_array, compassAngle_array), 2)
    return np.expand_dims(state_array, 0)


def _index_to_action(action_index):
    """Translate a discrete action index into an environment action dict."""
    action = env.action_space.noop()
    if action_index == 0:
        action['camera'] = [0, -2]
    elif action_index == 1:
        action['camera'] = [0, 2]
    elif action_index == 2:
        action['forward'] = 1
        action['jump'] = 1
        action['attack'] = 1
    return action


def reinforcement_train(training_episode):
    """Play one episode with the current policy, then train on it.

    Args:
        training_episode: episode counter, used as the TensorBoard step.
    """
    total_reward = 0
    obs = env.reset()
    
    state_list, action_list, reward_list, done_list = [], [], [], []
    
    # The recurrent state starts at zero; the same initial state is handed to
    # the replay so that training re-runs the episode from the same start.
    initial_memory_state = tf.zeros([1,128], dtype=np.float32)
    initial_carry_state = tf.zeros([1,128], dtype=np.float32)
    memory_state = initial_memory_state
    carry_state = initial_carry_state

    done = False
    while not done:
        render(obs['pov'])
        
        state_array = _observation_to_state(obs)
        
        prediction = model(state_array, memory_state, carry_state, training=False)
        act_pi = prediction[0]
        memory_state = prediction[2]
        carry_state = prediction[3]
        
        # Sample an action from the categorical distribution given by the logits.
        action_index = int(tf.random.categorical(act_pi, 1))
            
        obs, reward, done, info = env.step(_index_to_action(action_index))
        
        total_reward += reward
        
        state_list.append(state_array)
        action_list.append(action_index)
        reward_list.append(reward)
        done_list.append(done)

    print("total_reward: ", total_reward)
    
    with writer.as_default():
        tf.summary.scalar("total_reward", total_reward, step=training_episode)
        writer.flush()
        
    reinforcement_replay(state_list, action_list, initial_memory_state, initial_carry_state, 
                         reward_list, done_list)

    clear_output(wait=True)

            
# Upper bound on the number of training episodes.
max_episodes = 200000
with tqdm.trange(max_episodes) as t:
    for i in t:
        reinforcement_train(i)

        # Only every 100th episode (starting with episode 0) is checkpointed.
        if i % 100 != 0:
            continue

        model.save_weights(f"{workspace_path}/model/{i}")
        clear_output(wait=True)

  0%|                                  | 3/200000 [05:21<5561:36:39, 100.11s/it]

In [None]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
import glob
import random
import cv2

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
import tensorflow_probability as tfp

tfd = tfp.distributions

import gym
import minerl

# Evaluate a saved policy. This cell relies on `model`, `workspace_path` and
# `render` defined by earlier cells.
model.load_weights(workspace_path + "/model/supervised_model_12000")

# Create the environment
env = gym.make('MineRLNavigateDense-v0')

seed = 980
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)

reward_sum = 0
try:
    for i_episode in range(0, 10000):
        observation = env.reset()
        
        pov_state = observation['pov'] / 255.0
        compassAngle = observation['compass']['angle'] / 360.0
        compassAngle_state = np.ones((64,64,1)) * compassAngle
            
        state = np.concatenate((pov_state, compassAngle_state), 2)
        state = tf.constant(state, dtype=tf.float32)
        
        # Fresh recurrent state at the start of every episode.
        memory_state = tf.zeros([1,128], dtype=np.float32)
        carry_state = tf.zeros([1,128], dtype=np.float32)
        step = 0
        while True:
            step += 1

            state = tf.expand_dims(state, 0)
            action_logits, _, memory_state, carry_state = model(state, memory_state, carry_state,
                                                                training=False)
            
            # The actor head has no activation, so its output is unnormalised
            # logits and must be passed as `logits`, not `probs`.
            action_dist = tfd.Categorical(logits=action_logits)
            action_index = int(action_dist.sample()[0])
            
            action = env.action_space.noop()
            if (action_index == 0):
                action['camera'] = [0, -5]
            elif (action_index == 1):
                action['camera'] = [0, 5]
            elif (action_index == 2):
                action['forward'] = 1
            elif (action_index == 3):
                # NOTE(review): `model` is built with num_actions = 3 earlier
                # in this notebook, so index 3 cannot be sampled unless the
                # model is rebuilt with four actions -- confirm.
                action['jump'] = 1
                
            observation_1, reward, done, info = env.step(action)
            render(observation_1['pov'])
            
            pov_next_state = observation_1['pov'] / 255.0
            compassAngle = observation_1['compass']['angle'] / 360.0
            compassAngle_next_state = np.ones((64,64,1)) * compassAngle
            
            next_state = np.concatenate((pov_next_state, compassAngle_next_state), 2)
            next_state = tf.constant(next_state, dtype=tf.float32)
            
            reward_sum += reward

            state = next_state
            if done:
                print("Total reward: {:.2f},  Total step: {:.2f}".format(reward_sum, step))
                step = 0
                reward_sum = 0  
                break
finally:
    # Release the environment even if the loop is interrupted or fails.
    env.close()