In [1]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
import glob
import random
from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
import gym
import minerl
import os
import cv2
import tqdm
import tensorflow_probability as tfp

tfd = tfp.distributions
from IPython.display import clear_output

# Cap TensorFlow's memory on the first GPU (memory_limit is given in MB).
# The guard keeps the notebook usable on machines where TensorFlow sees no GPU;
# indexing gpus[0] unconditionally would raise IndexError there.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
else:
    print("No GPU visible to TensorFlow; running on CPU.")
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Root directory for TensorBoard logs and saved model weights.
workspace_path = '/home/kimbring2/minecraft_ai'

# Summary writer used by the training loop to log per-episode reward.
writer = tf.summary.create_file_writer(workspace_path + "/tensorboard")

env = gym.make('MineRLNavigateDense-v0')

2021-10-20 00:57:14.179986: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-10-20 00:57:15.692808: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-10-20 00:57:15.693337: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-10-20 00:57:15.720979: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-20 00:57:15.721205: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: GeForce RTX 3060 Laptop GPU computeCapability: 8.6
coreClock: 1.402GHz coreCount: 30 deviceMemorySize: 5.81GiB deviceMemoryBandwidth: 268.26GiB/s
2021-10-20 00:57:15.721223: I tensorflow/stream_executor/platfor

In [2]:
class ActorCritic(tf.keras.Model):
  """Combined actor-critic network.

  Three convolutions reduce the input image to a feature map that is reshaped
  into a 16-step sequence of 32 features and read by an LSTM. The flattened
  LSTM outputs pass through a shared dense layer and then two linear heads:
  `actor` (one unnormalised logit per action) and `critic` (a scalar value).

  The reshape assumes the convolution stack yields a 4x4x32 map, which is what
  a 64x64 input produces with the kernel sizes and strides used here.
  """
  def __init__(
      self, 
      num_actions: int, 
      num_hidden_units: int):
    """Initialize.

    Args:
      num_actions: number of discrete actions (width of the actor head).
      num_hidden_units: width of the shared dense layer before both heads.
    """
    super().__init__()

    self.num_actions = num_actions
    # Stored so that get_config() can report it; previously this attribute was
    # read there without ever being assigned.
    self.num_hidden_units = num_hidden_units
    
    self.conv_1 = layers.Conv2D(16, 8, 4, padding="valid", activation="relu", kernel_regularizer='l2')
    self.conv_2 = layers.Conv2D(32, 4, 2, padding="valid", activation="relu", kernel_regularizer='l2')
    self.conv_3 = layers.Conv2D(32, 3, 1, padding="valid", activation="relu", kernel_regularizer='l2')
    
    self.lstm = layers.LSTM(128, return_sequences=True, return_state=True, kernel_regularizer='l2')
    
    self.common = layers.Dense(num_hidden_units, activation="relu", kernel_regularizer='l2')
    self.actor = layers.Dense(num_actions, kernel_regularizer='l2')
    self.critic = layers.Dense(1, kernel_regularizer='l2')

  def get_config(self):
    """Return the constructor arguments (plus any base-class config)."""
    try:
      config = dict(super().get_config())
    except NotImplementedError:
      # NOTE(review): some TensorFlow releases raise here for subclassed
      # models; fall back to just our own arguments in that case.
      config = {}
    config.update({
        'num_actions': self.num_actions,
        'num_hidden_units': self.num_hidden_units
    })
    return config
    
  def call(self, inputs: tf.Tensor, memory_state: tf.Tensor, carry_state: tf.Tensor,
           training=None) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]:
    """Forward pass.

    Args:
      inputs: batch of images, channels last.
      memory_state: first element of the LSTM initial state, one row per batch item.
      carry_state: second element of the LSTM initial state, same shape.
      training: forwarded to the LSTM layer.

    Returns:
      (actor logits, critic value, final memory state, final carry state).
      The two states are the LSTM's states *after* processing `inputs`, so the
      caller can feed them into the next step.
    """
    conv_1 = self.conv_1(inputs)
    conv_2 = self.conv_2(conv_1)
    conv_3 = self.conv_3(conv_2)
    # Treat the 4x4 spatial grid as a sequence of 16 steps with 32 features each.
    conv_3_reshaped = layers.Reshape((4*4,32))(conv_3)
    
    initial_state = (memory_state, carry_state)
    lstm_output, final_memory_state, final_carry_state  = self.lstm(conv_3_reshaped, initial_state=initial_state, 
                                                                    training=training)
    X_input = layers.Flatten()(lstm_output)
    x = self.common(X_input)
    
    # Return the updated recurrent state; returning the inputs unchanged would
    # leave the LSTM without any memory across successive calls.
    return self.actor(x), self.critic(x), final_memory_state, final_carry_state

In [3]:
# Size of the discrete action set and width of the shared dense layer.
num_actions = 3
num_hidden_units = 512

# Single network instance shared by the rollout loop and the learner.
model = ActorCritic(num_actions=num_actions, num_hidden_units=num_hidden_units)

# Replay storage: a flat list that the rollout loop appends experience tuples to.
memory = list()

In [4]:
def discount_rewards(reward, dones, gamma=0.99):
    """Compute normalised gamma-discounted returns.

    Walks the sequence backwards accumulating ``r[i] + gamma * G[i+1]``; the
    accumulation is cut wherever ``dones[i]`` is truthy so returns do not leak
    across episode boundaries. If the returns are not all identical they are
    standardised to zero mean and unit standard deviation.

    Args:
        reward: sequence of per-step rewards (list or array, int or float).
        dones: sequence of the same length; truthy where the episode ended.
        gamma: discount rate.

    Returns:
        A float NumPy array of the same length as ``reward``. Floating-point
        input keeps its dtype; any other input is promoted to float64.
    """
    reward = np.asarray(reward)
    # Integer rewards must be promoted: an integer output buffer would truncate
    # every discounted value and make the normalisation below fail.
    if not np.issubdtype(reward.dtype, np.floating):
        reward = reward.astype(np.float64)

    discounted_r = np.zeros_like(reward)
    if reward.size == 0:
        return discounted_r

    running_add = 0.0
    for i in reversed(range(len(reward))):
        running_add = running_add * gamma * (1.0 - float(dones[i])) + reward[i]
        discounted_r[i] = running_add

    std = np.std(discounted_r)
    if std != 0:
        # Subtracting the mean does not change the standard deviation, so the
        # value computed above can be reused.
        discounted_r = (discounted_r - np.mean(discounted_r)) / std

    return discounted_r


def take_vector_elements(vectors, indices):
    """Select one entry per row: result[k] is vectors[k, indices[k]]."""
    row_count = tf.shape(vectors)[0]
    row_ids = tf.range(row_count)
    # Pair every row number with the column requested for that row.
    coordinates = tf.stack([row_ids, indices], axis=1)
    return tf.gather_nd(vectors, coordinates)


def render(obs):
    """Display one RGB frame in the OpenCV window named 'obs'."""
    # Convert RGB to BGR channel order before handing the frame to imshow.
    bgr_frame = cv2.cvtColor(obs, cv2.COLOR_RGB2BGR)
    cv2.imshow('obs', bgr_frame)
    # A 1 ms wait gives the GUI event loop a chance to draw the frame.
    cv2.waitKey(1)

In [5]:
# Mean-squared-error loss object and the Adam optimizer applied by update().
mse_loss = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

@tf.function
def update(states, actions, agent_policies, rewards, dones, memory_states, carry_states):
    """Run one V-trace (off-policy corrected) actor-critic gradient step.

    The inputs describe T consecutive transitions taken from the replay list.
    Entry i holds the state seen at step i, the action chosen there, the
    behaviour policy's logits for that state, and the reward / done flag that
    resulted from that action (this is how the rollout loop stores them).

    The network is re-run over the T states in order, starting from the
    recurrent state recorded for the first one. The first T-1 steps form the
    training sequence and the value of the last state is the bootstrap value.

    Args:
        states: (T, H, W, C) observations.
        actions: (T,) integer action indices.
        agent_policies: (T, num_actions) behaviour-policy logits.
        rewards: (T,) rewards.
        dones: (T,) booleans, True where the episode ended after the action.
        memory_states: (T, units) recorded LSTM memory states; only row 0 is used.
        carry_states: (T, units) recorded LSTM carry states; only row 0 is used.

    Returns:
        The scalar total loss (actor + critic) that was minimised.
    """
    states = tf.convert_to_tensor(states, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)
    agent_policies = tf.convert_to_tensor(agent_policies, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    dones = tf.convert_to_tensor(dones, dtype=tf.bool)
    memory_states = tf.convert_to_tensor(memory_states, dtype=tf.float32)
    carry_states = tf.convert_to_tensor(carry_states, dtype=tf.float32)
    
    batch_size = states.shape[0]
    
    # Trainable variables are watched by the tape automatically.
    with tf.GradientTape() as tape:
        learner_policies = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        learner_values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        
        memory_state = tf.expand_dims(memory_states[0], 0)
        carry_state = tf.expand_dims(carry_states[0], 0)
        for i in tf.range(0, batch_size):
            learner_output = model(tf.expand_dims(states[i,:,:,:], 0), memory_state, carry_state,
                                                  training=True)
            learner_policies = learner_policies.write(i, tf.squeeze(learner_output[0]))
            learner_values = learner_values.write(i, tf.squeeze(learner_output[1]))
            
            # The replay slice can span several episodes. The rollout starts
            # every episode from an all-zero recurrent state, so do the same
            # here whenever step i ended an episode.
            episode_ended = dones[i]
            memory_state = tf.where(episode_ended, tf.zeros_like(learner_output[2]), learner_output[2])
            carry_state = tf.where(episode_ended, tf.zeros_like(learner_output[3]), learner_output[3])
        
        learner_policies = learner_policies.stack()
        learner_values = learner_values.stack()
        
        # Steps 0..T-2 are trained on; step T-1 only supplies the bootstrap value.
        # reward[i] and done[i] belong to action[i], so all three use the same
        # [:-1] slice (slicing rewards/dones with [1:] would pair every action
        # with the following step's outcome).
        learner_logits = learner_policies[:-1]
        behaviour_logits = agent_policies[:-1]
        actions = actions[:-1]
        rewards = rewards[:-1]
        dones = dones[:-1]
            
        bootstrap_value = learner_values[-1]
        values = learner_values[:-1]
            
        discounting = 0.99
        # Zero discount at terminal steps stops bootstrapping across episodes.
        discounts = tf.cast(tf.logical_not(dones), tf.float32) * discounting
            
        # log_softmax is the numerically stable form of log(softmax(x)).
        target_action_log_probs = take_vector_elements(tf.nn.log_softmax(learner_logits), actions)
        behaviour_action_log_probs = take_vector_elements(tf.nn.log_softmax(behaviour_logits), actions)

        lambda_ = 1.0
        clip_rho_threshold = tf.constant(1.0, dtype=tf.float32)
        clip_pg_rho_threshold = tf.constant(1.0, dtype=tf.float32)

        # Importance weights between the current (target) and behaviour policy.
        log_rhos = target_action_log_probs - behaviour_action_log_probs
        rhos = tf.math.exp(log_rhos)

        clipped_rhos = tf.minimum(clip_rho_threshold, rhos, name='clipped_rhos')

        cs = tf.minimum(1.0, rhos, name='cs')
        cs *= tf.constant(lambda_, dtype=tf.float32)

        values_t_plus_1 = tf.concat([values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
        deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)

        # Backward recursion: acc_t = delta_t + discount_t * c_t * acc_{t+1}.
        # The Python loop is unrolled at trace time, which needs a static length.
        acc = tf.zeros_like(bootstrap_value)
        vs_minus_v_xs = []
        for t in range(int(discounts.shape[0]) - 1, -1, -1):
            acc = deltas[t] + discounts[t] * cs[t] * acc
            vs_minus_v_xs.append(acc)

        vs_minus_v_xs = tf.stack(vs_minus_v_xs[::-1])

        vs = tf.add(vs_minus_v_xs, values, name='vs')
        vs_t_plus_1 = tf.concat([vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
        clipped_pg_rhos = tf.minimum(clip_pg_rho_threshold, rhos, name='clipped_pg_rhos')

        pg_advantages = (clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))

        # Targets and advantages are treated as constants by the gradient.
        vs = tf.stop_gradient(vs)
        pg_advantages = tf.stop_gradient(pg_advantages)

        actor_loss = -tf.reduce_mean(target_action_log_probs * pg_advantages)

        baseline_cost = 0.5
        v_error = values - vs
        critic_loss = baseline_cost * 0.5 * tf.reduce_mean(tf.square(v_error))

        # NOTE(review): the layers declare kernel_regularizer='l2', but
        # model.losses is not added here, so those penalties do not influence
        # training. Confirm whether that is intended.
        total_loss = actor_loss + critic_loss

    grads = tape.gradient(total_loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    return total_loss


def reinforcement_replay(batch_size=64):
    """Train on one random contiguous slice of the replay list.

    Picks `batch_size` consecutive experiences from the module-level `memory`,
    packs them into fixed-shape NumPy arrays and passes them to `update`.
    Does nothing until `memory` holds more than `batch_size` entries.

    Args:
        batch_size: number of consecutive transitions to train on.
    """
    memory_len = len(memory)
    if memory_len <= batch_size:
        return

    # randint is inclusive at both ends, so the slice always fits in memory.
    start_index = random.randint(0, memory_len - batch_size)
    minibatch = memory[start_index:start_index + batch_size]
    
    states = np.zeros((batch_size, 64, 64, 4), dtype=np.float32)
    actions = np.zeros(batch_size, dtype=np.int32)
    policies = np.zeros((batch_size, num_actions), dtype=np.float32)
    rewards = np.zeros(batch_size, dtype=np.float32)
    # np.bool_ is NumPy's boolean scalar type; the bare np.bool alias is
    # deprecated and absent from newer NumPy releases.
    dones = np.zeros(batch_size, dtype=np.bool_)
    memory_states = np.zeros((batch_size, 128), dtype=np.float32)
    carry_states = np.zeros((batch_size, 128), dtype=np.float32)
      
    # Each experience is (state, action, policy, reward, done, memory_state,
    # carry_state); the leading batch axis of size 1 is dropped on assignment.
    for i, (state, action, policy, reward, done, memory_state, carry_state) in enumerate(minibatch):
        states[i] = state
        actions[i] = action
        policies[i] = policy
        rewards[i] = reward
        dones[i] = done
        memory_states[i] = memory_state
        carry_states[i] = carry_state

    update(states, actions, policies, rewards, dones, memory_states, carry_states)

In [6]:
def reinforcement_train(training_episode):
    """Play one environment episode while collecting experience and training.

    At every step the current frame is shown, the network samples an action,
    the resulting transition is appended to the module-level `memory`, and
    every 10th step `reinforcement_replay` is called. When the episode ends the
    total reward is printed and written to TensorBoard.

    Args:
        training_episode: episode number, used as the TensorBoard step.
    """
    total_reward = 0
    obs = env.reset()
    
    # Every episode starts from an all-zero recurrent state.
    memory_state = tf.zeros([1,128], dtype=tf.dtypes.float32)
    carry_state = tf.zeros([1,128], dtype=tf.dtypes.float32)
    
    total_step = 0
    while True:
        render(obs['pov'])
        
        pov_array = obs['pov'] / 255.0
        
        # Broadcast the scaled compass angle into a constant image plane so it
        # can be stacked with the frame as an extra channel.
        compassAngle_array = obs['compass']['angle'] / 360.0
        compassAngle_array = np.ones((64,64,1)) * compassAngle_array
        
        state_array = np.concatenate((pov_array, compassAngle_array), 2)
        # float32 matches the replay batch dtype and halves the memory that
        # each stored state occupies compared with the float64 default.
        state_array = np.expand_dims(state_array, 0).astype(np.float32)
        
        prediction = model(state_array, memory_state, carry_state, training=False)
        act_pi = prediction[0]
        next_memory_state = prediction[2]
        next_carry_state = prediction[3]
        
        # categorical() returns shape (1, 1); take the single element explicitly.
        action_index = int(tf.random.categorical(act_pi, 1)[0, 0])
        
        action = env.action_space.noop()
        if (action_index == 0):
            action['camera'] = [0, -2]
        elif (action_index == 1):
            action['camera'] = [0, 2]
        elif (action_index == 2):
            action['forward'] = 1
            action['jump'] = 1
            action['attack'] = 1
            
        obs_1, reward, done, info = env.step(action)
        
        total_reward += reward
        
        # The recurrent state stored is the one the network was given for this
        # state, i.e. the state from before the step.
        experience = state_array, action_index, act_pi, reward, done, memory_state, carry_state
        memory.append((experience))
        
        obs = obs_1
        memory_state = next_memory_state
        carry_state = next_carry_state
        if done:
            print("total_reward: ", total_reward)
            
            with writer.as_default():
                tf.summary.scalar("total_reward", total_reward, step=training_episode)
                writer.flush()
                
            break
        
        if total_step % 10 == 0:
            # train model
            reinforcement_replay()

        total_step += 1
    
    clear_output(wait=True)

            
# Driver loop: run up to max_episodes training episodes under a tqdm progress bar.
max_episodes = 200000
with tqdm.trange(max_episodes) as t:
  for i in t:
    #print("i: ", i)
    # The episode index is also used as the TensorBoard step inside reinforcement_train.
    reinforcement_train(i)
    
    # Save the weights every 100 episodes (including episode 0); the file is
    # named after the episode index under <workspace_path>/model/.
    if i % 100 == 0:
        model.save_weights(workspace_path + '/model/' + str(i))
        clear_output(wait=True)

  0%|                                 | 1/200000 [07:45<25873:23:18, 465.72s/it]


KeyboardInterrupt: 

In [None]:
import collections
import gym
import numpy as np
import statistics
import tensorflow as tf
import tqdm
import glob
import random
import cv2

from matplotlib import pyplot as plt
from tensorflow.keras import layers
from typing import Any, List, Sequence, Tuple
import tensorflow_probability as tfp

tfd = tfp.distributions

import gym
import minerl

model.load_weights(workspace_path + "/model/supervised_model_12000")

# Create the environment
env = gym.make('MineRLNavigateDense-v0')

seed = 980
env.seed(seed)
tf.random.set_seed(seed)
np.random.seed(seed)


def _observation_to_state(observation):
    """Build the (64, 64, 4) float32 network input from one observation."""
    pov = observation['pov'] / 255.0
    # The scaled compass angle becomes a constant fourth image channel.
    compass_plane = np.ones((64,64,1)) * (observation['compass']['angle'] / 360.0)
    return tf.constant(np.concatenate((pov, compass_plane), 2), dtype=tf.float32)


reward_sum = 0
try:
    for i_episode in range(0, 10000):
        observation = env.reset()
        state = _observation_to_state(observation)

        # Fresh recurrent state at the start of every episode.
        memory_state = tf.zeros([1,128], dtype=np.float32)
        carry_state = tf.zeros([1,128], dtype=np.float32)
        step = 0
        while True:
            step += 1

            state = tf.expand_dims(state, 0)
            action_logits, _, memory_state, carry_state = model(state, memory_state, carry_state,
                                                                training=False)

            # The actor head is a Dense layer without activation, so its
            # outputs are unnormalised logits and must be passed as `logits`;
            # passing them as `probs` would treat arbitrary (possibly negative)
            # numbers as probabilities.
            action_dist = tfd.Categorical(logits=action_logits)
            action_index = int(action_dist.sample()[0])

            action = env.action_space.noop()
            if (action_index == 0):
                action['camera'] = [0, -5]
            elif (action_index == 1):
                action['camera'] = [0, 5]
            elif (action_index == 2):
                action['forward'] = 1
            elif (action_index == 3):
                # Not reachable while the model is built with num_actions = 3.
                action['jump'] = 1

            observation_1, reward, done, info = env.step(action)
            render(observation_1['pov'])

            reward_sum += reward

            state = _observation_to_state(observation_1)
            if done:
                print("Total reward: {:.2f},  Total step: {:.2f}".format(reward_sum, step))
                step = 0
                reward_sum = 0  
                #observation = env.reset()
                break
finally:
    # Release the environment even if evaluation is interrupted.
    env.close()