In [1]:
# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
import random
import gym
import pylab
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten, LSTM, Reshape
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2
import threading
from threading import Thread, Lock
import time
import tensorflow_probability as tfp
from typing import Any, List, Sequence, Tuple
import deepmind_lab

# Limit how much memory TensorFlow may take on the first GPU. The guard keeps
# CPU-only machines working: indexing an empty device list would raise
# IndexError before anything else in the script had a chance to run.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=4000)])
# NOTE(review): this is set after `import tensorflow`; it may need to be set
# before the import to silence the native start-up logging - confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
#os.environ['CUDA_VISIBLE_DEVICES'] = '-1' # -1:cpu, 0:first gpu


# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions


class ActorCritic(tf.keras.Model):
    """Convolutional + LSTM network with a policy head and a value head.

    Two strided convolutions encode the input, the feature map is regrouped
    into a sequence of 225 steps with 8 features each, an LSTM with 128 units
    processes that sequence, and two linear heads read the flattened LSTM
    output: one emits `action_space` logits, the other a single value.
    """

    def __init__(self, input_shape, action_space):
        """Create the layers.

        Args:
          input_shape: accepted for interface compatibility; not read here.
          action_space: width of the action-logit head.
        """
        super().__init__()

        # Encoder: two 3x3 convolutions with stride 2 and no padding.
        self.conv_1 = Conv2D(filters=4, kernel_size=3, strides=2,
                             padding="valid", activation="relu")
        self.conv_2 = Conv2D(filters=8, kernel_size=3, strides=2,
                             padding="valid", activation="relu")

        # Recurrent core; it returns the full output sequence plus both
        # final states so the caller can carry them to the next step.
        self.lstm = LSTM(128, name="lstm", return_sequences=True, return_state=True)

        self.flatten = Flatten()
        # The fixed target shape requires the conv output to hold exactly
        # 15*15*8 values per sample (assumes 64x64 inputs - TODO confirm).
        self.reshape = Reshape((225, 8))
        self.dense_0 = Dense(512, activation='relu')
        self.dense_1 = Dense(action_space)  # action logits
        self.dense_2 = Dense(1)             # value estimate

    def call(self, X_input, memory_state, carry_state):
        """Run one forward pass.

        Args:
          X_input: input batch fed to the first convolution.
          memory_state: first element of the LSTM initial state.
          carry_state: second element of the LSTM initial state.

        Returns:
          Tuple `(action_logit, value, next_memory_state, next_carry_state)`.
        """
        features = self.conv_2(self.conv_1(X_input))
        sequence = self.reshape(self.flatten(features))

        lstm_output, next_memory_state, next_carry_state = self.lstm(
            sequence, (memory_state, carry_state))

        hidden = self.dense_0(self.flatten(lstm_output))
        action_logit = self.dense_1(hidden)
        value = self.dense_2(hidden)

        return action_logit, value, next_memory_state, next_carry_state


def safe_log(x):
  """Elementwise natural log that yields 0 wherever x is exactly zero."""
  is_zero = tf.math.equal(x, 0)
  # The input is clamped to 1e-12 before the log, so the branch that
  # tf.where discards for zero entries is still finite.
  clamped_log = tf.math.log(tf.math.maximum(1e-12, x))
  return tf.where(is_zero, tf.zeros_like(x), clamped_log)


def take_vector_elements(vectors, indices):
    """
    Pick one component from every row of a batch of vectors.
    Args:
      vectors: a [batch x dims] Tensor.
      indices: an int32 Tensor with `batch` entries; entry i selects the
        component taken from row i.
    Returns:
      A Tensor with `batch` entries, one for each vector.
    """
    row_ids = tf.range(tf.shape(vectors)[0])
    # Pair every row number with its requested column: (row, column).
    coordinates = tf.stack([row_ids, indices], axis=1)
    return tf.gather_nd(vectors, coordinates)


# Keras loss objects, created once when the module is executed.
huber_loss = tf.keras.losses.Huber(
    reduction=tf.keras.losses.Reduction.SUM,
)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction=tf.keras.losses.Reduction.SUM,
)
mse_loss = tf.keras.losses.MeanSquaredError()

# TensorBoard summaries are written below the workspace directory.
workspace_path = '/home/kimbring2/stabilizing_transformers'
writer = tf.summary.create_file_writer(f"{workspace_path}/tensorboard/pong")

def _action(*entries):
  return np.array(entries, dtype=np.intc)

# Discrete action set: each name maps to a 7-component action vector. Reading
# the table, the components are (look left/right, look up/down, strafe,
# forward/backward, fire, jump, crouch).
ACTIONS = dict(
    look_left=_action(-20, 0, 0, 0, 0, 0, 0),
    look_right=_action(20, 0, 0, 0, 0, 0, 0),
    look_up=_action(0, 10, 0, 0, 0, 0, 0),
    look_down=_action(0, -10, 0, 0, 0, 0, 0),
    strafe_left=_action(0, 0, -1, 0, 0, 0, 0),
    strafe_right=_action(0, 0, 1, 0, 0, 0, 0),
    forward=_action(0, 0, 0, 1, 0, 0, 0),
    backward=_action(0, 0, 0, -1, 0, 0, 0),
    fire=_action(0, 0, 0, 0, 1, 0, 0),
    jump=_action(0, 0, 0, 0, 0, 1, 0),
    crouch=_action(0, 0, 0, 0, 0, 0, 1),
)

# Action names in insertion order, so an integer index can select an action.
ACTION_LIST = [*ACTIONS]

class A3CAgent:
    # Actor-Critic Main Optimization Algorithm
    def __init__(self, env_name):
        """Create the environment, hyper-parameters, bookkeeping and networks.

        Args:
          env_name: level name handed to `deepmind_lab.Lab`; also used to
            build the model file name.
        """
        # Environment: 64x64 interleaved RGB observations at 15 fps.
        self.env_name = env_name
        self.env = deepmind_lab.Lab(self.env_name, ['RGB_INTERLEAVED'],
                       {'fps': '15', 'width': '64', 'height': '64'})

        # Training parameters.
        self.action_size = len(ACTION_LIST)
        # NOTE(review): the original comment marks the -21.0 starting value of
        # max_average as pong-specific - confirm it suits this environment.
        self.EPISODES, self.episode, self.max_average = 20000, 0, -21.0
        self.lock = Lock()  # shared by the worker threads
        self.learning_rate = 0.0001

        self.ROWS = 64
        self.COLS = 64
        self.REM_STEP = 3

        # Instantiate plot memory
        self.scores, self.episodes, self.average = [], [], []

        self.Save_Path = 'Models'
        self.state_size = (self.REM_STEP, self.ROWS, self.COLS)

        # exist_ok avoids the check-then-create race of testing for the
        # directory first and creating it afterwards.
        os.makedirs(self.Save_Path, exist_ok=True)
        self.path = '{}_A3C_{}'.format(self.env_name, self.learning_rate)
        self.model_name = os.path.join(self.Save_Path, self.path)

        # Two identically shaped networks: `policy` is the one being
        # optimised, `policy_old` is the copy used to act.
        self.policy = ActorCritic(input_shape=self.state_size, action_space=self.action_size)
        self.policy_old = ActorCritic(input_shape=self.state_size, action_space=self.action_size)

        self.optimizer = tf.keras.optimizers.Adam(self.learning_rate)

    @tf.function
    def act_old(self, state, memory_state, carry_state, training):
        """Sample an action from `policy_old`.

        Args:
          state: input passed to the network.
          memory_state: first LSTM state passed to the network.
          carry_state: second LSTM state passed to the network.
          training: forwarded to the network call as `training`.

        Returns:
          `(action, prediction)`: the sampled action taken from position
          [0][0] of the categorical sample, and the full network output.
        """
        prediction = self.policy_old(state, memory_state, carry_state, training=training)
        logits = prediction[0]
        # One draw per row of logits.
        sampled = tf.random.categorical(logits, 1)

        return sampled[0][0], prediction
    
    @tf.function
    def act_new(self, state, memory_state, carry_state, training):
        """Sample an action from `policy`.

        Args:
          state: input passed to the network.
          memory_state: first LSTM state passed to the network.
          carry_state: second LSTM state passed to the network.
          training: forwarded to the network call as `training`.

        Returns:
          `(action, prediction)`: the sampled action taken from position
          [0][0] of the categorical sample, and the full network output.
        """
        prediction = self.policy(state, memory_state, carry_state, training=training)
        logits = prediction[0]
        # One draw per row of logits.
        sampled = tf.random.categorical(logits, 1)

        return sampled[0][0], prediction

    def discount_rewards(self, reward, gamma=0.99, reset_on_nonzero=True):
        """Compute normalised, gamma-discounted returns for a reward sequence.

        Args:
          reward: sequence of per-step rewards (ints or floats).
          gamma: discount rate applied per step.
          reset_on_nonzero: when True (the default, matching the original
            pong-specific behaviour) the running sum restarts at every
            non-zero reward, treating it as a game boundary. Pass False to
            let returns accumulate across non-zero rewards.

        Returns:
          A float64 array shaped like `reward`. If the returns have non-zero
          standard deviation they are shifted to zero mean and scaled to
          unit standard deviation; otherwise they are returned as computed.
          An empty input yields an empty array.
        """
        # Always work in floating point: an integer reward list would
        # otherwise truncate the discounted values and make the in-place
        # normalisation fail on the float -> int cast.
        rewards = np.asarray(reward, dtype=np.float64)
        discounted_r = np.zeros_like(rewards)
        if rewards.size == 0:
            return discounted_r

        running_add = 0.0
        for i in reversed(range(len(rewards))):
            if reset_on_nonzero and rewards[i] != 0:
                running_add = 0.0

            running_add = running_add * gamma + rewards[i]
            discounted_r[i] = running_add

        spread = np.std(discounted_r)
        if spread != 0:
            # Centering does not change the standard deviation, so the value
            # computed above can be reused for the scaling.
            discounted_r = (discounted_r - np.mean(discounted_r)) / spread

        return discounted_r
    
    def get_loss(self, states, actions, discounted_r, initial_memory_state, initial_carry_state):
        """Compute the clipped-surrogate actor loss plus the critic loss.

        Both networks are unrolled step by step over the trajectory, each
        starting from the same initial LSTM state and carrying its own state
        forward.

        Args:
          states: array indexed by time step along its first axis
            (assumed (T, H, W, C) - TODO confirm against the caller).
          actions: the action index taken at every step; a list of scalar
            tensors or an integer array with T entries.
          discounted_r: normalised discounted returns with T entries.
          initial_memory_state: first LSTM state at the start of the trajectory.
          initial_carry_state: second LSTM state at the start of the trajectory.

        Returns:
          Scalar tensor `actor_loss + critic_loss`.
        """
        batch_size = states.shape[0]

        # Sampled actions are int64 scalars while `take_vector_elements`
        # pairs them with an int32 range, so normalise them to a flat int32
        # tensor first.
        actions = tf.cast(tf.reshape(tf.convert_to_tensor(actions), [-1]), tf.int32)

        action_logits_old, values_old = [], []
        action_logits, values = [], []

        memory_state_old, carry_state_old = initial_memory_state, initial_carry_state
        memory_state, carry_state = initial_memory_state, initial_carry_state
        for i in range(batch_size):
            frame = tf.expand_dims(states[i, :], 0)

            # Use this instance rather than a module-level `agent` variable,
            # so the method also works when no such global exists.
            _, prediction_old = self.act_old(frame, memory_state_old, carry_state_old,
                                             training=True)
            action_logits_old.append(prediction_old[0][0])
            values_old.append(tf.squeeze(prediction_old[1]))
            memory_state_old = prediction_old[2]
            carry_state_old = prediction_old[3]

            _, prediction = self.act_new(frame, memory_state, carry_state,
                                         training=True)
            action_logits.append(prediction[0][0])
            values.append(tf.squeeze(prediction[1]))
            memory_state = prediction[2]
            carry_state = prediction[3]

        action_logits_old = tf.stack(action_logits_old)
        values_old = tf.stack(values_old)
        action_logits = tf.stack(action_logits)
        values = tf.stack(values)

        advantages = discounted_r - tf.stop_gradient(values_old)

        # Normalise over the action axis first and only then select the taken
        # action. Applying softmax to the already-selected logits would
        # normalise across time steps instead of across actions.
        log_probs_old = tf.nn.log_softmax(action_logits_old, axis=-1)
        log_probs = tf.nn.log_softmax(action_logits, axis=-1)
        action_log_selected_old = take_vector_elements(log_probs_old, actions)
        action_log_selected = take_vector_elements(log_probs, actions)

        ratios = tf.math.exp(action_log_selected - tf.stop_gradient(action_log_selected_old))

        eps_clip = 0.2
        surr1 = ratios * advantages
        surr2 = tf.clip_by_value(ratios, 1 - eps_clip, 1 + eps_clip) * advantages

        actor_loss = -tf.math.minimum(surr1, surr2)
        actor_loss = tf.math.reduce_mean(actor_loss)
        actor_loss = tf.cast(actor_loss, 'float32')

        # Targets first, predictions second (the squared error is symmetric).
        critic_loss = mse_loss(discounted_r, values)
        total_loss = actor_loss + critic_loss

        return total_loss
        
    def replay(self, states, actions, rewards, initial_memory_state, initial_carry_state):
        """Run one gradient update on a collected trajectory.

        Args:
          states: list of per-step state arrays, stacked along the first axis.
          actions: actions taken at each step.
          rewards: rewards received at each step.
          initial_memory_state: first LSTM state at the start of the trajectory.
          initial_carry_state: second LSTM state at the start of the trajectory.

        Returns:
          The total loss of the update, or None when the trajectory is empty
          and no update was performed.
        """
        # Stacking an empty list raises, so an episode that ended before any
        # transition was stored is skipped instead of crashing the worker.
        if len(states) == 0:
            return None

        # reshape memory to appropriate shape for training
        states = np.vstack(states)

        # Compute discounted rewards
        discounted_r = self.discount_rewards(rewards).astype(np.float32)
        with tf.GradientTape() as tape:
            total_loss = self.get_loss(states, actions, discounted_r,
                                       initial_memory_state, initial_carry_state)

        trainable = self.policy.trainable_variables
        grads = tape.gradient(total_loss, trainable)
        self.optimizer.apply_gradients(zip(grads, trainable))

        # Bring the acting copy up to date with the freshly updated network.
        for a, b in zip(self.policy_old.variables, self.policy.variables):
            a.assign(b)

        return total_loss
        
    def load(self, model_name, critic_name=None):
        """Load a saved model and store it as `self.ActorCritic`.

        Args:
          model_name: path handed to Keras `load_model`.
          critic_name: ignored. Accepted so the two-argument call style
            (actor path, critic path) does not raise a TypeError; there is a
            single combined network and no separate critic file.
        """
        del critic_name
        self.ActorCritic = load_model(model_name, compile=False)

    def save(self):
        """Save the network to `self.model_name`.

        A model previously installed by `load()` is saved if there is one;
        otherwise the live `self.policy` network is saved. Without this
        fallback the method would fail with AttributeError whenever `load()`
        had not been called first.
        """
        model = getattr(self, 'ActorCritic', None)
        if model is None:
            model = self.policy
        # NOTE(review): saving a subclassed model presumably requires it to
        # have been called at least once - confirm before relying on this.
        model.save(self.model_name)

    def GetScoreAverage(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        
        return self.average[-1]
    
    def imshow(self, image, rem_step=0):
        """Display one slice of `image` in an OpenCV window.

        Args:
          image: array indexed by `rem_step` along its first axis.
          rem_step: which slice to show; also appended to the window title.

        Pressing "q" while the window is polled closes all OpenCV windows.
        """
        window_title = self.model_name + str(rem_step)
        frame = image[rem_step,...]
        cv2.imshow(window_title, frame)

        pressed = cv2.waitKey(25) & 0xFF
        if pressed == ord("q"):
            cv2.destroyAllWindows()

    def reset(self, env):
        """Restart `env` and return its first observation.

        Args:
          env: environment exposing `reset()` and `observations()`.

        Returns:
          Whatever `env.observations()` returns right after the reset.
        """
        # The frame buffer that used to be allocated here was never read,
        # so it has been dropped.
        env.reset()
        return env.observations()
    
    def step(self, action, env, image_memory):
        """Apply one action to `env` and display the resulting observation.

        Args:
          action: index into `ACTION_LIST`.
          env: environment exposing `step()`, `is_running()` and
            `observations()`.
          image_memory: not read by this method.

        Returns:
          `(next_state, reward, False)` while the environment is running,
          or the placeholder `(0, 0, True)` once it has stopped.
        """
        action_name = ACTION_LIST[action]
        reward = env.step(ACTIONS[action_name])

        if env.is_running():
            next_state = env.observations()
            self.render(next_state)
            return next_state, reward, False

        print('Environment stopped early')
        return 0, 0, True
    
    def train(self, n_threads):
        """Train with `n_threads` worker threads, one environment each.

        The agent's own environment is closed first; every worker then gets
        a freshly created one. The call returns after all workers are joined.

        Args:
          n_threads: number of worker threads (and environments) to create.
        """
        self.env.close()

        # Instantiate one environment per thread
        envs = []
        for _ in range(n_threads):
            envs.append(deepmind_lab.Lab(self.env_name, ['RGB_INTERLEAVED'],
                                         {'fps': '15', 'width': '64', 'height': '64'}))

        # Each worker runs train_threading(agent=self, env, thread index).
        workers = []
        for index, worker_env in enumerate(envs):
            workers.append(threading.Thread(
                target=self.train_threading,
                args=(self, worker_env, index),
                daemon=True))

        # Start-up is staggered by two seconds per worker.
        for worker in workers:
            time.sleep(2)
            worker.start()

        for worker in workers:
            time.sleep(10)
            worker.join()
            
    def render(self, obs):
        """Show the 'RGB_INTERLEAVED' frame of `obs` in a window named 'obs'.

        Args:
          obs: observation mapping that contains an 'RGB_INTERLEAVED' image.
        """
        frame = obs['RGB_INTERLEAVED']
        # OpenCV displays arrays in BGR channel order. The observation is
        # presumably RGB (per its key name - confirm), so the channels are
        # swapped to avoid showing red and blue exchanged.
        cv2.imshow('obs', cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
        cv2.waitKey(1)
            
    def train_threading(self, agent, env, thread):
        """Worker loop: play episodes in `env` and train after each one.

        Runs until the shared episode counter reaches `self.EPISODES`. Each
        episode is played with `agent.act_old`, then the collected trajectory
        is passed to `self.replay` and the score statistics are updated, both
        under `self.lock`. The environment is closed when the loop ends,
        including when it ends because of an exception.

        Args:
          agent: object providing `act_old`; the caller passes the agent
            instance itself.
          env: this worker's environment.
          thread: worker index, used only in the progress message.
        """
        try:
            while self.episode < self.EPISODES:
                # Reset episode
                score, done, SAVING = 0, False, ''
                state = self.reset(env)
                state = state['RGB_INTERLEAVED'].astype(np.float32)

                states, actions, rewards = [], [], []

                # Zero LSTM state at the start of every episode; the same
                # arrays are reused as the initial state for training.
                memory_state = np.zeros([1,128], dtype=np.float32)
                carry_state = np.zeros([1,128], dtype=np.float32)

                initial_memory_state = memory_state
                initial_carry_state = carry_state
                step = 0
                while True:
                    action, prediction = agent.act_old(np.expand_dims(state, 0),
                                                       memory_state, carry_state,
                                                       training=False)
                    memory_state = prediction[2]
                    carry_state = prediction[3]

                    next_state, reward, done = self.step(action.numpy(), env, state)
                    if done:
                        # step() returns placeholders once the environment
                        # has stopped, so this transition is not stored.
                        break

                    next_state = next_state['RGB_INTERLEAVED'].astype(np.float32)
                    states.append(np.expand_dims(state, 0))
                    actions.append(action)
                    rewards.append(reward)

                    score += reward
                    state = next_state
                    step += 1

                print("step: ", step)
                # Nothing to learn from if the environment stopped before a
                # single transition was stored.
                if states:
                    # `with` releases the lock even if the update raises, so
                    # a failing worker cannot block the others forever.
                    with self.lock:
                        self.replay(states, actions, rewards,
                                    initial_memory_state, initial_carry_state)

                # Update episode count
                with self.lock:
                    average = self.GetScoreAverage(score, self.episode)
                    with writer.as_default():
                        tf.summary.scalar("average", average, step=self.episode)
                        writer.flush()

                    # saving best models
                    if average >= self.max_average:
                        self.max_average = average
                        #self.save()
                        SAVING = "SAVING"
                    else:
                        SAVING = ""

                    print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING))
                    if(self.episode < self.EPISODES):
                        self.episode += 1
        finally:
            env.close()

    def test(self, Actor_name, Critic_name):
        """Play 100 evaluation episodes, always taking the highest-logit action.

        Args:
          Actor_name: path of the saved model, passed to `self.load`.
          Critic_name: not used; kept so existing callers keep working.
        """
        # load() takes a single model path and stores the result in
        # self.ActorCritic.
        self.load(Actor_name)
        model = self.ActorCritic

        for e in range(100):
            observation = self.reset(self.env)
            state = observation['RGB_INTERLEAVED'].astype(np.float32)
            # Zero LSTM state at the start of every episode (128 units).
            memory_state = np.zeros([1, 128], dtype=np.float32)
            carry_state = np.zeros([1, 128], dtype=np.float32)
            score = 0
            while True:
                # NOTE(review): assumes the loaded model can be called with
                # the same arguments as ActorCritic.call and returns the same
                # four outputs - confirm for the saved format in use.
                action_logit, _, memory_state, carry_state = model(
                    np.expand_dims(state, 0), memory_state, carry_state)
                action = int(np.argmax(action_logit[0]))

                # step() already displays the frame and returns 3 values.
                next_state, reward, done = self.step(action, self.env, state)
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
                    break

                score += reward
                state = next_state['RGB_INTERLEAVED'].astype(np.float32)

        self.env.close()


if __name__ == "__main__":
    # Build the agent for the 'ctf_simple' level and train it with a single
    # worker thread. The instance is bound to the module-level name `agent`.
    agent = A3CAgent('ctf_simple')
    agent.train(n_threads=1)

Exception in thread Thread-4:
Traceback (most recent call last):
  File "/usr/lib/python3.8/threading.py", line 932, in _bootstrap_inner
    self.run()
  File "/usr/lib/python3.8/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-1-d33ea1ffe2fe>", line 366, in train_threading
  File "/home/kimbring2/.local/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 885, in __call__
    result = self._call(*args, **kwds)
  File "/home/kimbring2/.local/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 933, in _call
    self._initialize(args, kwds, add_initializers_to=initializers)
  File "/home/kimbring2/.local/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py", line 759, in _initialize
    self._stateful_fn._get_concrete_function_internal_garbage_collected(  # pylint: disable=protected-access
  File "/home/kimbring2/.local/lib/python3.8/site-packages/tensorflow/python/eager/functi

KeyboardInterrupt: 