In [1]:
import os
import random
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten, LSTM
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2
import threading
from threading import Thread, Lock
import time
import tensorflow_probability as tfp
from typing import Any, List, Sequence, Tuple
from Basic3DPlatformer import Basic3DPlatformer

# Hide every CUDA device from TensorFlow so that all computation runs on the CPU.
os.environ.update(CUDA_VISIBLE_DEVICES="-1")

# Short alias for the TensorFlow Probability distributions namespace.
tfd = tfp.distributions


class OurModel(tf.keras.Model):
    """Convolutional + LSTM actor-critic network.

    A stack of four convolutions encodes the input frame, the encoding is
    split into a short sequence that is fed through an LSTM, and two linear
    heads produce the policy logits and the state-value estimate.

    Args:
        action_space: number of discrete actions, i.e. the width of the
            policy-logit head.
    """

    def __init__(self, action_space):
        super().__init__()

        self.conv_1 = tf.keras.layers.Conv2D(24, 7, 4, padding="valid", activation="relu")
        self.conv_2 = tf.keras.layers.Conv2D(48, 3, 2, padding="valid", activation="relu")
        self.conv_3 = tf.keras.layers.Conv2D(96, 2, 1, padding="valid", activation="relu")
        self.conv_4 = tf.keras.layers.Conv2D(512, 6, 1, padding="valid", activation="relu")

        # The reshape needs exactly 8 * 64 = 512 values per sample coming out
        # of conv_4. With "valid" padding a 64x64 input shrinks spatially
        # 64 -> 15 -> 7 -> 6 -> 1, so the 512 channels are the only values left.
        self.to_sequence = tf.keras.layers.Reshape((8, 64))

        self.lstm = LSTM(128, return_sequences=True, return_state=True)

        # Stateless helper layers are built once here rather than on every
        # forward pass inside call().
        self.flatten = Flatten()

        self.dense_1 = Dense(action_space)  # policy logits
        self.dense_2 = Dense(1)             # state value

    def call(self, state, memory_state, carry_state):
        """Run one forward pass.

        Args:
            state: batch of input frames (4-D, channels last).
            memory_state: LSTM state passed as the first element of
                ``initial_state``.
            carry_state: LSTM state passed as the second element of
                ``initial_state``.

        Returns:
            Tuple ``(action_logit, value, final_memory_state,
            final_carry_state)``; the last two are the LSTM states to feed
            into the next call.
        """
        features = self.conv_1(state)
        features = self.conv_2(features)
        features = self.conv_3(features)
        features = self.conv_4(features)

        sequence = self.to_sequence(features)

        lstm_output, final_memory_state, final_carry_state = self.lstm(
            sequence, initial_state=(memory_state, carry_state)
        )
        lstm_output_flattened = self.flatten(lstm_output)

        action_logit = self.dense_1(lstm_output_flattened)
        value = self.dense_2(lstm_output_flattened)

        return action_logit, value, final_memory_state, final_carry_state


# Summed sparse categorical cross-entropy that takes raw logits.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
    reduction=tf.keras.losses.Reduction.SUM,
)

# Mean squared error, used for the critic (value) loss.
mse_loss = tf.keras.losses.MeanSquaredError()

# Discrete action table: each name maps to the 5-element action vector that is
# handed to the environment. The insertion order defines the action index
# used by the policy head.
ACTIONS = dict(
    look_left=[-100, 0, 0, 0, 0],
    look_right=[100, 0, 0, 0, 0],
    strafe_left=[0, 0, -1, 0, 0],
    strafe_right=[0, 0, 1, 0, 0],
    forward=[0, 0, 0, 1, 0],
    backward=[0, 0, 0, -1, 0],
)


class A3CAgent:
    """Actor-critic agent for the Basic3DPlatformer Godot environment.

    Despite the name, this class drives a single environment from the calling
    thread: it collects a fixed-length rollout with the recurrent
    actor-critic network, then performs one gradient update on that rollout.

    Args:
        env_name: label used only to build the model save path; the
            environment itself is always ``Basic3DPlatformer``.
    """

    # Width of the LSTM state vectors; must match the LSTM built by OurModel.
    LSTM_UNITS = 128
    # Environment steps collected per training episode before each update.
    STEPS_PER_EPISODE = 750
    # Number of episodes played by test().
    TEST_EPISODES = 100

    def __init__(self, env_name):
        self.env_name = env_name

        GODOT_BIN_PATH = "basic_3d_platformer/Basic3DPlatformer.x86_64"
        env_abs_path = "basic_3d_platformer/Basic3DPlatformer.pck"

        self.env = Basic3DPlatformer(exec_path=GODOT_BIN_PATH, env_path=env_abs_path, turbo_mode=True)
        self.action_size = len(ACTIONS)
        # NOTE(review): the -21.0 starting best-average is a leftover from a
        # Pong setup; any first average at or above it counts as a new best.
        self.EPISODES, self.episode, self.max_average = 2000000, 0, -21.0
        self.lr = 0.0001

        self.scores, self.episodes, self.average = [], [], []

        self.Save_Path = 'Models'
        os.makedirs(self.Save_Path, exist_ok=True)
        self.path = '{}_A3C_{}'.format(self.env_name, self.lr)
        self.model_name = os.path.join(self.Save_Path, self.path)

        # Create Actor-Critic network model
        self.ActorCritic = OurModel(action_space=self.action_size)
        self.optimizer = tf.keras.optimizers.Adam(self.lr)

        self.writer = tf.summary.create_file_writer("tensorboard")

    def _initial_lstm_state(self):
        """Return a fresh pair of zeroed (memory, carry) LSTM state arrays."""
        shape = [1, self.LSTM_UNITS]
        return np.zeros(shape, dtype=np.float32), np.zeros(shape, dtype=np.float32)

    def _preprocess(self, frame):
        """Reshape a raw observation to 128x128x3, downscale to 64x64, scale by 1/255."""
        frame = np.reshape(frame, (128, 128, 3))
        return cv2.resize(frame, (64, 64)) / 255.0

    def act(self, state, memory_state, carry_state):
        """Sample an action from the current policy.

        Args:
            state: batched observation (leading batch dimension of 1).
            memory_state: LSTM memory state from the previous step.
            carry_state: LSTM carry state from the previous step.

        Returns:
            Tuple ``(action_index, memory_state, carry_state)`` where the
            states are NumPy arrays to pass into the next call.
        """
        memory_state = tf.constant(memory_state, tf.float32)
        carry_state = tf.constant(carry_state, tf.float32)

        action_logit, _, next_memory, next_carry = self.ActorCritic(
            state, memory_state, carry_state, training=False
        )
        action = tf.random.categorical(action_logit, 1).numpy()

        return action[0][0], next_memory.numpy(), next_carry.numpy()

    def discount_rewards(self, rewards):
        """Compute normalised, gamma-discounted returns for a rollout.

        The running return is reset at every non-zero reward, so each
        non-zero reward is only propagated back to the steps since the
        previous non-zero reward.
        NOTE(review): that reset was written for Pong game boundaries;
        confirm it is the intended credit assignment for this environment.

        Args:
            rewards: sequence of per-step rewards (ints or floats).

        Returns:
            1-D ``float32`` array of the same length. It is standardised to
            zero mean and unit standard deviation unless all returns are
            equal, in which case the unnormalised returns are returned.
        """
        gamma = 0.99    # discount rate
        # Force a float dtype: with integer rewards an integer buffer would
        # truncate the returns and make the in-place normalisation fail.
        rewards = np.asarray(rewards, dtype=np.float32)
        discounted_r = np.zeros_like(rewards)
        if discounted_r.size == 0:
            return discounted_r

        running_add = 0.0
        for i in reversed(range(len(rewards))):
            if rewards[i] != 0:
                running_add = 0.0
            running_add = running_add * gamma + rewards[i]
            discounted_r[i] = running_add

        if np.std(discounted_r) != 0.0:
            discounted_r -= np.mean(discounted_r)
            discounted_r /= np.std(discounted_r)

        return discounted_r

    def replay(self, states, actions, rewards):
        """Run one actor-critic gradient update on a collected rollout.

        The rollout is replayed step by step from a zeroed LSTM state so that
        gradients flow through the recurrent state across the whole sequence.

        Args:
            states: list of batched observations, one per step.
            actions: list of action indices taken at each step.
            rewards: list of rewards received at each step.
        """
        if len(states) == 0:
            return

        states = np.vstack(states)
        batch_size = states.shape[0]

        returns = tf.constant(self.discount_rewards(rewards), dtype=tf.float32)
        action_indices = np.asarray(actions, dtype=np.int32)

        with tf.GradientTape() as tape:
            step_logits, step_values = [], []

            memory_state = tf.zeros([1, self.LSTM_UNITS], dtype=tf.float32)
            carry_state = tf.zeros([1, self.LSTM_UNITS], dtype=tf.float32)
            for i in range(batch_size):
                logits, value, memory_state, carry_state = self.ActorCritic(
                    tf.expand_dims(states[i], 0), memory_state, carry_state, training=True
                )
                step_logits.append(logits[0])
                step_values.append(value[0])

            action_logits = tf.stack(step_logits)   # (steps, actions)
            values = tf.stack(step_values)          # (steps, 1)

            # The advantage is a fixed weight for the policy gradient; the
            # critic is trained only through critic_loss.
            advantages = tf.stop_gradient(returns - values[:, 0])

            # log_prob on logits avoids the log(0) = -inf that
            # log(softmax(...)) produces once a probability underflows.
            action_log_prob = tfd.Categorical(logits=action_logits).log_prob(action_indices)

            actor_loss = -tf.math.reduce_mean(action_log_prob * advantages)
            critic_loss = mse_loss(tf.expand_dims(returns, 1), values)
            total_loss = actor_loss + critic_loss

        grads = tape.gradient(total_loss, self.ActorCritic.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.ActorCritic.trainable_variables))

    def load(self, model_name):
        """Replace the network with the model stored at ``model_name``."""
        self.ActorCritic = load_model(model_name, compile=False)

    def save(self):
        """Save the network to ``self.model_name``."""
        self.ActorCritic.save(self.model_name)

    def PlotModel(self, score, episode):
        """Record an episode result and return the mean of the last 50 scores."""
        self.scores.append(score)
        self.episodes.append(episode)
        recent = self.scores[-50:]
        self.average.append(sum(recent) / len(recent))
        return self.average[-1]

    def reset(self):
        """Reset the environment and return the preprocessed first observation."""
        return self._preprocess(self.env.reset())

    def step(self, action):
        """Apply ``action`` (an ACTIONS vector) and return the transition.

        Returns:
            Tuple ``(next_state, reward, done, info)`` with the observation
            preprocessed and ``reward``/``done`` unwrapped from their
            single-element containers.
        """
        next_state, reward, done, info = self.env.step(action)
        return self._preprocess(next_state), reward[0], done[0], info

    def train(self):
        """Train until ``self.EPISODES`` episodes have run, then close the env.

        Each episode collects ``STEPS_PER_EPISODE`` transitions and performs a
        single update. NOTE(review): the environment's ``done`` flag is
        ignored while collecting, as in the original fixed-length loop.
        """
        action_keys = list(ACTIONS.keys())

        while self.episode < self.EPISODES:
            # Reset episode
            score = 0
            state = np.expand_dims(self.reset(), 0)
            states, actions, rewards = [], [], []
            memory_state, carry_state = self._initial_lstm_state()

            for _ in range(self.STEPS_PER_EPISODE):
                action_idx, memory_state, carry_state = self.act(state, memory_state, carry_state)
                action = ACTIONS[action_keys[action_idx]]

                next_state, reward, _done, _ = self.step(action)

                states.append(state)
                actions.append(action_idx)
                rewards.append(reward)

                score += reward
                state = np.expand_dims(next_state, 0)

            self.replay(states, actions, rewards)

            average = self.PlotModel(score, self.episode)

            if average >= self.max_average:
                self.max_average = average
                SAVING = "SAVING"
            else:
                SAVING = ""

            print("episode: {}/{}, score: {}, average: {} {}".format(self.episode, self.EPISODES, score, average, SAVING))
            with self.writer.as_default():
                tf.summary.scalar("average_reward", average, step=self.episode)
                self.writer.flush()

            self.episode += 1

        self.env.close()

    def test(self, Actor_name, Critic_name=None):
        """Play ``TEST_EPISODES`` greedy episodes with a saved model.

        Args:
            Actor_name: path of the saved actor-critic model to load.
            Critic_name: unused; accepted only for backward compatibility,
                since a single model holds both heads.
        """
        self.load(Actor_name)
        action_keys = list(ACTIONS.keys())
        # Not every environment wrapper exposes render(); call it only if present.
        render = getattr(self.env, "render", None)

        for e in range(self.TEST_EPISODES):
            state = np.expand_dims(self.reset(), 0)
            memory_state, carry_state = self._initial_lstm_state()
            score = 0

            # Cap the episode length so a missing `done` signal cannot hang the run.
            for _ in range(self.STEPS_PER_EPISODE):
                if callable(render):
                    render()

                action_logit, _, memory_state, carry_state = self.ActorCritic(
                    state,
                    tf.constant(memory_state, tf.float32),
                    tf.constant(carry_state, tf.float32),
                    training=False,
                )
                action_idx = int(np.argmax(action_logit.numpy()[0]))

                next_state, reward, done, _ = self.step(ACTIONS[action_keys[action_idx]])
                state = np.expand_dims(next_state, 0)
                score += reward
                if done:
                    break

            print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))

        self.env.close()


# Script entry point: build the agent and start training.
if __name__ == "__main__":
    # This label only feeds the model save-path prefix; the environment that
    # is actually launched is always the Basic3DPlatformer build.
    env_name = 'PongDeterministic-v4'
    # Keep the module-level name `agent`: code elsewhere in this file may
    # look it up as a global.
    agent = A3CAgent(env_name)
    agent.train()

2024-09-27 21:50:03.962687: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-27 21:50:03.962721: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-27 21:50:03.962756: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-27 21:50:03.970594: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


_GodotEnv.__file__:  /media/kimbring2/NewHDD/godot_env/lib/python3.9/site-packages/GodotEnv-0.1-py3.9-linux-x86_64.egg/_GodotEnv.cpython-39-x86_64-linux-gnu.so
Created float32 vector test agent_action_environment0 size = 5
Created int32 vector env_action_environment0 size = 1
Created uint32 vector observation_environment0 size = 49152
Created float32 vector test reward_environment0 size = 1
Created int32 vector done_environment0 size = 1


2024-09-27 21:50:06.887648: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-09-27 21:50:06.887675: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: kimbring2-ROG-Strix-GA35DX-G35DX
2024-09-27 21:50:06.887681: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: kimbring2-ROG-Strix-GA35DX-G35DX
2024-09-27 21:50:06.887732: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.183.1
2024-09-27 21:50:06.887752: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.183.1
2024-09-27 21:50:06.887756: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.183.1


episode: 0/2000000, score: 0.0, average: 0.0 SAVING
episode: 1/2000000, score: 4.0, average: 2.0 SAVING
episode: 2/2000000, score: 2.0, average: 2.0 SAVING
episode: 3/2000000, score: 0.0, average: 1.5 
episode: 4/2000000, score: 3.0, average: 1.8 
episode: 5/2000000, score: 2.0, average: 1.8333333333333333 
episode: 6/2000000, score: 2.0, average: 1.8571428571428572 
episode: 7/2000000, score: 2.0, average: 1.875 
episode: 8/2000000, score: 1.0, average: 1.7777777777777777 
episode: 9/2000000, score: 2.0, average: 1.8 
episode: 10/2000000, score: 1.0, average: 1.7272727272727273 
episode: 11/2000000, score: 3.0, average: 1.8333333333333333 
episode: 12/2000000, score: 2.0, average: 1.8461538461538463 
episode: 13/2000000, score: 2.0, average: 1.8571428571428572 
episode: 14/2000000, score: 2.0, average: 1.8666666666666667 
episode: 15/2000000, score: 2.0, average: 1.875 
episode: 16/2000000, score: 3.0, average: 1.9411764705882353 
episode: 17/2000000, score: 1.0, average: 1.8888888888

KeyboardInterrupt: 