In [None]:
import os
import random
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2
import threading
from threading import Thread, Lock
import time
import tensorflow_probability as tfp
from typing import Any, List, Sequence, Tuple

from pongMultiplayer import pongMultiplayerEnv
from IPython.display import clear_output

# Hide every CUDA device from this process so TensorFlow falls back to the CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'


# Short alias for the TensorFlow Probability distributions module.
tfd = tfp.distributions


class OurModel(tf.keras.Model):
    """Convolutional actor-critic network: one shared trunk, two linear heads.

    The trunk is four ReLU convolutions followed by a flatten. The actor head
    emits one unnormalised logit per action, and the critic head emits a
    single state-value estimate.
    """

    def __init__(self, input_shape, action_space):
        """Create the layers.

        Args:
            input_shape: Accepted for interface compatibility only; it is not
                read here (the layers take their input shape from the first
                call).
            action_space: Number of discrete actions, i.e. the width of the
                actor head.
        """
        super().__init__()

        self.flatten = Flatten()

        self.conv_1 = tf.keras.layers.Conv2D(24, 7, 4, padding="valid", activation="relu")
        self.conv_2 = tf.keras.layers.Conv2D(48, 3, 2, padding="valid", activation="relu")
        self.conv_3 = tf.keras.layers.Conv2D(96, 2, 1, padding="valid", activation="relu")
        self.conv_4 = tf.keras.layers.Conv2D(512, 6, 1, padding="valid", activation="relu")

        self.dense_1 = Dense(action_space)  # actor head: action logits
        self.dense_2 = Dense(1)             # critic head: state value

    def call(self, X_input):
        """Run a forward pass.

        Args:
            X_input: Batch of image-like inputs accepted by the first
                convolution.

        Returns:
            A tuple ``(action_logit, value)`` holding the actor head output
            and the critic head output for each batch element.
        """
        features = self.conv_1(X_input)
        features = self.conv_2(features)
        features = self.conv_3(features)
        features = self.conv_4(features)

        # Reuse the layer created in __init__ instead of instantiating a new
        # Flatten layer on every forward pass.
        features = self.flatten(features)

        action_logit = self.dense_1(features)
        value = self.dense_2(features)

        return action_logit, value


def safe_log(x):
    """Elementwise natural log that returns 0 where ``x`` is exactly 0.

    Inputs are clamped to at least 1e-12 before the log is taken, so the log
    itself never sees a zero.
    """
    clamped = tf.math.maximum(1e-12, x)
    is_zero = tf.math.equal(x, 0)
    return tf.where(is_zero, tf.zeros_like(x), tf.math.log(clamped))


def take_vector_elements(vectors, indices):
    """Return ``vectors[i, indices[i]]`` for every row ``i`` of ``vectors``."""
    row_count = tf.shape(vectors)[0]
    row_ids = tf.range(row_count)
    # Pair each row number with its requested column to form gather_nd coordinates.
    coords = tf.stack([row_ids, indices], axis=1)
    return tf.gather_nd(vectors, coords)


# Shared Keras loss objects, created once at import time.
# Huber loss, summed (not averaged) over the batch.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)
# Sparse categorical cross-entropy taking raw logits, summed over the batch.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
# Mean squared error with the default reduction.
mse_loss = tf.keras.losses.MeanSquaredError()


class A3CAgent:
    def __init__(self, env_name):
        """Start the Godot pong environment and build the model, optimizer and logger.

        Args:
            env_name: Label used to build the checkpoint path
                (``Models/<env_name>_A3C_<lr>``).
        """
        self.env_name = env_name

        # Launch settings for the Godot multiplayer pong build.
        peer_type = "server"
        ip_address = "127.0.0.1"
        godot_bin_path = "./multiplayer_pong/pong_multi.x86_64"
        env_abs_path = "./multiplayer_pong/pong_multi.pck"
        self.env = pongMultiplayerEnv(exec_path=godot_bin_path, env_path=env_abs_path, peer_type=peer_type,
                                      ip_address=ip_address, turbo_mode=True)

        self.action_size = 3
        self.EPISODES, self.episode, self.max_average = 2000000, 0, -21.0 # specific for pong
        self.lock = Lock()
        self.lr = 0.0001

        # Frame geometry: each state is ROWS x COLS with REM_STEP stacked frames.
        self.ROWS = 64
        self.COLS = 64
        self.REM_STEP = 4
        self.state_size = (self.ROWS, self.COLS, self.REM_STEP)

        # Instantiate plot memory
        self.scores, self.episodes, self.average = [], [], []

        self.Save_Path = 'Models'
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(self.Save_Path, exist_ok=True)
        self.path = '{}_A3C_{}'.format(self.env_name, self.lr)
        self.model_name = os.path.join(self.Save_Path, self.path)

        # Create Actor-Critic network model
        self.ActorCritic = OurModel(input_shape=self.state_size, action_space=self.action_size)
        self.optimizer = tf.keras.optimizers.Adam(self.lr)
        self.writer = tf.summary.create_file_writer("tensorboard")

    def act(self, state):
        """Sample one action from the policy for ``state``.

        The actor logits returned by the network are fed to
        ``tf.random.categorical``; the first sample of the first batch row is
        returned.
        """
        logits, _ = self.ActorCritic(state, training=False)
        sampled = tf.random.categorical(logits, 1).numpy()
        return sampled[0][0]

    def discount_rewards(self, rewards, next_state):
        # Compute the gamma-discounted rewards over an episode
        gamma = 0.99    # discount rate
        running_add = 0
        discounted_r = np.zeros_like(rewards)
        for i in reversed(range(0, len(rewards))):
            if rewards[i] != 0: # reset the sum, since this was a game boundary (pong specific!)
                running_add = 0

            running_add = running_add * gamma + rewards[i]
            discounted_r[i] = running_add

        if np.std(discounted_r) != 0.0:
            discounted_r -= np.mean(discounted_r) # normalizing the result
            discounted_r /= np.std(discounted_r) # divide by standard deviation

        return discounted_r

    def replay(self, states, actions, rewards, next_state):
        """Run one actor-critic gradient step on a collected rollout.

        Args:
            states: List of per-step states; they are stacked with
                ``np.vstack`` into a single batch.
            actions: Integer action index taken at each step.
            rewards: Per-step rewards aligned with ``states``.
            next_state: Forwarded to ``discount_rewards``.
        """
        if len(states) == 0:
            # Nothing was collected; np.vstack would raise on an empty list.
            return

        # reshape memory to appropriate shape for training
        states = np.vstack(states)

        # Compute discounted rewards
        discounted_r = self.discount_rewards(rewards, next_state)

        with tf.GradientTape() as tape:
            action_logits, values = self.ActorCritic(states, training=True)
            values = values[:, 0]

            returns = tf.cast(discounted_r, values.dtype)
            action_ids = tf.cast(actions, tf.int32)

            # The advantage is a fixed weight for the policy gradient, so the
            # critic must not receive gradients through it.
            advantages = returns - tf.stop_gradient(values)

            # log pi(a_t | s_t): normalise over the action axis first, then
            # pick the taken action. Applying softmax to the already-selected
            # logits would normalise across time steps instead of actions.
            log_probs = tf.nn.log_softmax(action_logits, axis=-1)
            selected_log_probs = take_vector_elements(log_probs, action_ids)

            actor_loss = -tf.math.reduce_mean(selected_log_probs * advantages)

            # Keras losses take (y_true, y_pred).
            critic_loss = mse_loss(returns[:, None], values[:, None])
            critic_loss = tf.cast(critic_loss, 'float32')

            total_loss = actor_loss + critic_loss

        grads = tape.gradient(total_loss, self.ActorCritic.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.ActorCritic.trainable_variables))

    def load(self, model_name):
        """Replace the in-memory network with the model stored at ``model_name``.

        The model is loaded with ``compile=False``.
        """
        restored = load_model(model_name, compile=False)
        self.ActorCritic = restored

    def save(self):
        """Write the current network to ``self.model_name``."""
        target = self.model_name
        self.ActorCritic.save(target)

    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        return self.average[-1]

    def imshow(self, image, rem_step=0):
        """Show slice ``rem_step`` of ``image`` in an OpenCV window.

        The window title is the model name followed by ``rem_step``. All
        windows are closed if "q" is pressed during the 25 ms key wait.
        """
        window_title = self.model_name + str(rem_step)
        cv2.imshow(window_title, image[rem_step,...])

        pressed = cv2.waitKey(25) & 0xFF
        if pressed == ord("q"):
            cv2.destroyAllWindows()

    def GetImage(self, frame, image_memory):
        if image_memory.shape == (1,*self.state_size):
            image_memory = np.squeeze(image_memory)

        # croping frame to 80x80 size
        frame_cropped = frame[35:195:2, ::2,:]
        if frame_cropped.shape[0] != self.COLS or frame_cropped.shape[1] != self.ROWS:
            # OpenCV resize function
            frame_cropped = cv2.resize(frame, (self.COLS, self.ROWS), interpolation=cv2.INTER_CUBIC)

        # converting to RGB (numpy way)
        frame_rgb = 0.299*frame_cropped[:,:,0] + 0.587*frame_cropped[:,:,1] + 0.114*frame_cropped[:,:,2]

        # convert everything to black and white (agent will train faster)
        frame_rgb[frame_rgb < 50] = 0
        frame_rgb[frame_rgb >= 150] = 255
        
        # dividing by 255 we expresses value to 0-1 representation
        new_frame = np.array(frame_rgb).astype(np.float32) / 255.0

        # push our data by 1 frame, similar as deq() function work
        image_memory = np.roll(image_memory, 1, axis=2)

        # inserting new frame to free space
        image_memory[:,:,0] = new_frame

        return np.expand_dims(image_memory, axis=0)

    def reset(self):
        """Reset the environment and return the initial stacked-frame state.

        The first observation is reshaped to 128x128x3, converted to uint8 and
        resized to 64x64. It is then pushed onto the frame stack ``REM_STEP``
        times, so every slot of the initial state holds the first frame.

        Returns:
            The initial state with shape ``(1, *self.state_size)``.
        """
        obs = self.env.reset()
        obs = np.reshape(obs, (128,128,3))
        obs = np.array(obs).astype(np.uint8)
        obs = cv2.resize(obs, dsize=(64, 64), interpolation=cv2.INTER_CUBIC)

        state = np.expand_dims(np.zeros(self.state_size), axis=0)
        for _ in range(self.REM_STEP):
            # Feed the previous result back in; passing the same empty stack
            # on every iteration would fill only the first slot.
            state = self.GetImage(obs, state)

        return state

    def step(self, action, image_memory):
        """Advance the environment one step and update the frame stack.

        The raw observation is reshaped to 128x128x3, converted to uint8 and
        resized to 64x64 before it is pushed onto ``image_memory``.

        Returns:
            ``(next_state, reward, done, info)``, where ``reward`` and
            ``done`` are the first elements of the values returned by the
            environment.
        """
        raw_obs, reward, done, info = self.env.step(action)

        frame = np.array(np.reshape(raw_obs, (128,128,3))).astype(np.uint8)
        frame = cv2.resize(frame, dsize=(64, 64), interpolation=cv2.INTER_CUBIC)
        stacked = self.GetImage(frame, image_memory)

        return stacked, reward[0], done[0], info

    def train(self):
        """Train until ``self.EPISODES`` episodes have been run.

        Each episode collects a fixed 1000-step rollout, performs one
        ``replay`` update on it, and logs the moving-average score to
        TensorBoard. The environment is closed when training finishes.
        """
        while self.episode < self.EPISODES:
            # Reset episode
            score = 0
            state = self.reset()

            states, actions, rewards = [], [], []
            # Fixed-length rollout: the environment's `done` flag is not used
            # to end the episode.
            for _ in range(1000):
                # Use this instance rather than a module-level `agent` global.
                action = self.act(state)
                next_state, reward, _done, _ = self.step(action, state)

                states.append(state)
                actions.append(action)
                rewards.append(reward)

                score += reward
                state = next_state

            self.replay(states, actions, rewards, next_state)

            average = self.PlotModel(score, self.episode)

            # Track the best moving average (checkpoint saving is disabled).
            if average >= self.max_average:
                self.max_average = average

            with self.writer.as_default():
                tf.summary.scalar("server, average_reward", average, step=self.episode)
                self.writer.flush()

            self.episode += 1

        # Close this agent's environment; there is no module-level `env`.
        self.env.close()

    def test(self, Actor_name, Critic_name=None):
        """Play 100 greedy evaluation episodes with a saved model.

        Args:
            Actor_name: Path of the saved actor-critic model to load.
            Critic_name: Unused; kept for backward compatibility, since a
                single combined model is loaded.
        """
        self.load(Actor_name)

        # NOTE(review): whether the environment exposes render() is not
        # visible here, so it is only called when present.
        render = getattr(self.env, "render", None)

        for e in range(100):
            state = self.reset()
            done = False
            score = 0
            while not done:
                if callable(render):
                    render()
                # Greedy action: arg-max over the actor logits of the single
                # batch row.
                logits, _ = self.ActorCritic(state, training=False)
                action = np.argmax(logits.numpy()[0])
                state, reward, done, _ = self.step(action, state)
                score += reward
            print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))

        self.env.close()


if __name__ == "__main__":
    # env_name is only used by A3CAgent to build the checkpoint path; the
    # environment itself is the Godot pong build started in A3CAgent.__init__.
    env_name = 'PongDeterministic-v4'
    #env_name = 'Pong-v0'
    agent = A3CAgent(env_name)
    agent.train() # use as A3C

2024-09-29 01:03:31.162826: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-29 01:03:31.162867: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-29 01:03:31.162907: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-29 01:03:31.170860: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Created float32 vector test agent_action_serverserver size = 1
Created int32 vector env_action_serverserver size = 1
Created uint32 vector observation_serverserver size = 49152
Created float32 vector test reward_serverserver size = 1
Created int32 vector done_serverserver size = 1
Godot Engine v4.3.stable.custom_build.77dcf97d8 (2024-08-14 23:00:16 UTC) - https://godotengine.org


2024-09-29 01:03:33.966675: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-09-29 01:03:33.966702: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: kimbring2-ROG-Strix-GA35DX-G35DX
2024-09-29 01:03:33.966708: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: kimbring2-ROG-Strix-GA35DX-G35DX
2024-09-29 01:03:33.966796: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.183.1
2024-09-29 01:03:33.966819: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.183.1
2024-09-29 01:03:33.966823: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.183.1


OpenGL API 3.3.0 NVIDIA 535.183.01 - Compatibility - Using Device: NVIDIA - NVIDIA GeForce RTX 3090

Shared memory handle found:--handle:serverserver
Constructing semaphore sem_action_serverserver
Constructing semaphore sem_observation_serverserver
Unique id: 1
total_step: 0
check 1
check 2
check 3
check 4
check 5
check 6

total_step: 0
check 1
check 6

total_step: 1
check 1
check 2
check 3
check 4
check 5
check 6

total_step: 1
check 1
check 6

total_step: 2
check 1
check 2
check 3
check 4
check 5
check 6

total_step: 2
check 1
check 6

total_step: 3
check 1
check 2
check 3
check 4
check 5
check 6

total_step: 3
check 1
check 6

total_step: 4
check 1
check 2
check 3
check 4
check 5
check 6

total_step: 4
check 1
check 6

total_step: 5
check 1
check 2
check 3
check 4
check 5
check 6

total_step: 5
check 1
check 6

total_step: 6
check 1
check 2
check 3
check 4
check 5
check 6

total_step: 6
check 1
check 6

total_step: 7
check 1
check 2
check 3
check 4
check 5
check 6

total_step: 7
che