In [None]:
# Tutorial by www.pylessons.com
# Tutorial written for - Tensorflow 2.3.1

import os
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import random
import gym
import pylab
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import cv2
import threading
from threading import Thread, Lock
import time
import tensorflow_probability as tfp
from typing import Any, List, Sequence, Tuple

# Short alias for the TensorFlow Probability distributions namespace.
tfd = tfp.distributions


class OurModel(tf.keras.Model):
    """Two-headed network: one shared hidden layer feeding a policy head and a value head."""

    def __init__(self, input_shape, action_space):
        """Create the layers.

        Args:
          input_shape: accepted for interface compatibility; not used here.
          action_space: number of units in the policy (action-logit) head.
        """
        super().__init__()

        # Shared 128-unit ReLU trunk.
        self.dense_0 = Dense(128, activation='relu')
        # Policy head: one un-activated output per action.
        self.dense_1 = Dense(action_space)
        # Value head: a single un-activated output.
        self.dense_2 = Dense(1)

    def call(self, X_input):
        """Run the forward pass and return `(action_logit, value)`."""
        hidden = self.dense_0(X_input)
        return self.dense_1(hidden), self.dense_2(hidden)


def safe_log(x):
    """Element-wise natural log that yields 0 where `x` is exactly zero.

    Every other entry is clamped from below at 1e-12 before the log is taken.
    """
    is_zero = tf.math.equal(x, 0)
    clamped = tf.math.maximum(1e-12, x)
    return tf.where(is_zero, tf.zeros_like(x), tf.math.log(clamped))


def take_vector_elements(vectors, indices):
    """
    Pick one component out of every row of a batch of vectors.
    Args:
      vectors: a [batch x dims] Tensor.
      indices: an int32 Tensor with `batch` entries; entry i is the column
        to read from row i.
    Returns:
      A Tensor with `batch` entries, one for each vector.
    """
    row_count = tf.shape(vectors)[0]
    row_ids = tf.range(row_count)
    # Pair every row number with its requested column: [[0, i0], [1, i1], ...].
    coords = tf.stack([row_ids, indices], axis=1)
    return tf.gather_nd(vectors, coords)


# Module-level Keras loss objects.
# NOTE(review): none of these is called by the active code visible in this
# file -- confirm whether they are still needed before relying on them.
# Huber loss, summed (not averaged) over the batch.
huber_loss = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)
# Sparse categorical cross-entropy taking raw logits, summed over the batch.
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=tf.keras.losses.Reduction.SUM)
# Mean squared error with the Keras default reduction.
mse_loss = tf.keras.losses.MeanSquaredError()


class A3CAgent:
    """Actor-critic agent whose single shared network is trained by worker threads.

    Each worker plays whole episodes in its own gym environment and then,
    while holding `self.lock`, applies gradient updates to the shared
    `self.ActorCritic` model.
    """

    def __init__(self, env_name):
        """Create the environment, bookkeeping state, network and optimizer.

        Args:
          env_name: gym environment id passed to `gym.make`.
        """
        self.env_name = env_name
        self.env = gym.make(env_name)
        self.action_size = self.env.action_space.n
        # max_average starts at -21.0 (the original author tuned this for Pong).
        self.EPISODES, self.episode, self.max_average = 20000, 0, -21.0
        self.lock = Lock()
        # NOTE: self.lr only appears in the output file name below; the
        # optimizer is configured from self.learning_rate.
        self.lr = 0.000025

        self.ROWS = 80
        self.COLS = 80
        self.REM_STEP = 4

        # Plot memory: per-episode score, episode index and moving average.
        self.scores, self.episodes, self.average = [], [], []

        self.Save_Path = 'Models'
        # Only forwarded to OurModel as `input_shape`.
        self.state_size = 4

        os.makedirs(self.Save_Path, exist_ok=True)
        self.path = '{}_A3C_{}'.format(self.env_name, self.lr)
        self.model_name = os.path.join(self.Save_Path, self.path)

        # Shared actor-critic network.
        self.ActorCritic = OurModel(input_shape=self.state_size, action_space=self.action_size)

        self.learning_rate = 0.001
        self.optimizer = tf.keras.optimizers.Adam(self.learning_rate)

    def act(self, state):
        """Sample an action from the policy for `state`.

        Args:
          state: batched observation fed directly to the network.

        Returns:
          The tensor produced by sampling a categorical distribution built
          from the softmax of the policy logits.
        """
        logits = self.ActorCritic(state, training=False)[0]
        probs = tf.nn.softmax(logits)
        return tfd.Categorical(probs=probs).sample()

    def discount_rewards(self, reward):
        """Return the normalised gamma-discounted returns of one episode.

        Args:
          reward: sequence of per-step rewards, in time order.

        Returns:
          A float64 array of the same length holding the discounted returns
          (gamma = 0.99), shifted to zero mean and, when the standard
          deviation is non-zero, scaled to unit standard deviation.
        """
        gamma = 0.99    # discount rate
        running_add = 0.0
        # Always float64: an integer buffer (inherited from integer rewards)
        # would reject the in-place float normalisation below.
        discounted_r = np.zeros(len(reward), dtype=np.float64)
        for i in reversed(range(len(reward))):
            running_add = running_add * gamma + reward[i]
            discounted_r[i] = running_add

        if discounted_r.size == 0:
            return discounted_r

        discounted_r -= np.mean(discounted_r)
        std = np.std(discounted_r)
        # Constant returns (e.g. a one-step episode) have zero spread; leave
        # them centred at zero instead of dividing by zero and producing NaN.
        if std > 0:
            discounted_r /= std

        return discounted_r

    def get_loss(self, states, actions, discounted_r):
        """Compute the combined actor-critic loss for one mini-batch.

        Args:
          states: batch of observations fed to the network.
          actions: int32 tensor with one chosen action index per sample.
          discounted_r: normalised discounted return per sample.

        Returns:
          Scalar tensor `actor_loss + 0.5 * critic_loss`.
        """
        returns = tf.reshape(tf.cast(discounted_r, tf.float32), [-1])

        prediction = self.ActorCritic(states, training=True)
        policies = prediction[0]
        values = prediction[1][:, 0]

        # Normalise over the action axis FIRST, then pick the taken action.
        # Gathering the logit first and soft-maxing afterwards would
        # normalise across the batch, which is not a log-probability.
        log_probs = tf.nn.log_softmax(policies)
        selected_log_probs = take_vector_elements(log_probs, actions)

        # The advantage is a fixed weight for the policy gradient, so no
        # gradient may flow into the critic through it.
        advantages = tf.stop_gradient(returns - values)

        actor_loss = -tf.math.reduce_mean(selected_log_probs * advantages)
        critic_loss = tf.reduce_mean(tf.square(values - returns))

        return actor_loss + 0.5 * critic_loss

    def replay(self, states, actions, rewards):
        """Train the shared network on one finished episode.

        The episode is processed in consecutive mini-batches of
        `divide_size` steps; a shorter final mini-batch covers any remainder.

        Args:
          states: list of batched observations, one per step.
          actions: list of action indices, one per step.
          rewards: list of rewards, one per step.
        """
        states = tf.concat(states, 0)
        actions = tf.reshape(tf.cast(actions, tf.int32), [-1])

        discounted_r = self.discount_rewards(rewards)

        divide_size = 2
        batch_size = states.shape[0]
        for start in range(0, batch_size, divide_size):
            end = min(start + divide_size, batch_size)
            with tf.GradientTape() as tape:
                total_loss = self.get_loss(states[start:end, :],
                                           actions[start:end],
                                           discounted_r[start:end])

            variables = self.ActorCritic.trainable_variables
            grads = tape.gradient(total_loss, variables)
            self.optimizer.apply_gradients(zip(grads, variables))

    def load(self, model_name):
        """Replace the current network with the model stored at `model_name`."""
        self.ActorCritic = load_model(model_name, compile=False)

    def save(self):
        """Save the current network to `self.model_name`."""
        self.ActorCritic.save(self.model_name)

    # Executed once, when the class body is evaluated: opens the figure that
    # PlotModel draws into.
    pylab.figure(figsize=(18, 9))

    def PlotModel(self, score, episode):
        """Record an episode score and periodically save the score plot.

        Args:
          score: total reward of the finished episode.
          episode: index of the finished episode.

        Returns:
          The average of the most recent (up to 50) scores.
        """
        self.scores.append(score)
        self.episodes.append(episode)
        recent = self.scores[-50:]
        self.average.append(sum(recent) / len(recent))
        # Redraw on every 100th episode (episode 0 is skipped).
        if episode > 0 and episode % 100 == 0:
            pylab.plot(self.episodes, self.scores, 'b')
            pylab.plot(self.episodes, self.average, 'r')
            pylab.ylabel('Score', fontsize=18)
            pylab.xlabel('Steps', fontsize=18)
            try:
                pylab.savefig(self.path+".png")
            except OSError:
                # Best effort: a failed image write must not stop training.
                pass

        return self.average[-1]

    def imshow(self, image, rem_step=0):
        """Show slice `rem_step` of `image` in an OpenCV window; 'q' closes all windows."""
        cv2.imshow(self.model_name+str(rem_step), image[rem_step,...])
        if cv2.waitKey(25) & 0xFF == ord("q"):
            cv2.destroyAllWindows()
            return

    def reset(self, env):
        """Reset `env` and return whatever `env.reset()` returns."""
        return env.reset()

    def step(self, action, env, image_memory):
        """Advance `env` by one step.

        Args:
          action: action to take; converted with `int(...)` before use.
          env: environment to step.
          image_memory: unused; kept for interface compatibility.

        Returns:
          The `(next_state, reward, done, info)` tuple from `env.step`.
        """
        next_state, reward, done, info = env.step(int(action))

        return next_state, reward, done, info

    def train(self, n_threads):
        """Run training with `n_threads` worker threads and wait for them all.

        Args:
          n_threads: number of worker threads (one environment each).
        """
        self.env.close()

        # One private environment per worker.
        envs = [gym.make(self.env_name) for _ in range(n_threads)]

        threads = [threading.Thread(
                target=self.train_threading,
                daemon=True,
                args=(self, env, i)) for i, env in enumerate(envs)]

        # Stagger worker start-up by two seconds each.
        for t in threads:
            time.sleep(2)
            t.start()

        # join() already blocks until the worker is done; no extra sleep needed.
        for t in threads:
            t.join()

    def train_threading(self, agent, env, thread):
        """Worker loop: play episodes in `env` and train the shared network.

        Args:
          agent: agent used to choose actions (`train` passes this instance).
          env: this worker's private environment.
          thread: worker index, used only in the progress message.
        """
        while self.episode < self.EPISODES:
            score, done = 0, False
            state = np.expand_dims(self.reset(env), 0)

            states, actions, rewards = [], [], []
            while not done:
                action = agent.act(state)
                next_state, reward, done, _ = self.step(action, env, state)
                next_state = np.expand_dims(next_state, 0)

                states.append(state)
                actions.append(int(action))
                rewards.append(reward)

                score += reward
                state = next_state

            # `with` guarantees the lock is released even if replay raises.
            with self.lock:
                self.replay(states, actions, rewards)

            with self.lock:
                average = self.PlotModel(score, self.episode)
                # NOTE: only the tag is printed; the save() call is disabled.
                if average >= self.max_average:
                    self.max_average = average
                    SAVING = "SAVING"
                else:
                    SAVING = ""

                print("episode: {}/{}, thread: {}, score: {}, average: {:.2f} {}".format(self.episode, self.EPISODES, thread, score, average, SAVING))
                if self.episode < self.EPISODES:
                    self.episode += 1

        env.close()

    def test(self, Actor_name, Critic_name):
        """Load a saved model and play 100 rendered episodes greedily.

        Args:
          Actor_name: path of the saved model to load.
          Critic_name: unused; kept for interface compatibility, since
            `load` takes a single model path.
        """
        self.load(Actor_name)
        for e in range(100):
            # The network expects a leading batch dimension.
            state = np.expand_dims(self.reset(self.env), 0)
            done = False
            score = 0
            while not done:
                self.env.render()
                logits = self.ActorCritic(state, training=False)[0]
                action = int(np.argmax(logits))
                next_state, reward, done, _ = self.step(action, self.env, state)
                state = np.expand_dims(next_state, 0)
                score += reward

            print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))

        self.env.close()


if __name__ == "__main__":
    # Environment to train on; 'Pong-v0' is the alternative the author tried.
    env_name = 'CartPole-v0'
    agent = A3CAgent(env_name)

    # A single worker thread; raise n_threads for multi-worker A3C training.
    agent.train(n_threads=1)
    # To evaluate a saved model instead, call agent.test(<model path>, '').

2021-10-23 07:34:19.365694: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-10-23 07:34:19.365741: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: kimbring2-GF75-Thin-10UEK
2021-10-23 07:34:19.365746: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: kimbring2-GF75-Thin-10UEK
2021-10-23 07:34:19.365816: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 460.91.3
2021-10-23 07:34:19.365833: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 460.91.3
2021-10-23 07:34:19.365838: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 460.91.3
2021-10-23 07:34:19.366686: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use 

episode: 0/20000, thread: 0, score: 32.0, average: 32.00 SAVING
episode: 1/20000, thread: 0, score: 13.0, average: 22.50 
episode: 2/20000, thread: 0, score: 15.0, average: 20.00 
episode: 3/20000, thread: 0, score: 53.0, average: 28.25 
episode: 4/20000, thread: 0, score: 11.0, average: 24.80 
episode: 5/20000, thread: 0, score: 20.0, average: 24.00 
episode: 6/20000, thread: 0, score: 32.0, average: 25.14 
episode: 7/20000, thread: 0, score: 22.0, average: 24.75 
episode: 8/20000, thread: 0, score: 37.0, average: 26.11 
episode: 9/20000, thread: 0, score: 21.0, average: 25.60 
episode: 10/20000, thread: 0, score: 43.0, average: 27.18 
episode: 11/20000, thread: 0, score: 17.0, average: 26.33 
episode: 12/20000, thread: 0, score: 40.0, average: 27.38 
episode: 13/20000, thread: 0, score: 17.0, average: 26.64 
episode: 14/20000, thread: 0, score: 12.0, average: 25.67 
episode: 15/20000, thread: 0, score: 30.0, average: 25.94 
episode: 16/20000, thread: 0, score: 25.0, average: 25.88 
e

episode: 135/20000, thread: 0, score: 167.0, average: 74.52 
episode: 136/20000, thread: 0, score: 105.0, average: 72.62 
episode: 137/20000, thread: 0, score: 192.0, average: 75.00 
episode: 138/20000, thread: 0, score: 92.0, average: 73.42 
episode: 139/20000, thread: 0, score: 185.0, average: 75.12 
episode: 140/20000, thread: 0, score: 28.0, average: 71.68 
episode: 141/20000, thread: 0, score: 83.0, average: 70.28 
episode: 142/20000, thread: 0, score: 144.0, average: 72.48 
episode: 143/20000, thread: 0, score: 45.0, average: 71.94 
episode: 144/20000, thread: 0, score: 200.0, average: 75.10 
episode: 145/20000, thread: 0, score: 91.0, average: 76.70 
episode: 146/20000, thread: 0, score: 36.0, average: 76.86 
episode: 147/20000, thread: 0, score: 200.0, average: 79.80 
episode: 148/20000, thread: 0, score: 166.0, average: 81.82 SAVING
episode: 149/20000, thread: 0, score: 35.0, average: 81.60 
episode: 150/20000, thread: 0, score: 111.0, average: 83.12 SAVING
episode: 151/20000,