In [2]:
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

tfd = tfp.distributions

def take_vector_elements(vectors, indices):
    return tf.gather_nd(vectors, tf.stack([tf.range(tf.shape(vectors)[0]), indices], axis=1))

policy_array = np.array([0.523077488, 0.476922572])

dist = tfd.Categorical(logits=policy_array)
action = int(dist.sample())

print("action: ", action)

logit_array = tf.nn.softmax(policy_array)

selected_logit = logit_array[action]
print("selected_logit: ", selected_logit)
selected_log_prob = tf.math.log(selected_logit)
print("selected_log_prob: ", selected_log_prob)

log_probs = dist.log_prob(action)
print("log_probs: ", log_probs)

#target_action_probs = tf.nn.softmax(target_action_probs)
#target_action_log_probs = tf.math.log(target_action_probs)

action:  1
selected_logit:  tf.Tensor(0.48846331895377915, shape=(), dtype=float64)
selected_log_prob:  tf.Tensor(-0.7164908994613548, shape=(), dtype=float64)
log_probs:  tf.Tensor(-0.7164909, shape=(), dtype=float32)


In [18]:
import random

memory = deque(maxlen=100)
for i in range(0, 100):
    memory.append(i)
    
batch_size = 4
random.sample(memory, batch_size)
#random.sample(memory, batch_size)
#memory[6:6+batch_size]

[86, 38, 77, 69]

In [None]:
import os
import random
import gym
import pylab
import numpy as np
from collections import deque
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Input, Dense, Lambda, Add, Conv2D, Flatten, Reshape, LSTM
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras import backend as K
import tensorflow_probability as tfp
import cv2
import vtrace

tfd = tfp.distributions

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'


class OurModel(tf.keras.Model):
    def __init__(self, input_shape, action_space):
        super(OurModel, self).__init__()
        
        self.dense_0 = Dense(128, activation='relu', kernel_initializer='lecun_normal')
        self.dense_1 = Dense(action_space, kernel_initializer='lecun_normal')
        self.dense_2 = Dense(1, kernel_initializer='lecun_normal')
        
    def call(self, X_input):
        X_input = self.dense_0(X_input)
        action_prob = self.dense_1(X_input)
        value = self.dense_2(X_input)
        
        return action_prob, value


def safe_log(x):
  return tf.where(
      tf.math.equal(x, 0),
      tf.zeros_like(x),
      tf.math.log(tf.math.maximum(1e-12, x)))


def take_vector_elements(vectors, indices):
    return tf.gather_nd(vectors, tf.stack([tf.range(tf.shape(vectors)[0]), indices], axis=1))
    

class DQNAgent:
    def __init__(self, env_name):    
        self.env = gym.make(env_name)
        self.env_name = env_name       
        self.action_size = self.env.action_space.n
        self.EPISODES = 5000000
        
        # Instantiate memory
        memory_size = 5000
        self.memory = []

        self.batch_size = 16

        self.Save_Path = 'Models'
        if not os.path.exists(self.Save_Path): os.makedirs(self.Save_Path)
        self.scores, self.episodes, self.average = [], [], []

        self.Model_name = os.path.join(self.Save_Path, self.env_name + "_IMPALA.h5")

        self.COLS = 4
        
        self.state_size = (self.COLS)
        self.image_memory = np.zeros(self.state_size)
        
        self.model = self.make_model('Model', self.state_size, self.action_size)
        
        #self.model.summary()
        self.learning_rate = 0.001
        self.optimizer = tf.keras.optimizers.Adam(self.learning_rate)
        
    def remember(self, state, action, policy, reward, next_state, done):
        experience = state, action, policy, reward, next_state, done
        self.memory.append((experience))

    def act(self, state):
        prediction = self.model(state, training=False)
        
        dist = tfd.Categorical(logits=prediction[0])
        action = int(dist.sample()[0])
        policy = prediction[0]
        
        return action, policy
    
    def make_model(self, name, input_shape, action_space):
        state = tf.keras.Input(shape=input_shape)
        head = OurModel(input_shape, action_space)(state)
        model = tf.keras.Model(inputs=state, outputs=head, name=name)
        
        return model
    
    #@tf.function
    def update(self, states, actions, agent_policies, rewards, next_states, dones):
        online_variables = self.model.trainable_variables
        with tf.GradientTape() as tape:
            tape.watch(online_variables)
            
            learner_outputs = self.model(states, training=True)
            
            agent_logits = tf.nn.softmax(agent_policies[:-1])
            actions = actions[:-1]
            rewards = rewards[1:]
            dones = dones[1:]
        
            learner_policies = learner_outputs[0]
            learner_logits = tf.nn.softmax(learner_policies[:-1])
            
            learner_values = learner_outputs[1]
            learner_values = tf.squeeze(learner_values)
            
            bootstrap_value = learner_values[-1]
            learner_values = learner_values[:-1]
            
            discounting = 0.99
            discounts = tf.cast(~dones, tf.float32) * discounting
            
            actions = tf.convert_to_tensor(actions, dtype=tf.int32)
            
            tf.print("learner_logits.shape: ", learner_logits.shape)
            tf.print("agent_logits.shape: ", agent_logits.shape)
            
            target_action_probs = take_vector_elements(learner_logits, actions)
            target_action_log_probs = tf.math.log(target_action_probs)
            
            behaviour_action_probs = take_vector_elements(agent_logits, actions)
            behaviour_action_log_probs = tf.math.log(behaviour_action_probs)
            
            lambda_ = 1.0
            
            log_rhos = target_action_log_probs - behaviour_action_log_probs
            
            log_rhos = tf.convert_to_tensor(log_rhos, dtype=tf.float32)
            discounts = tf.convert_to_tensor(discounts, dtype=tf.float32)
            rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
            values = tf.convert_to_tensor(learner_values, dtype=tf.float32)
            bootstrap_value = tf.convert_to_tensor(bootstrap_value, dtype=tf.float32)
            
            clip_rho_threshold = tf.convert_to_tensor(1.0, dtype=tf.float32)
            clip_pg_rho_threshold = tf.convert_to_tensor(1.0, dtype=tf.float32)
            
            rhos = tf.math.exp(log_rhos)
            
            clipped_rhos = tf.minimum(clip_rho_threshold, rhos, name='clipped_rhos')
            
            cs = tf.minimum(1.0, rhos, name='cs')
            cs *= tf.convert_to_tensor(lambda_, dtype=tf.float32)

            values_t_plus_1 = tf.concat([values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
            deltas = clipped_rhos * (rewards + discounts * values_t_plus_1 - values)
        
            #deltas -= np.mean(deltas) # normalizing the result
            #deltas /= np.std(deltas) # divide by standard deviation
        
            acc = tf.zeros_like(bootstrap_value)
            vs_minus_v_xs = []
            for i in range(int(discounts.shape[0]) - 1, -1, -1):
                discount, c, delta = discounts[i], cs[i], deltas[i]
                acc = delta + discount * c * acc
                vs_minus_v_xs.append(acc)  
            
            vs_minus_v_xs = vs_minus_v_xs[::-1]
            
            vs = tf.add(vs_minus_v_xs, values, name='vs')
            vs_t_plus_1 = tf.concat([vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
            clipped_pg_rhos = tf.minimum(clip_pg_rho_threshold, rhos, name='clipped_pg_rhos')
            
            pg_advantages = (clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))
            
            vs = tf.stop_gradient(vs)
            pg_advantages = tf.stop_gradient(pg_advantages)
            
            actor_loss = -tf.reduce_mean(target_action_log_probs * pg_advantages)
            
            baseline_cost = 0.5
            v_error = values - vs
            critic_loss = baseline_cost * 0.5 * tf.reduce_mean(tf.square(v_error))
            
            total_loss = actor_loss + critic_loss

        grads = tape.gradient(total_loss, self.model.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.model.trainable_variables))
    
    def replay(self):
        memory_len = len(self.memory)
        if len(self.memory) > self.batch_size:
            start_index = random.randint(0, memory_len - self.batch_size)
            minibatch = self.memory[start_index:start_index+self.batch_size]
        else:
            return

        state = np.zeros((self.batch_size, self.state_size), dtype=np.float32)
        action = np.zeros(self.batch_size, dtype=np.int32)
        policy = np.zeros((self.batch_size, self.action_size), dtype=np.float32)
        reward = np.zeros(self.batch_size, dtype=np.float32)
        next_state = np.zeros((self.batch_size, self.state_size), dtype=np.float32)
        done = np.zeros(self.batch_size, dtype=np.bool)
      
        for i in range(len(minibatch)):
            state[i], action[i], policy[i], reward[i], next_state[i], done[i] = minibatch[i]

        self.update(state, action, policy, reward, next_state, done)
        
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)

    pylab.figure(figsize=(18, 9))
    def PlotModel(self, score, episode):
        self.scores.append(score)
        self.episodes.append(episode)
        self.average.append(sum(self.scores[-50:]) / len(self.scores[-50:]))
        pylab.plot(self.episodes, self.average, 'r')
        pylab.plot(self.episodes, self.scores, 'b')
        pylab.ylabel('Score', fontsize=18)
        pylab.xlabel('Games', fontsize=18)
        try:
            pylab.savefig(self.env_name + "_IMAPLA.png")
        except OSError:
            pass

        # no need to worry about model, when doing a lot of experiments
        self.Model_name = os.path.join(self.Save_Path, self.env_name + "_IMAPLA.h5")

        return self.average[-1]

    def imshow(self, image, rem_step=0):
        cv2.imshow("cartpole" + str(rem_step), image[:,:,rem_step])
        if cv2.waitKey(25) & 0xFF == ord("q"):
            cv2.destroyAllWindows()
        
    def reset(self):
        state = self.env.reset()

        return state

    def step(self,action):
        next_state, reward, done, info = self.env.step(action)
        
        return next_state, reward, done, info
    
    def run(self):
        max_average = 190.0
        total_step = 0
        for e in range(self.EPISODES):
            state = self.reset()
            state = np.expand_dims(state, 0)
            
            done = False
            score = 0
            SAVING = ''
            while not done:
                #self.env.render()
                action, policy = self.act(state)
                
                next_state, reward, done, _ = self.step(action)
                next_state = np.expand_dims(next_state, 0)
                
                self.remember(state, action, policy, reward / 200.0, next_state, done)
                state = next_state
                score += reward

                if done:
                    # every episode, plot the result
                    average = self.PlotModel(score, e)

                    # saving best models
                    if average >= max_average:
                        max_average = average
                        self.save(self.Model_name)
                        SAVING = "SAVING"
                    else:
                        SAVING = ""

                    self.save(self.Model_name)
                    print("episode: {}/{}, score: {}, average: {:.2f} {}".format(e, self.EPISODES, score, average, SAVING))
                    
                    break
                
                if total_step % 10 == 0:
                    # train model
                    self.replay()
                    
                total_step += 1
                    
        # close environemnt when finish training
        self.env.close()

    def test(self, Model_name):
        self.load(Model_name)
        for e in range(self.EPISODES):
            state = self.reset()
            done = False
            score = 0
            while not done:
                self.env.render()
                action = np.argmax(self.model.predict(state))
                state, reward, done, _ = self.step(action)
                score += reward
                if done:
                    print("episode: {}/{}, score: {}".format(e, self.EPISODES, score))
                    break
                    
        self.env.close()

        
if __name__ == "__main__":
    env_name = 'CartPole-v0'
    agent = DQNAgent(env_name)
    agent.run()
    #agent.test('Models/Pong-v0_DDQN_CNN.h5')

episode: 0/5000000, score: 12.0, average: 12.00 
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  TensorShape([15, 2])
episode: 1/5000000, score: 16.0, average: 14.00 
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  TensorShape([15, 2])
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  TensorShape([15, 2])
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  TensorShape([15, 2])
episode: 2/5000000, score: 34.0, average: 20.67 
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  TensorShape([15, 2])
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  TensorShape([15, 2])
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  TensorShape([15, 2])
episode: 3/5000000, score: 28.0, average: 22.50 
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  TensorShape([15, 2])
episode: 4/5000000, score: 11.0, average: 20.20 
learner_logits.shape:  TensorShape([15, 2])
agent_logits.shape:  Te