In [1]:
import gym
from gym import wrappers
import time
import os
os.add_dll_directory('C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.5/bin')
import numpy as np
import pickle
from joblib import Parallel, delayed

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Activation, Dropout
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger

# If you have more than 1 GPU, you might want to specify which GPU for training.
# In this case, I have 2 GPU and the second one is RTX 2080ti, so I pick the `second` one.
os.environ['CUDA_VISIBLE_DEVICES']='0' # The second
#tf.config.set_soft_device_placement(True)
gpus = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(gpus[0], True)

print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("GPU is", "available" if tf.config.list_physical_devices("GPU") else "NOT AVAILABLE")

Version:  2.7.0
Eager mode:  True
GPU is available


In [2]:
folder_name = '20211215_BipedalWalker-v3_SAC'
env_name = 'BipedalWalker-v3'

In [3]:
n_action = 4
n_state_var = 24
gamma = 0.996
lr = 1e-3
dr = 0.1 # dropout rate
wv_dim = 64
ff_dim = 2048

In [4]:
env = gym.make(env_name)
env = wrappers.Monitor(env, "./gym-results", force=True)
observation = env.reset()



## Test environment

In [5]:
# The action domain
print('-- Action ---')
print(env.action_space)
print('')
# The observing domain
print('--- Observation ---')
print(env.observation_space)
env.close()

-- Action ---
Box([-1. -1. -1. -1.], [1. 1. 1. 1.], (4,), float32)

--- Observation ---
Box([-inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf
 -inf -inf -inf -inf -inf -inf -inf -inf -inf -inf], [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf
 inf inf inf inf inf inf], (24,), float32)


In [6]:
action = env.action_space.sample()
env.step(action)

(array([-0.0033001 , -0.01897045, -0.0091202 ,  0.02070081, -0.29741445,
        -0.6126805 ,  1.4828402 ,  0.99364257,  1.        ,  0.32402867,
         0.85520095,  0.1335144 , -0.999922  ,  1.        ,  0.45091254,
         0.45603332,  0.47199342,  0.5007652 ,  0.54633844,  0.6162627 ,
         0.72539467,  0.90622747,  1.        ,  1.        ], dtype=float32),
 -0.04889316515003879,
 False,
 {})

In [7]:
st = time.time()
n_trial = 1
n_action = 1000
# Initialize a New Env
for _ in range(n_trial):
    if env:
        env.close()
    env = gym.make(env_name)
    #env = wrappers.Monitor(env, "./gym-results", force=True)
    state = env.reset()
    for _ in range(n_action):
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        if done: break
env.close()

# Make models

In [8]:
# The kernel of value function and the actor( a.k.a. prediction function)
def getKernel():
    _input = Input((n_state_var,))
    m = Dense(wv_dim)(_input)
    for i in range(1):
        tmp = Dense(ff_dim)(m)
        tmp = Activation('relu')(tmp)
        tmp = Dense(wv_dim)(tmp)
        m = BatchNormalization(epsilon=1e-6)(tmp+m)
        m = Activation('relu')(m)
    model = Model(
        _input,
        m,
        name = 'kernel',
    ) 
    return model

# Used to estimate winning percentage
# s -> v
def getValueFunction(kernal):
    s_input = Input((n_state_var,))
    m = kernel(s_input)
    m = Dropout(dr)(m)
    reward = Dense(1)(m)
    model = Model(
        s_input,
        reward,
        name = 'value_function'
    )
    return model

# Used to modeling the actions
# s -> a
def getActor(kernel):
    s_input = Input((n_state_var,))
    m = kernel(s_input)
    output = Dense(n_action)(m)
    output = Activation('sigmoid')(output)
    model = Model(
        s_input,
        output,
        name = 'actor',
    ) 
    return model

# Used to modeling immediate rewards
# s,a -> r
def getRewardFunction():
    s_input = Input((n_state_var,))
    s_emb = Dense(wv_dim)(s_input)
    a_input = Input((n_action))
    a_emb = Dense(wv_dim)(a_input)
    m = BatchNormalization(epsilon=1e-6)(a_emb+s_emb)
    for i in range(2):
        tmp = Dense(ff_dim)(m)
        tmp = Activation('relu')(tmp)
        tmp = Dense(wv_dim)(tmp)
        m = BatchNormalization(epsilon=1e-6)(m+tmp)
        m = Activation('relu')(m)
    m = Dropout(dr)(m)
    output = Dense(1)(m)
    model = Model(
        [s_input, a_input],
        output,
        name = 'reward_function',
    )
    return model
    

In [9]:
kernel = getKernel()
actor  = getActor(kernel)
# Not to share the kernel
kernel = getKernel()
v_func = getValueFunction(kernel)
r_func = getRewardFunction()
v_func.summary()
r_func.summary()
actor.summary()

Model: "value_function"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 24)]              0         
                                                                 
 kernel (Functional)         (None, 64)                266112    
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                 65        
                                                                 
Total params: 266,177
Trainable params: 266,049
Non-trainable params: 128
_________________________________________________________________
Model: "reward_function"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape        

In [10]:
def q_value(r_arr, gamma = gamma): # 0.996
    #print(r_arr)
    decay_arr = np.array([gamma**i for i in range(len(r_arr)+1)])
    #print(decay_arr)
    q_arr = [sum(r_arr[i:] * decay_arr[:-i-1]) for i in range(len(r_arr))]
    return q_arr

# Reinforcement Learning

## Value function

In [11]:
v_func_optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
v_loss_object = tf.keras.losses.MeanSquaredError(reduction = 'none')
entropy = tf.keras.losses.BinaryCrossentropy(reduction = 'none')

v_func.compile(
    loss='mean_squared_error',
    optimizer=Adam(),
    metrics=['mean_squared_error'],
)

# The loss for policy gradient with partition function
def v_func_loss(pred, policy_a, r):
    loss = 0.5 * (pred - r - entropy())**2
    return tf.reduce_mean(loss)

@tf.function()
def train_v_step(s_in, policy_a, r):
    pred = None
    loss = None
    with tf.GradientTape() as tape:
        pred = v_func(s_in)
        loss = v_func_loss(pred, policy_a, r)
    gradients = tape.gradient(loss, v_func.trainable_variables)    
    v_func_optimizer.apply_gradients(zip(gradients, v_func.trainable_variables))    
    return loss

## Reward function

In [12]:
r_func_optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
r_loss_object = tf.keras.losses.MeanSquaredError(reduction = 'none')

r_func.compile(
    loss = 'mean_squared_error',
    optimizer=Adam(),
    metrics=['mean_squared_error'],
)

# The loss for policy gradient with partition function
def r_func_loss(pred, v1, reward):
    loss = 0.5 * (pred - gamma*v1 - reward)**2
    return tf.reduce_mean(loss)

@tf.function()
def train_r_step(s_in, a_in, v1, reward):
    pred = None
    loss = None
    with tf.GradientTape() as tape:
        pred = r_func([s_in, a_in])
        loss = r_func_loss(pred, v1, reward)
    gradients = tape.gradient(loss, r_func.trainable_variables)    
    r_func_optimizer.apply_gradients(zip(gradients, r_func.trainable_variables))    
    return loss

## Actor(Predition function) 

In [13]:
actor_optimizer = tf.keras.optimizers.Adam(learning_rate = lr)
loss_object = tf.keras.losses.BinaryCrossentropy(reduction = 'none')

actor.compile(
    loss='binary_crossentropy',
    optimizer=Adam(),
    metrics=['binary_crossentropy'],
)

# The loss for policy gradient with partition function
def actor_loss(real, pred, r, beta = 1.0):
    loss = loss_object(real, pred)
    loss = loss * r - beta * entropy(pred, pred)
    return tf.reduce_mean(loss)

@tf.function()
def train_a_step(s_in, real, r):
    pred = None
    loss = None
    with tf.GradientTape() as tape:
        pred = actor(s_in)
        loss = actor_loss(real, pred, r)
    gradients = tape.gradient(loss, actor.trainable_variables)    
    actor_optimizer.apply_gradients(zip(gradients, actor.trainable_variables))    
    return loss

## Replay buffer

In [14]:
class replay_buffer():
    def __init__(self, n_replay_buffer):
        self.n_replay_buffer = n_replay_buffer
        self.holder = []
        return
    # Put new trajectories into the replay buffer
    def stack(self, _input):
        self.holder.extend(list(_input))
        self.holder = self.holder[-self.n_replay_buffer:]
        return
    # Take some of trajectories from the replay buffer
    def sample(self, q_len, idx = None):
        if q_len > len(self.holder):
            return self.holder
        elif isinstance(idx, type(None)):
            idx = np.arange(self.n_replay_buffer)
            np.random.shuffle(idx)
            return [self.holder[i] for i in idx[:q_len]]
        else:
            return [self.holder[i] for i in idx[:q_len]]

In [15]:
example_rb = replay_buffer(16)

In [16]:
example_rb.stack(np.ones((3,3)))
example_rb.holder

[array([1., 1., 1.]), array([1., 1., 1.]), array([1., 1., 1.])]

# Entire training progress

In [17]:
# Constants and Initialization
n_batch = 1000
n_trial = 32
n_step = 1000
n_replay_buffer = 102400
play_size = 10240

# List for recording losses
v_loss_list = []
a_loss_list = []
r_loss_list = []
score_list = []
best_score = None

# Replay buffer
s_replay_buffer = replay_buffer(n_replay_buffer)
s1_replay_buffer = replay_buffer(n_replay_buffer)
a_replay_buffer = replay_buffer(n_replay_buffer)
r_replay_buffer = replay_buffer(n_replay_buffer)

In [18]:
def helper(idx):
    env = gym.make(env_name)
    #env = wrappers.Monitor(env, "./gym-results", force=True)
    state = env.reset()
    state_list = []
    state_list.append(state)
    s1_list = []
    action_list = []
    reward_list = []
    kernel = getKernel()
    actor  = getActor(kernel)
    try:
        actor.load_weights(f'{folder_name}/actor_current.h5')
    except:
        pass
    #-----------------------------------
    # The Game Section
    for i in range(n_step):
        action = actor(np.array([state])).numpy()
        action = action[0]*2 - 1
        state, reward, done, info = env.step(action)
        s1_list.append(state)
        action_list.append(action)
        #-------------------------------
        # Not to use -100 reward
        if reward == -100:
            reward_list.append(-0.03)
        else:
            reward_list.append(reward)
        #-------------------------------
        if done:
            break
        # Don't save the last env if the game is not done at the end
        elif i == n_step -1:
            break
        else:
            state_list.append(state)
    env.close()
    return state_list, s1_list, action_list, reward_list

In [19]:
env.close()
st = time.time()
for b in range(n_batch):
    # Clear Previous Behaviors
    state_array = []
    s1_array = []
    reward_array = []
    action_array = []
    # The model for parallel
    actor.save(f'{folder_name}/actor_current.h5')
    # Multiprocessing
    #for idx in range(n_trial):
    #    state_list, s1_list, action_list, reward_list = helper(idx)
    #    state_array.extend(state_list)
    #    s1_array.extend(s1_list)
    #    action_array.extend(action_list)
    #    reward_array.extend(reward_list)
    state_holder, s1_holder, action_holder, reward_holder = zip(*Parallel(n_jobs = 8)(delayed(helper)(i) for i in range(n_trial)))
    #-----------------------------------
    # Trajectory holder
    for i in range(n_trial):
        state_array.extend(state_holder[i])
        s1_array.extend(s1_holder[i])
        action_array.extend(action_holder[i])
        reward_array.extend(reward_holder[i])
    score = np.mean([r[-1] for r in reward_holder])
    #score = np.mean(reward_array)
    score_list.append(score)
    if isinstance(best_score, type(None)) or (score > best_score):
        best_score = score
        v_func.save(f'{folder_name}/v_func.h5')
        r_func.save(f'{folder_name}/r_func.h5')
        actor.save(f'{folder_name}/actor.h5')
        print('Model saved')
    
    #---------------------------------
    # Save to replay buffer
    s_replay_buffer.stack( state_array)
    s1_replay_buffer.stack(s1_array)
    a_replay_buffer.stack( action_array)
    r_replay_buffer.stack( reward_array)
    
    buffer_idx = np.arange(len(s_replay_buffer.holder))
    np.random.shuffle(buffer_idx)
    
    state_array = np.array(s_replay_buffer.sample(play_size, buffer_idx), dtype = np.float32)
    s1_array = np.array(s1_replay_buffer.sample(play_size, buffer_idx), dtype = np.float32)
    action_array = np.array(a_replay_buffer.sample(play_size, buffer_idx), dtype = np.float32)
    reward_array = np.array(r_replay_buffer.sample(play_size, buffer_idx), dtype = np.float32)
    #---------------------------------
    # Update Value function
    fake_reward_array = r_func.predict([state_array, action_array])
    #p_action_array = actor.predict(state_array)
    #onehot_action_array = tf.one_hot(action_array, depth = n_action)
    #policy_array = tf.reduce_sum(p_action_array * onehot_action_array, axis = -1)
    policy_array = actor.predict(state_array)
    v_loss = train_v_step(state_array, policy_array, fake_reward_array)
    v_loss_list.append(v_loss)
    # Update Reward function
    v1_array = v_func.predict(s1_array)
    r_loss = train_r_step(state_array, action_array, v1_array, reward_array)
    r_loss_list.append(r_loss)
    # Update Actor
    a_loss = train_a_step(state_array, action_array, fake_reward_array)
    a_loss_list.append(a_loss)
    #---------------------------------
    # Report
    elapsed_time = time.time() - st
    st = time.time()
    print(
        f'Batch: {b+1}, '
        f'Score: {score:.4f}, '
        f'A loss: {a_loss:.4f}, '
        f'V loss: {v_loss:.4f}, '
        f'R loss: {r_loss:.4f}, '
        f'elapsed time: {elapsed_time:.0f} secs'
    )

Model saved
Batch: 1, Score: -0.0300, A loss: -0.4013, V loss: 1.0456, R loss: 1.3520, elapsed time: 8 secs
Batch: 2, Score: -0.0300, A loss: -1.3729, V loss: 0.4028, R loss: 0.0343, elapsed time: 6 secs
Batch: 3, Score: -0.0300, A loss: -1.9334, V loss: 1.5397, R loss: 0.2351, elapsed time: 7 secs
Batch: 4, Score: -0.0300, A loss: -1.8029, V loss: 0.7824, R loss: 0.1066, elapsed time: 9 secs
Batch: 5, Score: -0.0300, A loss: -1.5782, V loss: 0.1764, R loss: 0.1637, elapsed time: 9 secs
Batch: 6, Score: -0.0300, A loss: -1.4721, V loss: 0.0368, R loss: 0.2029, elapsed time: 8 secs
Batch: 7, Score: -0.1169, A loss: -1.4571, V loss: 0.0321, R loss: 0.1778, elapsed time: 16 secs
Batch: 8, Score: -0.0300, A loss: -1.4904, V loss: 0.0126, R loss: 0.1672, elapsed time: 9 secs
Batch: 9, Score: -0.0300, A loss: -1.6564, V loss: 0.0116, R loss: 0.1336, elapsed time: 8 secs
Batch: 10, Score: -0.0300, A loss: -1.8937, V loss: 0.0353, R loss: 0.1038, elapsed time: 11 secs
Batch: 11, Score: -0.0300

KeyboardInterrupt: 

# Test

In [None]:
actor.load_weights(f'{folder_name}/actor.h5')
v_func.load_weights(f'{folder_name}/v_func.h5')
r_func.load_weights(f'{folder_name}/r_func.h5')

In [None]:
env.close()
st = time.time()
n_trial = 1000
n_step = 1000
for i in range(n_trial):
    # Initialize a New Env
    if env:
        env.close()
    env = gym.make(env_name)
    env = wrappers.Monitor(env, "./gym-results", force=True)
    state = env.reset()
    for _ in range(n_step):
        action = actor(np.array([state])).numpy()
        action = action[0]*2 - 1
        state, reward, done, info = env.step(action)
        if done: break