In [1]:
import numpy as np
import random
import os
import copy

from collections import deque
import gym

In [2]:
import tensorflow as tf

from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Activation, LeakyReLU, BatchNormalization, Flatten
from tensorflow.keras.optimizers import Adam

# GPU host-thread settings passed to TensorFlow through environment variables.
# NOTE(review): these are assigned after `import tensorflow`; presumably TensorFlow reads
# them when the GPU device is first initialised rather than at import time — confirm.
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '4' #if not hvd_utils.is_using_hvd() else str(hvd.size())

from rl.agents.dqn import DQNAgent

2022-10-02 16:22:01.424338: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def set_gpu(gpu_ids_list):
    """Make only the selected physical GPUs visible to TensorFlow and enable memory growth on them.

    Args:
        gpu_ids_list: indices into ``tf.config.list_physical_devices('GPU')`` naming
            the GPUs that should remain visible.

    Returns:
        None. Does nothing when no GPU is reported. A ``RuntimeError`` raised while
        configuring the devices is printed rather than propagated.
    """
    physical_gpus = tf.config.list_physical_devices('GPU')
    if not physical_gpus:
        return

    try:
        selected = []
        for gpu_id in gpu_ids_list:
            selected.append(physical_gpus[gpu_id])
        tf.config.set_visible_devices(selected, 'GPU')
        for device in selected:
            tf.config.experimental.set_memory_growth(device, True)
        logical_count = len(tf.config.experimental.list_logical_devices('GPU'))
        print(len(physical_gpus), "Physical GPUs,", logical_count, "Logical GPU")
    except RuntimeError as err:
        # Visible devices must be set before GPUs have been initialized
        print(err)


set_gpu([0])

Metal device set to: AMD Radeon Pro 555
1 Physical GPUs, 1 Logical GPU


2022-10-02 16:22:15.827054: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-02 16:22:15.827668: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-10-02 16:22:15.827730: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-02 16:22:15.827898: I tensorflow/core/common_runtime/pluggable_device/plug

In [4]:
# Create the Othello environment used for training.
# NOTE(review): render_mode="human" presumably renders every step, which would slow
# training considerably — confirm whether rendering is wanted here.
env_name = "othello:othello-v0"
env = gym.make(env_name, render_mode="human")
# Display the "state" entry of the (dict-like) observation space as the cell output.
env.observation_space['state']

Box(0.0, 64.0, (64,), float32)

In [5]:
# Length of the flat "state" vector and number of discrete actions
# (both are 64 for this environment, as the printed output shows).
num_observations = env.observation_space['state'].shape[0]
num_actions = env.action_space.n
print(num_observations, num_actions)

64 64


In [6]:
# Online Q-network: maps a (1, num_observations) observation to one Q-value per action.
# The leading axis of size 1 is carried through every layer, so predictions have shape
# (batch, 1, num_actions); the replay and action-selection code index them accordingly.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, input_shape=(1, num_observations), activation="relu"),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(64),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dense(64),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.LeakyReLU(),
    tf.keras.layers.Dense(32, activation="relu"),
    # Linear head: Q-values are unbounded regression outputs.
    tf.keras.layers.Dense(num_actions, activation=tf.keras.activations.linear)
])
# summary() prints the table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 1, 32)             2080      
                                                                 
 dense_1 (Dense)             (None, 1, 32)             1056      
                                                                 
 dense_2 (Dense)             (None, 1, 64)             2112      
                                                                 
 batch_normalization (BatchN  (None, 1, 64)            256       
 ormalization)                                                   
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 1, 64)             0         
                                                                 
 dense_3 (Dense)             (None, 1, 128)       

In [7]:
# Target network with the same architecture as the online network.
# clone_model() instantiates new layers with freshly initialised weights, so the
# online weights are copied explicitly; otherwise the two networks start out unrelated.
target_model = clone_model(model)
target_model.set_weights(model.get_weights())

In [8]:
# Training hyper-parameters.
EPOCHS = 500  # number of training iterations; the training loop plays one full game per iteration
BATCH_SIZE = 128  # transitions sampled from the replay buffer on each replay() call
epsilon = 1.0  # current exploration probability; the training loop decays it in place
EPSILON_REDUCE = 0.995  # multiplicative epsilon decay applied once per iteration
LEARNING_RATE = 0.001  # intended optimizer step size; only takes effect if passed to the optimizer explicitly
GAMMA = 0.95  # discount factor applied to the bootstrapped next-state value in replay()

In [9]:
def epsilon_greedy_action_selection(model, epsilon, observation, possible_actions, board_size=8):
    """Choose a legal move with an epsilon-greedy policy.

    With probability ``epsilon`` a legal move is drawn uniformly at random; otherwise
    the legal move with the highest predicted Q-value is chosen. Illegal cells are
    masked out before the argmax, so the greedy branch never prefers an illegal move
    over a legal one.

    All post-processing is done in NumPy: no TensorFlow ops are created per call,
    which matters when TensorFlow runs in graph mode (every new op would otherwise
    be added to the graph on each step).

    Args:
        model: object exposing ``predict(batch, verbose=0)`` that returns one value
            per board cell for the single observation in the batch.
        epsilon: exploration probability; the greedy branch is taken when a uniform
            draw from ``np.random.random()`` is strictly greater than ``epsilon``.
        observation: current state; a batch axis is prepended before prediction.
        possible_actions: iterable of ``(row, col)`` legal moves.
        board_size: side length of the square board (default 8, i.e. 64 cells).

    Returns:
        Flat action index ``row * board_size + col``.

    Raises:
        IndexError: in the exploration branch when ``possible_actions`` is empty
            (raised by ``random.choice``). In the greedy branch an empty
            ``possible_actions`` yields index 0, because every cell is masked equally.
    """
    if np.random.random() > epsilon:
        num_cells = board_size * board_size

        # True marks an illegal cell; legal moves are switched off one by one.
        illegal = np.ones(num_cells, dtype=bool)
        for row, col in possible_actions:
            illegal[row * board_size + col] = False

        batch = np.expand_dims(np.asarray(observation), axis=0)
        q_values = np.asarray(model.predict(batch, verbose=0)).reshape(-1, num_cells)[0]

        # Argmax over the masked Q-values directly: a softmax would not change the winner.
        masked_q = np.where(illegal, -np.inf, q_values)
        return int(np.argmax(masked_q))

    row, col = random.choice(list(possible_actions))
    return (row * board_size) + col

In [10]:
# Experience replay memory: keeps at most 20000 transitions; once full, the deque
# discards the oldest entry for every new one appended.
replay_buffer = deque(maxlen=20000)
# Interval (in training-loop iterations) between target-network weight syncs;
# consumed by update_model_handler().
update_target_model = 5

In [11]:
def replay(replay_buffer, batch_size, model, target_model, gamma=None):
    """Run one DQN training step on a random minibatch from the replay buffer.

    For every sampled transition ``(state, action, reward, new_state, done)`` the
    regression target of the taken action is set to
    ``reward`` if ``done`` else ``reward + gamma * max_a Q_target(new_state, a)``.
    The targets of all other actions are the online network's own current
    predictions, so they contribute no error.

    The online network (``model``) supplies the baseline predictions for the
    current states and the target network (``target_model``) supplies the
    bootstrapped next-state values. Using them the other way round would pull
    the untouched actions towards the stale target network and bootstrap from
    the network being trained.

    NOTE: the next-state maximum is taken over all actions; legality of moves in
    ``new_state`` is not stored in the buffer and therefore cannot be masked here.

    Args:
        replay_buffer: sequence of transitions as described above; states are
            arrays whose predictions are indexed as ``[sample][0][action]``.
        batch_size: number of transitions to sample.
        model: online network exposing ``predict`` and ``fit``.
        target_model: target network exposing ``predict``.
        gamma: discount factor; when ``None`` the module-level ``GAMMA`` is used.

    Returns:
        The object returned by ``model.fit`` (a Keras ``History`` for Keras models),
        or ``None`` when the buffer holds fewer than ``batch_size`` transitions.
    """
    if len(replay_buffer) < batch_size:
        return None
    if gamma is None:
        gamma = GAMMA

    samples = random.sample(replay_buffer, batch_size)
    states, actions, rewards, new_states, dones = zip(*samples)
    states = np.array(states)
    new_states = np.array(new_states)

    # Baseline targets: the online network's current output (copied so it can be edited).
    target_batch = np.array(model.predict(states, verbose=0))
    # Bootstrapped value of the best next action, from the frozen target network.
    next_q = np.asarray(target_model.predict(new_states, verbose=0))
    max_next_q = next_q[:, 0].max(axis=-1)

    # Terminal transitions get no bootstrap term.
    bootstrap = np.where(np.asarray(dones, dtype=bool), 0.0, gamma * max_next_q)
    sample_idx = np.arange(batch_size)
    action_idx = np.asarray(actions, dtype=np.intp)
    target_batch[sample_idx, 0, action_idx] = np.asarray(rewards, dtype=np.float64) + bootstrap

    return model.fit(states, target_batch, epochs=3, verbose=0)

In [12]:
def update_model_handler(epoch, update_target_model, model, target_model):
    """Sync the target network with the online network on a fixed schedule.

    The weights of ``model`` are copied into ``target_model`` whenever ``epoch`` is
    a positive multiple of ``update_target_model``; a notice is printed on each sync.
    Epoch 0 (or any non-positive epoch) never triggers a sync.
    """
    if not epoch > 0:
        return
    if epoch % update_target_model != 0:
        return

    online_weights = model.get_weights()
    target_model.set_weights(online_weights)
    print('update target_model')

In [13]:
# Compile the online Q-network for mean-squared-error regression on Q-value targets.
# The optimizer is constructed explicitly so that the LEARNING_RATE constant is
# actually applied; the string shortcut 'adam' always uses the Keras default step size.
# NOTE: 'accuracy' is not a meaningful metric for regression targets; it is kept
# only because the training loop prints hist.history['accuracy'].
model.compile(optimizer=Adam(learning_rate=LEARNING_RATE),
              loss=tf.keras.losses.MeanSquaredError(),
              metrics=['accuracy'])

In [None]:
# done = False

# observation, info = env.reset()
# observation = observation["state"].reshape(1,64)
# next_possible_actions = info["next_possible_actions"]

# action =  epsilon_greedy_action_selection(model, 0.0, observation, next_possible_actions)
# print(action)

# next_observation, reward, done, truncated, info = env.step(action)
# next_observation = next_observation["state"].reshape((1,64))
# next_observation

In [None]:
# for epoch in range(EPOCHS):
#     replay_buffer.append((observation, action, reward, next_observation, done))

# observation = next_observation

In [None]:
# history = replay(replay_buffer, BATCH_SIZE, model, target_model)

In [None]:
# update_model_handler(1, update_target_model, model, target_model)

In [None]:
# Main training loop: each iteration plays one full game with the current
# epsilon-greedy policy, then performs one replay() training step.
best_so_far = 0

for epoch in range(EPOCHS):

    observation, info = env.reset()
    observation = observation["state"].reshape((1,64))
    next_possible_actions = info["next_possible_actions"]

    done = False
    truncated = False

    points = 0
    # env.step() reports natural termination (`done`) and truncation separately.
    # The episode must end on either one; looping on `done` alone would keep
    # stepping an environment that has already been cut off.
    while not (done or truncated):
        action = epsilon_greedy_action_selection(model, epsilon, observation, next_possible_actions)

        next_observation, reward, done, truncated, info = env.step(action)
        next_observation = next_observation["state"].reshape((1,64))
        next_possible_actions = info["next_possible_actions"]

        # Only true termination is stored as `done`, so a truncated transition
        # still bootstraps from its next state during replay.
        replay_buffer.append((observation, action, reward, next_observation, done))

        observation = copy.deepcopy(next_observation)
        points += reward

    hist = replay(replay_buffer, BATCH_SIZE, model, target_model)
    if hist is not None:
        print("Loss: ", hist.history['loss'], "Accuracy:", hist.history['accuracy'],)

    epsilon *= EPSILON_REDUCE # eps * 0.995

    update_model_handler(epoch, update_target_model, model, target_model)

    if points > best_so_far:
        best_so_far = points

    if epoch%25 == 0:
        print(f"{epoch}: POINTS: {points} eps: {epsilon} BSF: {best_so_far}")