In [1]:
import numpy as np
import random
import os

from collections import deque
import gym
import othello

In [2]:
import tensorflow as tf

from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Activation, LeakyReLU, BatchNormalization, Flatten
from tensorflow.keras.optimizers import Adam

# GPU threading environment variables, set after `import tensorflow` but before
# any GPU is touched. NOTE(review): presumably these give the GPU its own pool
# of 4 threads -- confirm against the TensorFlow performance guide.
os.environ['TF_GPU_THREAD_MODE'] = 'gpu_private'
os.environ['TF_GPU_THREAD_COUNT'] = '4' # leftover Horovod variant: if not hvd_utils.is_using_hvd() else str(hvd.size())

from rl.agents.dqn import DQNAgent

2022-09-24 19:28:03.147380: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def set_gpu(gpu_ids_list):
    """Expose only the selected physical GPUs to TensorFlow and enable memory growth.

    Args:
        gpu_ids_list: indices into ``tf.config.list_physical_devices('GPU')``
            naming the GPUs that should stay visible.

    Does nothing when TensorFlow reports no GPU. A ``RuntimeError`` raised by
    TensorFlow while configuring the devices is printed, not re-raised.
    """
    physical = tf.config.list_physical_devices('GPU')
    if not physical:
        return

    try:
        selected = []
        for idx in gpu_ids_list:
            selected.append(physical[idx])
        tf.config.set_visible_devices(selected, 'GPU')
        for device in selected:
            tf.config.experimental.set_memory_growth(device, True)
        logical = tf.config.experimental.list_logical_devices('GPU')
        print(len(physical), "Physical GPUs,", len(logical), "Logical GPU")
    except RuntimeError as err:
        # Visible devices must be set before GPUs have been initialized
        print(err)


# Keep only the first physical GPU (index 0) visible to TensorFlow.
set_gpu([0])

1 Physical GPUs, 1 Logical GPU
Metal device set to: AMD Radeon Pro 555


2022-09-24 19:28:07.607142: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-24 19:28:07.607880: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-09-24 19:28:07.607936: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-24 19:28:07.608111: I tensorflow/core/common_runtime/pluggable_device/plug

In [4]:
# Environment id passed to gym.make; the `othello` import above is presumably
# what makes this id resolvable -- TODO confirm.
env_name = "othello:othello-v0"
env = gym.make(env_name)
# Display the 'state' entry of the observation space (a 64-element float32 Box
# according to the recorded output); its shape sizes the network input below.
env.observation_space['state']

Box(0.0, 64.0, (64,), float32)

In [5]:
# Number of discrete actions: the width of the Q-network's output layer.
nb_actions = env.action_space.n
# Shape tuple of one 'state' observation: used to build the network's input shape.
nb_observations = env.observation_space['state'].shape

In [12]:
# Q-network: maps one observation to one Q-value per action.
model = Sequential([
    # Input arrives as (1,) + observation shape; Flatten collapses it to a vector.
    Flatten(input_shape=(1,) + nb_observations),

    Dense(16),
    Activation("relu"),

    Dense(32),
    Activation("relu"),

    Dense(64),
    BatchNormalization(),
    LeakyReLU(),

    Dense(128),
    Activation("relu"),

    Dense(128),
    Activation("relu"),

    Dense(64),
    BatchNormalization(),
    LeakyReLU(),

    Dense(32),
    Activation("relu"),

    Dense(nb_actions),
    # Linear head: Q-values are unbounded regression outputs. A ReLU here would
    # clamp every negative Q-value to 0 and zero its gradient.
    Activation("linear"),
])

# summary() prints the table itself and returns None, so do not wrap it in print().
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 64)                0         
                                                                 
 dense_8 (Dense)             (None, 16)                1040      
                                                                 
 activation_6 (Activation)   (None, 16)                0         
                                                                 
 dense_9 (Dense)             (None, 32)                544       
                                                                 
 activation_7 (Activation)   (None, 32)                0         
                                                                 
 dense_10 (Dense)            (None, 64)                2112      
                                                                 
 batch_normalization_2 (Batc  (None, 64)              

In [18]:
# clone_model rebuilds the architecture with freshly initialised weights, so
# copy the online network's weights to start both networks in sync.
target_model = clone_model(model)
target_model.set_weights(model.get_weights())

In [19]:
# Training hyperparameters.
EPOCHS = 500  # iterations of the training loop; each one resets the env and plays one episode
BATCH_SIZE = 128  # transitions sampled from the replay buffer per replay() call
epsilon = 1.0  # initial exploration probability; mutated by the training loop
EPSILON_REDUCE = 0.995  # multiplicative epsilon decay applied after every episode
LEARNING_RATE = 0.001  # Adam learning rate used when compiling the model
GAMMA = 0.95  # discount factor applied to the bootstrapped next-state value

In [20]:
def epsilon_greedy_action_selection(model, epsilon, observation):
    """Choose an action with an epsilon-greedy policy.

    Args:
        model: Q-network exposing ``predict(batch, verbose=0)`` and
            ``output_shape``; its last output dimension is the action count.
        epsilon: probability of taking a uniformly random action.
        observation: one observation without the batch axis.

    Returns:
        int: index of the selected action.

    NOTE(review): neither branch filters out illegal moves. The recorded run of
    this notebook ends with ``AssertionError: Invalid Next Action``, presumably
    raised by the environment for such a move -- confirm how the env exposes
    the legal moves and mask the choice accordingly.
    """
    if np.random.random() > epsilon:
        # Exploit: add the batch axis and take the action with the largest Q-value.
        batch = np.expand_dims(observation, axis=0)
        q_values = model.predict(batch, verbose=0)
        return int(np.argmax(q_values))

    # Explore: sample uniformly over the network's outputs instead of reaching
    # for a global `env`, so the function only depends on its arguments.
    return int(np.random.randint(0, model.output_shape[-1]))

In [21]:
# Experience replay memory: a bounded deque, so the oldest transitions are
# dropped once 20000 entries are stored.
replay_buffer = deque(maxlen=20000)
# Period, in training-loop epochs, between target-network weight syncs.
update_target_model = 10

In [22]:
def replay(replay_buffer, batch_size, model, target_model, gamma=None):
    """Run one DQN training step on a random minibatch from the replay buffer.

    Args:
        replay_buffer: sequence of ``(state, action, reward, new_state, done)``
            tuples.
        batch_size: number of transitions to sample; nothing happens while the
            buffer holds fewer than this.
        model: online Q-network (``predict`` / ``fit``); this is the one trained.
        target_model: target Q-network (``predict``) used only for
            the bootstrapped next-state value.
        gamma: discount factor; defaults to the module-level ``GAMMA``.

    Returns:
        None. ``model`` is fitted in place.
    """
    if len(replay_buffer) < batch_size:
        return

    if gamma is None:
        gamma = GAMMA

    samples = random.sample(replay_buffer, batch_size)
    states, actions, rewards, new_states, dones = zip(*samples)

    states = np.array(states)
    new_states = np.array(new_states)
    actions = np.asarray(actions, dtype=np.intp)
    rewards = np.asarray(rewards, dtype=np.float32)
    dones = np.asarray(dones, dtype=bool)

    # Base targets are the online network's own predictions, so every action
    # that was not taken contributes zero error.
    predicted = np.asarray(model.predict(states, verbose=0))
    # The bootstrap value comes from the target network.
    next_q = np.asarray(target_model.predict(new_states, verbose=0))

    # Work on (batch, n_actions) views so this handles both (batch, n_actions)
    # and (batch, 1, n_actions) network outputs.
    flat_targets = predicted.reshape(batch_size, -1).copy()
    next_max = next_q.reshape(batch_size, -1).max(axis=1)

    # Terminal transitions get the raw reward; the rest add the discounted
    # best next-state value.
    updated = np.where(dones, rewards, rewards + gamma * next_max)
    flat_targets[np.arange(batch_size), actions] = updated

    model.fit(states, flat_targets.reshape(predicted.shape), epochs=5, verbose=0)

In [23]:
def update_model_handler(epoch, update_target_mode, model, target_model):
    """Copy the online network's weights into the target network on a fixed period.

    Args:
        epoch: zero-based index of the current training epoch.
        update_target_mode: sync period in epochs. The parameter name is kept
            as-is for backward compatibility with existing callers.
        model: online network whose weights are read.
        target_model: target network whose weights are overwritten.

    No sync happens at epoch 0; after that it happens whenever ``epoch`` is a
    multiple of ``update_target_mode``.
    """
    # Use the argument rather than a same-looking module-level global.
    if epoch > 0 and epoch % update_target_mode == 0:
        target_model.set_weights(model.get_weights())

In [24]:
# Q-learning fits the network to real-valued bootstrapped return targets, so it
# needs a regression loss. Categorical cross-entropy expects probability
# distributions and is not meaningful for Q-values.
model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer=Adam(learning_rate=LEARNING_RATE))

In [26]:
# Longest episode (in environment steps) seen so far.
best_so_far = 0

for epoch in range(EPOCHS):

    observation, info = env.reset()
    # The network expects each observation with a leading axis of size 1.
    observation = observation["state"].reshape((1,) + nb_observations)
    done = False

    # `points` counts the steps taken in this episode, not the reward collected.
    points = 0
    while not done:
        # NOTE(review): the recorded run aborts with "AssertionError: Invalid
        # Next Action", presumably raised by env.step for an illegal move.
        # Action selection does not mask illegal moves -- confirm how the env
        # exposes the legal moves.
        action = epsilon_greedy_action_selection(model, epsilon, observation)
        next_observation, reward, terminated, truncated, info = env.step(action)
        next_observation = next_observation["state"].reshape((1,) + nb_observations)
        # Store only true termination: a truncated episode still has a next-state
        # value to bootstrap from.
        replay_buffer.append((observation, action, reward, next_observation, terminated))
        # End the episode on truncation as well, otherwise a time-limited episode
        # would keep stepping a finished environment.
        done = terminated or truncated

        observation = next_observation
        points += 1

        replay(replay_buffer, BATCH_SIZE, model, target_model)

    # Decay exploration once per episode.
    epsilon *= EPSILON_REDUCE

    update_model_handler(epoch, update_target_model, model, target_model)

    if points > best_so_far:
        best_so_far = points

    if epoch%25 == 0:
        print(f"{epoch}: POINTS: {points} eps: {epsilon} BSF: {best_so_far}")

AssertionError: Invalid Next Action