In [None]:
from collections import Counter             # sys.version = 3.9.6
import datetime as dt
from IPython.display import clear_output    # 8.18.1
import matplotlib.pyplot as plt             # 3.8.3
import numpy as np                          # 1.26.4
import random
import seaborn as sns                       # 0.13.2
import simpleaudio as sa                    # 1.0.4                                      
import tensorflow as tf                     # 2.16.1
from tensorflow import keras                # 2.16.1

# Hide every GPU from TensorFlow (the visible-GPU list is set to empty).
tf.config.set_visible_devices([], 'GPU')

# Seed all three random number generators used in this notebook
# (NumPy, TensorFlow and the standard-library `random` module).
seed = 111
np.random.seed(seed)
tf.random.set_seed(seed)
random.seed(seed)

# Playground functions

In [None]:
# Functions independent of the model

def start_round():
    """Deal a fresh round.

    Builds the 28-tile double-six set, shuffles it with the `random` module and
    deals the first seven tiles to the player and the next seven to the opponent.

    Returns:
        tuple: (U, P, O, T) - undrawn tiles, player hand, opponent hand and the
        (empty) table chain.
    """
    tiles = []
    for high in range(7):
        for low in range(high + 1):
            tiles.append([high, low])
    random.shuffle(tiles)
    P, O, U = tiles[:7], tiles[7:14], tiles[14:]
    T = []
    return U, P, O, T

def print_board(U, O, T, P, P_points, O_points):
    """Print the current score followed by the undrawn tiles, opponent hand, table and player hand."""
    lines = [
        f'Player points: {P_points} - Opponent points: {O_points}',
        f'U: {U}',
        f'O: {O}',
        f'T: {T}',
        f'P: {P}',
    ]
    for line in lines:
        print(line)

def movable_dominos(X, T):
    """Find which tiles of hand X fit the left and the right end of chain T.

    Returns:
        tuple: (movable_to_left, movable_to_right). When the table is empty every
        tile fits, and X itself is returned for both sides.
    """
    if len(T) == 0:
        return X, X
    left_end, right_end = T[0][0], T[-1][1]
    fits_left = [tile for tile in X if left_end in tile]
    fits_right = [tile for tile in X if right_end in tile]
    return fits_left, fits_right

def check_can_move(X, T):
    """Return True when hand X holds at least one tile that fits either end of chain T."""
    fits_left, fits_right = movable_dominos(X, T)
    return len(fits_left) > 0 or len(fits_right) > 0

def round_to_5(number):
    """Round `number` to the nearest multiple of 5 (using Python's built-in round())."""
    fives = round(number / 5)
    return fives * 5

def calculate_points(T):
    """Score the chain T right after a tile has been added to it.

    With a single tile on the table the sum of its two halves is scored. With a
    longer chain the two open ends are added, where a double lying on an end
    counts twice. The sum only scores when it is a multiple of 5; otherwise
    (and for an empty table) the result is 0.
    """
    def end_value(tile, open_pip):
        # A double at the end of the chain exposes both of its halves.
        return 2 * open_pip if tile[0] == tile[1] else open_pip

    if len(T) == 0:
        return 0
    if len(T) == 1:
        total = T[0][0] + T[0][1]
    else:
        total = end_value(T[0], T[0][0]) + end_value(T[-1], T[-1][1])
    return total if round_to_5(total) == total else 0

def remove_domino(X, domino):
    """Return a new list with every tile equal to `domino` left out (X itself is not modified)."""
    kept = []
    for tile in X:
        if tile != domino:
            kept.append(tile)
    return kept

def add_domino(T, domino, position):
    """Attach `domino` to the chain T in place and return T.

    `position` is 'L' (left end) or 'R' (right end). The tile is flipped when
    that is needed to make the touching halves match. On an empty table the tile
    is placed as given. If the tile does not fit the requested end, T is
    returned unchanged.
    """
    if len(T) == 0:
        T.append(domino)
        return T

    flipped = [domino[1], domino[0]]
    if position == 'L':
        open_pip = T[0][0]
        if open_pip == domino[1]:
            T.insert(0, domino)
        elif open_pip == domino[0]:
            T.insert(0, flipped)
    elif position == 'R':
        open_pip = T[-1][1]
        if open_pip == domino[0]:
            T.append(domino)
        elif open_pip == domino[1]:
            T.append(flipped)
    return T

def move_domino(X, T, domino, position):
    """Move `domino` from hand X onto chain T and score the resulting chain.

    Returns:
        tuple: (hand without the tile, updated chain, points scored by the move).

    Raises:
        ValueError: if `domino` is not in X.
    """
    if domino not in X:
        raise ValueError(f"Domino {domino} can't be moved")
    remaining = remove_domino(X, domino)
    chain = add_domino(T, domino, position)
    return remaining, chain, calculate_points(chain)

def draw_dominos(U, X, T):
    """Draw tiles from U into hand X while the hand cannot be played.

    If X already holds a playable tile nothing happens and the flag is False.
    Otherwise tiles are taken from the front of U one by one until a drawn tile
    matches one of the open ends or U is empty. The returned flag is then True,
    even if U was empty and no tile could actually be drawn.

    Returns:
        tuple: (U, X, drawn).
    """
    if check_can_move(X, T):
        return U, X, False

    left_value, right_value = T[0][0], T[-1][1]
    while len(U) > 0:
        tile = U.pop(0)
        X.append(tile)
        if left_value in tile or right_value in tile:
            break
    return U, X, True

def make_move(U, P1, P2, T, points_P1, points_P2, domino = None, position = None):
    """Execute one turn of the moving player P1 and update the scores.

    If `domino` is given it is moved from P1 onto the table; emptying the hand
    wins the round and adds the opponent's remaining pips (rounded to 5).
    If `domino` is None, P1 passes; when P2 cannot move either, the round ends and
    the side with fewer pips receives the pip difference (rounded to 5).
    A running total of 100 or more ends the game.

    Returns:
        tuple: (U, P1, T, points_for_P1, points_for_P2, points_P1, points_P2,
        round_winner, game_winner) where the winners are 'P1', 'P2' or None.
    """
    def pip_total(hand):
        return sum(sum(tile) for tile in hand)

    round_winner, game_winner = None, None
    points_for_P1, points_for_P2 = 0, 0

    if domino is None:
        # P1 passes; the round is blocked when P2 has no legal move either.
        if not check_can_move(P2, T):
            pips_P1, pips_P2 = pip_total(P1), pip_total(P2)
            points_for_P1 = round_to_5(max(0, pips_P2 - pips_P1))
            points_for_P2 = round_to_5(max(0, pips_P1 - pips_P2))
            round_winner = 'P1' if points_for_P1 > points_for_P2 else 'P2'
    else:
        P1, T, points_for_P1 = move_domino(P1, T, domino, position)
        if len(P1) == 0:
            points_for_P1 += round_to_5(pip_total(P2))
            round_winner = 'P1'

    points_P1 += points_for_P1
    points_P2 += points_for_P2

    if points_P1 >= 100:
        game_winner = 'P1'
    elif points_P2 >= 100:
        game_winner = 'P2'

    return U, P1, T, points_for_P1, points_for_P2, points_P1, points_P2, round_winner, game_winner

def choose_domino_at_random(U, X, T):
    """Benchmark player: draw if necessary, then play a random legal tile.

    One random candidate is picked for each end that has a playable tile; if both
    ends have one, the end is chosen at random as well.

    Returns:
        tuple: (U, X, domino, position, grads, drawn). `domino` and `position`
        are None when no move is possible; `grads` is always 0.
    """
    U, X, drawn = draw_dominos(U, X, T)
    grads = 0
    fits_left, fits_right = movable_dominos(X, T)

    # Keep the order of the random calls: left candidate, right candidate, then side.
    candidates = {}
    if len(fits_left) > 0:
        candidates['L'] = random.choice(fits_left)
    if len(fits_right) > 0:
        candidates['R'] = random.choice(fits_right)

    if not candidates:
        return U, X, None, None, grads, drawn
    if len(candidates) == 2:
        position = random.choice(['L', 'R'])
    else:
        position = next(iter(candidates))
    return U, X, candidates[position], position, grads, drawn
    
def table_to_observation(amt_O, T, P, history):
    """Placeholder for building the model input; does nothing and returns None.

    The function is meant to be redefined later in the notebook.
    """
    return None

def legal_actions(T, P):
    """Build the legal-action mask for hand P on chain T.

    The mask has 56 entries: index k (0-27) means "play tile k on the left end",
    index 28 + k means "play tile k on the right end", where k is the position
    of the tile in the ordered list [[0, 0], [1, 0], [1, 1], ..., [6, 6]].

    Returns:
        numpy.ndarray: integer array of shape (1, 56) with 1 for legal actions.
    """
    tile_order = [[i, j] for i in range(7) for j in range(i+1)]
    fits_left, fits_right = movable_dominos(P, T)
    mask = np.zeros(28 * 2, dtype=int)
    for offset, playable in ((0, fits_left), (28, fits_right)):
        for tile in playable:
            mask[tile_order.index(tile) + offset] = 1
    return mask.reshape((1, 56))

def translate_action_to_move(action):
    """Translate a model action index into a (domino, position) move.

    Indices below 28 address the left end; other indices address the right end
    after subtracting 28. The remaining index selects the tile from the ordered
    list [[0, 0], [1, 0], [1, 1], ..., [6, 6]].
    """
    position = 'L' if action < 28 else 'R'
    tile_index = action if position == 'L' else action - 28
    tile_order = [[high, low] for high in range(7) for low in range(high + 1)]
    return tile_order[tile_index], position
    
def choose_domino_using_model(len_O, U, P, T, history, model, calculate_grads):
    """Choose a move for hand P by sampling from the model's action probabilities.

    The hand first draws from U if it cannot move. The observation and the
    legal-action mask are then fed to `model` and an action is sampled from the
    returned probability vector.

    Args:
        len_O: forwarded unchanged to `table_to_observation`.
        U, P, T: undrawn tiles, the hand to move and the table chain.
        history: forwarded unchanged to `table_to_observation`.
        model: callable Keras model taking [observation, legal_actions_mask].
        calculate_grads: when True the forward pass runs in training mode under a
            GradientTape and the gradients of the loss for the sampled action are
            returned. When False the model is only evaluated.

    Returns:
        tuple: (U, P, domino, position, grads, drawn). `domino`/`position` are
        None when no legal move exists; `grads` is 0 unless gradients were computed.

    Raises:
        ValueError: if the sampled action refers to a tile that is not in P.
    """
    U, P, drawn = draw_dominos(U, P, T)
    grads = 0
    observation = table_to_observation(len_O, T, P, history)
    legal_actions_mask = legal_actions(T, P)

    if sum(legal_actions_mask[0]) == 0:
        return U, P, None, None, grads, drawn

    if calculate_grads:

        with tf.GradientTape() as tape:
            predictions = model([observation, legal_actions_mask], training=True)
            probabilities = predictions[0]
            action = np.random.choice(len(probabilities), p=probabilities.numpy())
            # One-hot target for the sampled action: the loss gradient then points
            # towards making this action more likely.
            target = np.zeros(model.output_shape[1]); target[action] = 1
            y_target = tf.convert_to_tensor(target, dtype=tf.float32)
            loss = tf.reduce_mean(model.loss(y_target, probabilities))

        grads = tape.gradient(loss, model.trainable_variables) # Computing gradients

    else:

        # Pure evaluation: run in inference mode so layers that behave differently
        # during training (e.g. Dropout) do not add noise to the policy.
        predictions = model([observation, legal_actions_mask], training=False)
        probabilities = predictions[0]
        action = np.random.choice(len(probabilities), p=probabilities.numpy())

    domino, position = translate_action_to_move(action)

    if domino not in P:
        print(predictions[0])
        raise ValueError(f"Model chose illegal action: moving {domino} {position}")

    return U, P, domino, position, grads, drawn

def calc_rewards(whose_move, O_dominos, T_dominos, P_dominos, points_earned, had_to_draw, round_winners, game_winners, 
                 all_grads):
    """Placeholder reward function; ignores its arguments and returns four Nones.

    The function is meant to be redefined later in the notebook.

    Returns:
        tuple: (P_grads, O_grads, P_rewards, O_rewards), all None.
    """
    return None, None, None, None

def play_one_game(P_model, O_model, print_rounds = False, calculate_grads = True):
    """Play a complete game between player 'P' and opponent 'O'.

    Each side is either the string 'random' (random benchmark player) or a model
    that is passed on to `choose_domino_using_model`. Rounds are played until one
    side wins the game or 1000 moves have been made in total.

    Args:
        P_model: 'random' or the model playing as 'P'.
        O_model: 'random' or the model playing as 'O'.
        print_rounds: print the board after every move when True.
        calculate_grads: forwarded to `choose_domino_using_model`.

    Returns:
        tuple: (game_winner, P_grads, O_grads, P_rewards, O_rewards) where
        `game_winner` is 'P', 'O' or None and the remaining items come from
        `calc_rewards` applied to the recorded move history.
    """
    starting_player, round_winner, game_winner = random.choice(['P', 'O']), None, None
    P_points, O_points, i = 0, 0, 0
    # Per-move history of the whole game; one entry is appended after every move.
    points_earned, round_winners, game_winners, whose_move, had_to_draw = [], [], [], [], []
    P_dominos, O_dominos, T_dominos, all_grads = [], [], [], []

    while game_winner is None and i < 1000:
        round_winner = None
        U, P, O, T = start_round()
        if print_rounds:
            print_board(U, O, T, P, P_points, O_points)

        while round_winner is None and game_winner is None and i < 1000:
            if starting_player == 'P':
                i += 1
                if P_model == 'random':
                    U, P, domino, position, grads_P, drew_domino = choose_domino_at_random(U, P, T)
                else:
                    # Only the size of the opponent's hand is passed on, never its content.
                    U, P, domino, position, grads_P, drew_domino = \
                        choose_domino_using_model(len(O), U, P, T, (round_winners, whose_move, had_to_draw, T_dominos, 'P'),
                                                  P_model, calculate_grads)
                U, P, T, points_for_P, points_for_O, P_points, O_points, round_winner, game_winner = \
                    make_move(U, P, O, T, P_points, O_points, domino, position)
                if print_rounds:
                    print(f'Move by P, move {i}')
                    print_board(U, O, T, P, P_points, O_points)
                # make_move reports winners relative to the mover ('P1' is the mover).
                if round_winner is not None:
                    round_winner = 'P' if round_winner == 'P1' else 'O'
                    starting_player = round_winner
                if game_winner is not None:
                    game_winner = 'P' if game_winner == 'P1' else 'O'

                whose_move.append('P')
                O_dominos.append(O.copy())
                T_dominos.append(T.copy())
                P_dominos.append(P.copy())
                points_earned.append(points_for_P - points_for_O)
                had_to_draw.append(drew_domino)
                round_winners.append(round_winner)
                game_winners.append(game_winner)
                all_grads.append(grads_P)

            if round_winner is None and game_winner is None:
                # After the opponent's turn it is the player's turn again.
                starting_player = 'P'
                i += 1
                if O_model == 'random':
                    U, O, domino, position, grads_O, drew_domino = choose_domino_at_random(U, O, T)
                else:
                    U, O, domino, position, grads_O, drew_domino = \
                        choose_domino_using_model(len(P), U, O, T, (round_winners, whose_move, had_to_draw, T_dominos, 'O'),
                                                  O_model, calculate_grads)
                U, O, T, points_for_O, points_for_P, O_points, P_points, round_winner, game_winner = \
                    make_move(U, O, P, T, O_points, P_points, domino, position)
                if print_rounds:
                    print(f'Move by O, move {i}')
                    print_board(U, O, T, P, P_points, O_points)
                if round_winner is not None:
                    round_winner = 'O' if round_winner == 'P1' else 'P'
                    starting_player = round_winner
                if game_winner is not None:
                    game_winner = 'O' if game_winner == 'P1' else 'P'

                whose_move.append('O')
                O_dominos.append(O.copy())
                T_dominos.append(T.copy())
                P_dominos.append(P.copy())
                points_earned.append(points_for_O - points_for_P)
                had_to_draw.append(drew_domino)
                round_winners.append(round_winner)
                game_winners.append(game_winner)
                all_grads.append(grads_O)

        if print_rounds:
            print(f'Round winner: {round_winner}')
            if game_winner is not None:
                print(f'Game winner: {game_winner}')

    # The rewards are computed once over the complete history; only the result for
    # the full game is returned, so there is no need to recompute it after each round.
    P_grads, O_grads, P_rewards, O_rewards = calc_rewards(whose_move, O_dominos, T_dominos,
                                                          P_dominos, points_earned, had_to_draw,
                                                          round_winners, game_winners, all_grads)
    # return whose_move, O_dominos, T_dominos, P_dominos, points_earned, had_to_draw, round_winners, game_winners, all_grads # To use to design new reward functions
    return game_winner, P_grads, O_grads, P_rewards, O_rewards

def check_model_vs_model_distribution(P_model, O_model, stats_amount, games_amt, 
                                      print_rounds = False, calculate_grads = False):
    """Visually compare two players over repeated batches of games.

    Plays `stats_amount` batches of `games_amt` games each, records the share of
    games won by 'P' in every batch, prints the mean and standard deviation of
    these shares and plots their histogram.
    """
    P_win_percentage = []

    for _ in range(stats_amount):
        winners = []
        for _ in range(games_amt):
            outcome = play_one_game(P_model, O_model, print_rounds, calculate_grads)
            winners.append(outcome[0])
        P_win_percentage.append(Counter(winners)['P'] / games_amt)

    print(f'P_win_percentage mean: {round(100*np.mean(P_win_percentage), 1)}%, std: {round(100*np.std(P_win_percentage), 1)}%')

    axes = sns.histplot(P_win_percentage, kde=True, bins=10, color='blue')
    axes.set_title('Distribution of P_win_percentage')

# Functions for model training

def play_sound():
    """Play a 440 Hz sine tone and block until playback has finished (signals the end of calibration)."""
    sample_rate = 44100
    timeline = np.linspace(0, 1, sample_rate)
    tone = 0.5 * np.sin(2 * np.pi * 440 * timeline) * (2**15 - 1)
    playback = sa.play_buffer(tone.astype(np.int16), 1, 2, sample_rate)
    playback.wait_done()

def discount_rewards(rewards, discount_factor):
    """Discount rewards backwards over the whole sequence.

    Every entry becomes its own reward plus `discount_factor` times the
    (already discounted) entry that follows it.

    Args:
        rewards: sequence of numeric rewards; it is copied, not modified.
        discount_factor: multiplier applied to the following step's value.

    Returns:
        numpy.ndarray: float array with the discounted rewards.
    """
    # Force a float array: with integer input the in-place additions below would
    # otherwise be truncated back to integers.
    rewards = np.array(rewards, dtype=float)
    for step in range(len(rewards) - 2, -1, -1):
        rewards[step] += rewards[step + 1] * discount_factor
    return rewards

def discount_rewards_per_round(rewards, round_winners, game_winners, discount_factor):
    """Discount rewards backwards, but only within a round.

    A step receives `discount_factor` times the following step's value only when
    neither a round winner nor a game winner is recorded for that step, so the
    last step of a round never receives rewards from the next round.

    Args:
        rewards: sequence of numeric rewards; it is copied, not modified.
        round_winners: per-step round winner (None while the round is running).
        game_winners: per-step game winner (None while the game is running).
        discount_factor: multiplier applied to the following step's value.

    Returns:
        numpy.ndarray: float array with the discounted rewards.
    """
    # Force a float array: with integer input the in-place additions below would
    # otherwise be truncated back to integers.
    rewards = np.array(rewards, dtype=float)
    for step in range(len(rewards) - 2, -1, -1):
        if round_winners[step] is None and game_winners[step] is None:
            rewards[step] += rewards[step + 1] * discount_factor
    return rewards

def normalize_rewards(rewards):
    """Standardize the rewards: subtract their mean and divide by their standard deviation."""
    center = np.mean(rewards)
    spread = np.std(rewards)
    return (rewards - center) / spread

def weigh_grads(gradients, rewards):
    """Weigh the gradients of every step with the reward of that step.

    Steps whose gradient entry equals 0 (no gradients were computed) are skipped.

    Returns:
        list or None: one list of weighted gradients per remaining step, or None
        if any weighted gradient contains a NaN.
    """
    weighted_grads = [
        [grad * rewards[step] for grad in step_grads]
        for step, step_grads in enumerate(gradients)
        if step_grads != 0
    ]
    for step_grads in weighted_grads:
        for grad in step_grads:
            if tf.reduce_any(tf.math.is_nan(grad)):
                return None
    return weighted_grads
    
def calc_mean_grads(grads):
    """Average per-step gradients into one gradient per variable.

    `grads` holds one list of gradients per step; the lists are regrouped per
    variable and each group is averaged along its first axis.
    """
    return [tf.reduce_mean(per_variable, axis=0) for per_variable in zip(*grads)]

def update_model(optimizer, model, gradients, rewards):
    """Apply one optimizer step using the reward-weighted mean gradients.

    The update is skipped (with a printed message) when the weighted gradients
    contain NaN values or when there are no gradients at all.

    Args:
        optimizer: Keras optimizer used to apply the gradients.
        model: model whose trainable variables are updated.
        gradients: per-step gradient lists (0 for steps without gradients).
        rewards: per-step weights, indexed like `gradients`.

    Returns:
        The (possibly updated) model.
    """
    weighted_grads = weigh_grads(gradients, rewards)

    if weighted_grads is None:
        print("NaN values detected in weighted gradients. Model parameters were not updated.")
    elif len(weighted_grads) == 0:
        # Nothing to average: calling apply_gradients with an empty iterable would fail.
        print("No gradients were collected. Model parameters were not updated.")
    else:
        mean_grads = calc_mean_grads(weighted_grads)
        optimizer.apply_gradients(zip(mean_grads, model.trainable_variables))
    return model

def plot_rewards(rewards_for_model, lag = 10, lag2 = 100):
    """Plot two moving averages of the episode rewards to track training progress.

    The orange line averages over `lag` episodes and the black line over `lag2`
    episodes. The latest value of the `lag2` average is marked with a dashed
    horizontal line once at least `lag2` episodes are available.
    """
    n_episodes = len(rewards_for_model)

    def moving_average(window):
        return [np.mean(rewards_for_model[end-window:end]) for end in range(window, n_episodes + 1)]

    mean_rewards = moving_average(lag)
    mean_rewards2 = moving_average(lag2)

    plt.figure(figsize=(12, 3))
    plt.plot(range(lag, n_episodes + 1), mean_rewards, 'orange')
    plt.plot(range(lag2, n_episodes + 1), mean_rewards2, 'black')
    if mean_rewards2 != []:
        plt.axhline(y=mean_rewards2[-1], color='lightgray', linestyle='--')
    plt.xlabel('Episode')
    plt.title(f'Mean rewards for the model over {lag} (orange) and {lag2} (black) episodes')
    plt.show()

def train_model(model, calibrate_from_model, benchmark_for_stopping_calibration = 80, text = '', benchmark_for_halving_lr = 0, 
                learning_rate=0.0025, limit_of_simulations = 10000, amt_games_to_average = 4):
    """Train the model with policy gradients by letting it play as 'P'.

    Args:
        model: the Keras model to train.
        calibrate_from_model: when True the opponent is a frozen copy of `model`
            taken at the start; otherwise the opponent is the random player.
        benchmark_for_stopping_calibration: training stops once the model has won
            more than this many of the last 100 games.
        text: label printed with every progress report.
        benchmark_for_halving_lr: if non-zero, the learning rate is halved once
            (with a fresh Adam optimizer) when the wins in the last 100 games
            exceed this value.
        learning_rate: initial Adam learning rate.
        limit_of_simulations: maximum number of games to play.
        amt_games_to_average: number of games whose gradients are averaged into a
            single optimizer step.

    Returns:
        The trained model.
    """
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    rewards_for_model = []
    all_games_winner = []
    learning_rate_halved = False

    if calibrate_from_model:
        benchmark_model = keras.models.clone_model(model)
        benchmark_model.set_weights(model.get_weights())

    for i in range(round(limit_of_simulations/amt_games_to_average)):

        all_P_rewards = []
        all_P_grads = []

        for j in range(amt_games_to_average): # We update model only after averaging over amt_games_to_average (e.g. 4) games

            if calibrate_from_model:
                game_winner, P_grads, O_grads, P_rewards, O_rewards = play_one_game(model, benchmark_model, 
                                                                                    print_rounds = False, calculate_grads = True)
            else:
                game_winner, P_grads, O_grads, P_rewards, O_rewards = play_one_game(model, 'random', 
                                                                                    print_rounds = False, calculate_grads = True)
            
            sum_P_rewards = sum(P_rewards)

            rewards_for_model.append(sum_P_rewards)

            all_games_winner.append(game_winner)

            # Games whose rewards sum to zero are left out of the update.
            if sum_P_rewards != 0:
                all_P_rewards = all_P_rewards + P_rewards
                all_P_grads = all_P_grads + P_grads

        all_P_rewards_normalized = normalize_rewards(all_P_rewards)
        model = update_model(optimizer, model, all_P_grads, all_P_rewards_normalized)

        # Window over the most recent 100 games, including the game just played.
        games_won_by_model = sum(1 for winner in all_games_winner[-100:] if winner == 'P')

        if (i+1) % 5 == 0: # Check the gradient impact
            clear_output(wait=True)
            avg_reward = np.mean(rewards_for_model[-100:])
            print(text)
            print(f'Step {amt_games_to_average*(i + 1)}')
            print(f'Rewards in last game: {P_rewards}')
            print(f'! Percentage of games won by model in last 100 games: {games_won_by_model}%')
            if benchmark_for_halving_lr != 0 and not learning_rate_halved and games_won_by_model > benchmark_for_halving_lr:
                learning_rate = learning_rate / 2
                optimizer = keras.optimizers.Adam(learning_rate = learning_rate)
                learning_rate_halved = True
            print(f'! Average reward in last 100 games: {np.round(avg_reward,2)}')
            print(f'Learning rate in last game: {learning_rate}')
            # Compare the size of the last move's gradients with the size of the weights.
            for k in range(len(model.trainable_variables)):
                if P_grads[-1] != 0:
                    update_ratio = np.abs(P_grads[-1][k].numpy()).sum() / np.abs(model.trainable_variables[k].numpy()).sum()
                    print(f"Sum of gradients: {np.abs(P_grads[-1][k].numpy()).sum():.5f}, fraction of model variables: {update_ratio:.5f}")
            plot_rewards(rewards_for_model, lag = 10, lag2 = 100)

        if games_won_by_model > benchmark_for_stopping_calibration:
            break # End training if the model wins enough of the last 100 games

    return model

# Define the reinforcement learning algorithm

### Define the model

In [None]:
def table_to_observation(len_O, T, P, history):
    """Translate the game state into the observation vector for the model.

    Layout of the returned vector (91 values):
        0-13   left open end of the chain (0-6: plain value, 7-13: double of that value)
        14-27  right open end, encoded the same way
        28-55  tiles in hand P (one slot per tile of the double-six set)
        56-83  tiles lying on the table T
        84-90  pip values of the open ends that the other side most recently
               could not match (it had to draw) in the current round

    `len_O` is accepted but not used.

    Returns:
        numpy.ndarray: array of shape (1, 91).
    """
    tile_order = [[i, j] for i in range(7) for j in range(i+1)]

    def full_tile_slot(tile):
        # Tiles on the table may be flipped, so look them up in canonical order.
        return tile_order.index(sorted(tile, reverse=True))

    def open_end_slot(tile, side):
        if tile[0] == tile[1]:
            return 7 + tile[0]
        return tile[0] if side == 'L' else tile[1]

    def values_that_drew(history):
        round_winners, whose_move, had_to_draw, T_dominos, now_to_move = history # The history of the game
        drawn_values = np.zeros(7)
        # Walk backwards through the moves of the current round.
        for back in range(1, len(whose_move)):
            if round_winners[-back] is not None:
                break
            if whose_move[-back] != now_to_move and had_to_draw[-back]:
                table_before = T_dominos[-back-1]
                drawn_values[table_before[0][0]] = 1
                drawn_values[table_before[-1][1]] = 1
                break
        return drawn_values

    table = np.zeros(28*3, dtype=int)

    if len(T) > 0:
        table[open_end_slot(T[0], 'L')] = 1
        table[14 + open_end_slot(T[-1], 'R')] = 1

    for tile in P:
        table[28 + full_tile_slot(tile)] = 1
    for tile in T:
        table[28*2 + full_tile_slot(tile)] = 1

    observation = np.concatenate((table, values_that_drew(history)))

    return observation.reshape(1, len(observation))

def calc_rewards(whose_move, O_dominos, T_dominos, P_dominos, points_earned, had_to_draw, round_winners, game_winners, all_grads):
    """Compute the per-move rewards and gradients of player 'P' from the game history.

    All history arguments are lists with one entry per move of the game
    (who moved, the hands and table after the move, points earned by the mover,
    whether the mover had to draw, and the round/game winner if the move ended one).

    The reward of a 'P' move is the sum of these components:
        * +2.5 / -2.5 at the end of a round, depending on whether 'P' collected
          at least as many points in that round as 'O'; discounted backwards
          over the round with factor 0.99
        * +10 when the following 'O' move had to draw
        * +2.5 when the hand of 'P' holds tiles matching both open ends
        * +2.5 for placing a double directly after an 'O' move
        * the points earned by the move, plus a term based on the points of the
          following 'O' move
        * a diversification bonus, currently disabled

    Returns:
        tuple: (P_grads, O_grads, P_rewards, O_rewards). `P_grads` and `P_rewards`
        hold one entry per 'P' move; `O_grads` and `O_rewards` are always empty lists.
    """
    P_rewards, P_grads = [], []
    O_rewards, O_grads = [], []
    n_moves = len(whose_move)
    P_rewards_for_winning_rounds = list(np.zeros(n_moves))
    P_rewards_for_drawing = list(np.zeros(n_moves))
    P_rewards_for_being_protected = list(np.zeros(n_moves))
    P_rewards_for_earning_points = list(np.zeros(n_moves))
    P_rewards_for_diversification = list(np.zeros(n_moves))
    P_rewards_for_placing_double = list(np.zeros(n_moves))

    def number_in_dominos(element, hand):
        # True when at least one tile of `hand` shows the pip value `element`.
        return any(element in tile for tile in hand)
    
    def filter_lists(list_to_filter, list_of_criteria, searched_criteria):
        return [list_to_filter[i] for i in range(len(list_of_criteria)) if list_of_criteria[i] == searched_criteria]
    
    def sum_lists(lists):
        return [round(sum(elements), 2) for elements in zip(*lists)]
    
    def placed_a_double_domino(T, T_before):
        # A double was placed when one end of the chain changed and the new end tile is a double.
        placed_double = False
        if T[0] != T_before[0] and T[0][0] == T[0][1]:
            placed_double = True
        elif T[-1] != T_before[-1] and T[-1][0] == T[-1][1]:
            placed_double = True
        return placed_double

    P_grads = filter_lists(all_grads, whose_move, 'P')

    points_in_round_P = 0
    points_in_round_O = 0
    for i in range(n_moves):
        round_still_open = round_winners[i] is None and game_winners[i] is None
        # The history can end in the middle of a round (move limit reached), so the
        # existence of a following move has to be checked before looking at it.
        opponent_moves_next = i + 1 < n_moves and whose_move[i+1] == 'O'
        hand_after_move = P_dominos[i]

        # Rewards for winning a round
        if whose_move[i] == 'P':
            points_in_round_P += points_earned[i]
        elif whose_move[i] == 'O':
            points_in_round_O += points_earned[i]
        if not round_still_open:
            if points_in_round_P >= points_in_round_O:
                P_rewards_for_winning_rounds[i] += 2.5
            else:
                P_rewards_for_winning_rounds[i] += -2.5
            points_in_round_P, points_in_round_O = 0, 0
        # Rewards for making opponent draw
        if round_still_open and whose_move[i] == 'P' and opponent_moves_next and had_to_draw[i+1]:
            P_rewards_for_drawing[i] += 10 # 7.5
        # Rewards for being protected from both sides: the hand of 'P' at this move
        # (not the whole hand history) must match both open ends.
        if len(T_dominos[i]) > 0 and len(hand_after_move) > 0 and \
            number_in_dominos(T_dominos[i][0][0], hand_after_move) and \
            number_in_dominos(T_dominos[i][-1][1], hand_after_move):
            P_rewards_for_being_protected[i] += 2.5
        # Reward for getting rid of a double domino
        if i > 0 and round_still_open and \
            whose_move[i] == 'P' and whose_move[i-1] == 'O' and placed_a_double_domino(T_dominos[i], T_dominos[i-1]):
            P_rewards_for_placing_double[i] = 2.5
        # Reward for having a diversified set of dominos.
        # Disabled: range(0) never iterates, so this component always stays 0.
        if whose_move[i] == 'P':
            for x in range(0):
                P_rewards_for_diversification[i] += number_in_dominos(x, hand_after_move) / 2
        # Rewards for earning points
        P_rewards_for_earning_points[i] = points_earned[i]
        # Punishment for the opponent earning points
        # NOTE(review): the term is added, so points scored by the opponent increase
        # the reward of 'P' although the comment calls it a punishment - confirm the sign.
        if round_still_open and whose_move[i] == 'P' and opponent_moves_next:
            P_rewards_for_earning_points[i] += min(points_earned[i+1], 20) / 5

    P_rewards_for_winning_rounds = discount_rewards_per_round(P_rewards_for_winning_rounds, round_winners, game_winners, 0.99)

    # Keep only the entries that belong to moves made by 'P'.
    P_rewards_for_winning_rounds = filter_lists(P_rewards_for_winning_rounds, whose_move, 'P')
    P_rewards_for_drawing = filter_lists(P_rewards_for_drawing, whose_move, 'P')
    P_rewards_for_being_protected = filter_lists(P_rewards_for_being_protected, whose_move, 'P')
    P_rewards_for_earning_points = filter_lists(P_rewards_for_earning_points, whose_move, 'P')
    P_rewards_for_diversification = filter_lists(P_rewards_for_diversification, whose_move, 'P')
    P_rewards_for_placing_double = filter_lists(P_rewards_for_placing_double, whose_move, 'P')

    P_rewards = sum_lists([P_rewards_for_winning_rounds,
                           P_rewards_for_drawing,
                           P_rewards_for_being_protected,
                           P_rewards_for_earning_points,
                           P_rewards_for_diversification,
                           P_rewards_for_placing_double])

    return P_grads, O_grads, P_rewards, O_rewards

In [None]:
@keras.utils.register_keras_serializable() # Registering the custom layer with Keras
class MaskedSoftmax(keras.layers.Layer):
    """Softmax over logits in which masked-out (illegal) actions get a probability of practically 0."""
    def call(self, inputs, mask):
        legal = tf.cast(mask, tf.float32)
        # Entries with mask 0 are shifted by -1e9 so that the softmax drives them to 0.
        illegal_penalty = (1 - legal) * -1e9
        return tf.nn.softmax(inputs + illegal_penalty)

# Observation size: 28 * 3 table/hand features plus 7 "had to draw" values,
# matching the vector built by table_to_observation.
n_inputs = 28 * 3 + 7
# One output per (tile, side) pair: 28 tiles on the left end and 28 on the right end.
n_outputs = 56
dropout_rate = 0.1

# Two inputs: the observation and the legal-action mask used by MaskedSoftmax.
observation_input = keras.layers.Input(shape=(n_inputs,))
legal_actions_input = keras.layers.Input(shape=(n_outputs,))
# Two hidden layers of 100 ELU units, each followed by dropout.
x = keras.layers.Dense(100, activation="elu")(observation_input)
x = keras.layers.Dropout(rate=dropout_rate)(x)
x = keras.layers.Dense(100, activation="elu")(x)
x = keras.layers.Dropout(rate=dropout_rate)(x)
logits = keras.layers.Dense(n_outputs)(x)
# Softmax restricted to the legal actions.
action_probabilities = MaskedSoftmax()(logits, legal_actions_input)

# The loss is also read back as model.loss by choose_domino_using_model.
loss_fn = keras.losses.categorical_crossentropy

model = keras.models.Model(inputs=[observation_input, legal_actions_input], outputs=action_probabilities)
model.compile(loss=loss_fn, metrics=['accuracy'])

# model.summary()

### Check that pre-training model vs. random algorithm wins 50% of games

In [None]:
# check_model_vs_model_distribution(model, 'random', 64, 64, print_rounds = False, calculate_grads = False)

### Train the model

In [None]:
# File from which the trained model is loaded (and to which it can be saved).
model_file_path = 'dominoes.h5'

# # UNCOMMENT THE BLOCK BELOW IF YOU WANT TO TRAIN THE NETWORK
# # The training alternates between playing against the random player
# # (calibrate_from_model = False) and against a copy of the model itself
# # (calibrate_from_model = True), recording a timestamp before every step.

# times = []

# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = False, benchmark_for_stopping_calibration = 75, text = 'TRAINING STEP 1')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = True, benchmark_for_stopping_calibration = 65, text = 'TRAINING STEP 2')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = False, benchmark_for_stopping_calibration = 80, text = 'TRAINING STEP 3')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = True, benchmark_for_stopping_calibration = 65, text = 'TRAINING STEP 4')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = False, benchmark_for_stopping_calibration = 85, text = 'TRAINING STEP 5')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = True, benchmark_for_stopping_calibration = 60, text = 'TRAINING STEP 6')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = False, benchmark_for_stopping_calibration = 90, text = 'TRAINING STEP 7')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = True, benchmark_for_stopping_calibration = 60, text = 'TRAINING STEP 8')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
# model = train_model(model, calibrate_from_model = True, benchmark_for_stopping_calibration = 60, text = 'TRAINING STEP 9')
# times.append(dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# print(f'times: {times}')

# play_sound()

# # model.save(model_file_path)

# LOAD A TRAINED NEURAL NETWORK
# The custom MaskedSoftmax layer is passed explicitly so that Keras can rebuild it.
# NOTE: this replaces the untrained `model` defined above.

model = keras.models.load_model(model_file_path, custom_objects={'MaskedSoftmax': MaskedSoftmax})

### Check model performance after training

In [None]:
# Evaluate the model against the random player: 100 batches of 100 games each.
# The trailing figures are presumably the win rate observed in an earlier run - TODO confirm.
check_model_vs_model_distribution(model, 'random', 100, 100, print_rounds = False, calculate_grads = False) # 82.3%, std: 3.7%
# Audible signal that the evaluation has finished.
play_sound()

### Have a look at a game played by the model

In [None]:
# Play one game against the random player and print the board after every move.
# NOTE(review): `x` was also used as the layer variable in the model-definition cell;
# the result tuple stored here is (game_winner, P_grads, O_grads, P_rewards, O_rewards).
x = play_one_game(model, 'random', print_rounds = True, calculate_grads = True)