## Use bonus plates (length of 4)
## Important notes:
### - new color model: number of channels = 4, one channel for each color  
### - plate value encoded with a number: 0.1, 0.4, 1.0  

In [50]:
import pandas as pd
import random as rd
import numpy as np
#import qgrid

import tensorflow as tf
import environment as ae

from collections import deque

#import matplotlib.pyplot as plt
#import matplotlib.patches as mpatches
#%pylab inline

## Constants

In [51]:
# CNN
LEARNING_RATE = 0.005
UPDATE_TARGET_NET = 1000

# Definitions
GAMES_TO_PLAY = 120001
REPLAY_MEMORY_SIZE = 131072
DYNAMIC_LEARNING_EPOCHS = 5
MINIBATCH_SIZE = 64
NUMBER_OF_MOVES_IN_GAME = 50
GAMMA = 0.99
ACTIONS_DIMENSION = 142

# Variables
MAXIMUM_SCORE = 0
TOTAL_SCORE_500 = 0.0
AVG_SCORE_HIST = []
TOTAL_SUCCESSFUL_MOVES_500 = 0.0
AVG_SUCC_MOVES_HIST = []
CNN_MOVE_PROB = 0.1
CNN_MOVES_COUNT = 0
CNN_SUCCESSFUL_PREDICTION = 0

# Initialize replay_memory
replay_memory = []

### Replay Memory Buffer

In [52]:
#
# Replay memory buffer
#
class ExperienceBuffer():
    '''
    Experience Replay Buffer
    Inspired by Andrea Lonza
    '''

    def __init__(self, buffer_size, gamma):
        # Constants
        self.gamma = gamma
        
        # Main Replay Memory buffer parts
        self.states_before = deque(maxlen=buffer_size)
        self.actions = deque(maxlen=buffer_size)
        self.total_rewards = deque(maxlen=buffer_size)
        self.states_after = deque(maxlen=buffer_size)
        self.last_moves = deque(maxlen=buffer_size)
   
    
    def add(self, state_before, action, reward, state_after, last_move):
        # Add certain items to corresponding buffers
        self.states_before.append(state_before)
        self.actions.append(action)
        self.total_rewards.append(reward)
        self.states_after.append(state_after)
        self.last_moves.append(last_move)
    
    
    def sample_minibatch(self, minibatch_size):
        '''
        Sample a minibatch of size batch_size
        Note1: always add the most recent completed move
        '''
        indices = rd.sample(range(len(self.states_before) - 1), minibatch_size - 1)
        # Add the most recent completed move index
        indices.append(len(self.states_before) - 1)
        
        minibatch_states_before = np.array([self.states_before[i] for i in indices]) 
        minibatch_actions = np.array([self.actions[i] for i in indices]) 
        minibatch_total_rewards = np.array([self.total_rewards[i] for i in indices]) 
        minibatch_states_after = np.array([self.states_after[i] for i in indices])  
        minibatch_last_moves = np.array([self.last_moves[i] for i in indices])   
        
        return minibatch_states_before, minibatch_actions, minibatch_total_rewards, minibatch_states_after, minibatch_last_moves
    
    
    def __len__(self):
        '''
        Return length of the current replay memory buffer
        Relevant for the first *minibatch_size* moves.
        '''
        return len(self.states_before)
    
    

## Deep Q-Nework

### Initialize Replay Buffer

In [53]:
replay_memory = ExperienceBuffer(REPLAY_MEMORY_SIZE, GAMMA)

### Initialize Online CNN and Target CNN

In [54]:
# Online CNN
Online_CNN = tf.keras.models.Sequential()
Online_CNN.add(tf.keras.layers.Conv2D(128, kernel_size=3, strides = (1, 1), padding='same', activation=tf.keras.activations.tanh, data_format = 'channels_last', input_shape=(7, 6, 4)))
Online_CNN.add(tf.keras.layers.Conv2D(192, kernel_size=3, strides = (1, 1), padding='same', activation=tf.keras.activations.tanh))    
Online_CNN.add(tf.keras.layers.Flatten())                      
Online_CNN.add(tf.keras.layers.Dense(ae.ACTIONS_DIMENSION, activation=tf.keras.activations.tanh, kernel_initializer='RandomNormal'))

# Target CNN
Target_CNN = tf.keras.models.Sequential()
Target_CNN.add(tf.keras.layers.Conv2D(128, kernel_size=3, strides = (1, 1), padding='same', activation=tf.keras.activations.tanh, data_format = 'channels_last', input_shape=(7, 6, 4)))
Target_CNN.add(tf.keras.layers.Conv2D(192, kernel_size=3, strides = (1, 1), padding='same', activation=tf.keras.activations.tanh))    
Target_CNN.add(tf.keras.layers.Flatten())                      
Target_CNN.add(tf.keras.layers.Dense(ae.ACTIONS_DIMENSION, activation=tf.keras.activations.tanh, kernel_initializer='RandomNormal'))

# Initialize optimizer
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)

# Set weights equal
Target_CNN.set_weights(Online_CNN.get_weights())

In [55]:
# Number of moves made to follow the target CNN update strategy
total_moves = 1

In [56]:
def loss(model, X, y_true, A):
    prediction = model(X)
    selected_action_values = tf.math.reduce_sum(prediction*A, axis=1)  
    return tf.keras.losses.MSE(y_true, selected_action_values)


def grad(model, inputs, targets, actions):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, actions)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)

## Dynamic Learning

In [57]:
%%time
                                                      
for game in range(GAMES_TO_PLAY):
    # Start one game
    game_score = 0
    successful_moves = 0

    # Initialize the game field
    field = np.zeros((7, 6, 4))
    field = ae.initialize_field_3D(field)

    for m in range(NUMBER_OF_MOVES_IN_GAME):
        # Total score of one move
        reward = 0

        # Whether CNN made the move
        cnn_made_move_flag = False
        
        # If replay_memory has less than 64 moves, then make a random move
        if ((len(replay_memory) < MINIBATCH_SIZE) or (rd.random() > CNN_MOVE_PROB)):
            move = rd.randint(1, ae.ACTIONS_DIMENSION)
        else:
            # CNN selects a move
            cnn_made_move_flag = True
            CNN_MOVES_COUNT = CNN_MOVES_COUNT + 1
            X_data = np.expand_dims(np.copy(field), axis=0)
            move = Target_CNN.predict(X_data).argmax() + 1

        # Make the move
        new_field, plate_a, plate_b = ae.make_move_v2_3D(field.copy(), move, ae.MOVES)

        # Calculate the score and update the field
        score, new_field = ae.calculate_score_v2_3D(new_field, plate_a, plate_b)
        
        # If the move is successful, then update the field and check if we have new sets        
        successful_move_flag = False

        # While we have new sets (thus the score is greater than 0), process them, calculate score and move plates
        while (score > 0.):
            if (not successful_move_flag):
                successful_moves = successful_moves + 1
                successful_move_flag = True

            # Add new points to the total score of the move
            reward = reward + score

            # Move plates downward, fill the upper row so, that it doesn't have "easy" sets of three
            # Start from the left lower corner (in order to reuse color_fits())
            new_field = ae.fill_field_3D(new_field)

            # Calculate score and check whether we have new sets
            score, new_field = ae.calculate_score_v2_3D(new_field, (-1, -1), (-1, -1))

        # Increase the score of the whole game
        game_score = game_score + reward
        
        # Check whether it's the last move of the current game
        last_move = m == NUMBER_OF_MOVES_IN_GAME - 1
        
        # Add new move to the replay memory
        replay_memory.add(field, move, reward, new_field, last_move)
        
        # Update CNN move statistics
        if (successful_move_flag) and (cnn_made_move_flag):
            CNN_SUCCESSFUL_PREDICTION = CNN_SUCCESSFUL_PREDICTION + 1               
        
        #
        # Train CNN based on the score
        #
        if (len(replay_memory) >= MINIBATCH_SIZE):
            # Select random MINIBATCH_SIZE moves from replay memory buffer
            samples = replay_memory.sample_minibatch(MINIBATCH_SIZE)

            # Prepare some things for training
            s_before = samples[0]
            actions = samples[1]
            rewards = samples[2]
            s_after = samples[3] 
            dones = samples[4]
            
            rewards_next = np.max(Target_CNN(s_after), axis=1)
            actual_values = np.where(dones, rewards, rewards + GAMMA*rewards_next)
            selected_actions = tf.one_hot(actions, ae.ACTIONS_DIMENSION)
            
            # Update online CNN weights: training step
            for _ in range(DYNAMIC_LEARNING_EPOCHS):             
                loss_value, grads = grad(Online_CNN, s_before, actual_values, selected_actions)
                optimizer.apply_gradients(zip(grads, Online_CNN.trainable_variables))

        # If move is successful, update the play field
        if (successful_move_flag):
            field = np.copy(new_field)
            
        # After each UPDATE_TARGET_NET moves update target CNN
        if (total_moves % UPDATE_TARGET_NET == 0):
            Target_CNN.set_weights(Online_CNN.get_weights())
            
        total_moves = total_moves + 1

    #
    # Calculate and display overall stats
    #
    # Check whether we have new maximum score
    if (game_score > MAXIMUM_SCORE):
        print(f"New maximum: {game_score}, after {game} games.")
        MAXIMUM_SCORE = game_score
        
    # After each 500 games output average game score, average number of successful moves per game
    TOTAL_SCORE_500 = TOTAL_SCORE_500 + game_score
    TOTAL_SUCCESSFUL_MOVES_500 = TOTAL_SUCCESSFUL_MOVES_500 + successful_moves
    
    if ((game % 500 == 0) and (game > 0)):
        avg_score = TOTAL_SCORE_500 / 500
        TOTAL_SCORE_500 = 0.0
        
        avg_succ_moves = TOTAL_SUCCESSFUL_MOVES_500 / 500
        TOTAL_SUCCESSFUL_MOVES_500 = 0.0

        print(f"Games: {game}, last 500 games avg score: {avg_score}, avg of successful moves: {avg_succ_moves}, loss {loss_value.numpy()}")        
        print(f"CNN made {CNN_MOVES_COUNT} moves. Successful were {CNN_SUCCESSFUL_PREDICTION}")
        
        if (CNN_SUCCESSFUL_PREDICTION / CNN_MOVES_COUNT >= CNN_MOVE_PROB):
            CNN_MOVE_PROB = CNN_MOVE_PROB + 0.1
            
        CNN_MOVES_COUNT = 0
        CNN_SUCCESSFUL_PREDICTION = 0


New maximum: 9, after 0 games.
New maximum: 78, after 1 games.
New maximum: 105, after 7 games.
New maximum: 106, after 22 games.
New maximum: 107, after 35 games.
New maximum: 148, after 51 games.
New maximum: 158, after 122 games.
New maximum: 165, after 249 games.
Games: 500, last 500 games avg score: 61.29, avg of successful moves: 9.328, loss 7.2675933837890625
CNN made 2502 moves. Successful were 273
Games: 1000, last 500 games avg score: 56.756, avg of successful moves: 8.72, loss 12.21103572845459
CNN made 5006 moves. Successful were 369
Games: 1500, last 500 games avg score: 53.518, avg of successful moves: 8.38, loss 6.176033973693848
CNN made 5068 moves. Successful were 337
New maximum: 176, after 1552 games.
Games: 2000, last 500 games avg score: 54.928, avg of successful moves: 8.648, loss 26.5347843170166
CNN made 5041 moves. Successful were 419
Games: 2500, last 500 games avg score: 55.42, avg of successful moves: 8.64, loss 12.388847351074219
CNN made 4911 moves. Succes

Games: 27500, last 500 games avg score: 54.822, avg of successful moves: 8.612, loss 15.916345596313477
CNN made 4971 moves. Successful were 386
Games: 28000, last 500 games avg score: 54.076, avg of successful moves: 8.398, loss 11.530721664428711
CNN made 5137 moves. Successful were 341
Games: 28500, last 500 games avg score: 52.412, avg of successful moves: 8.37, loss 7.795097827911377
CNN made 5060 moves. Successful were 378
Games: 29000, last 500 games avg score: 54.612, avg of successful moves: 8.404, loss 11.430401802062988
CNN made 5143 moves. Successful were 374
Games: 29500, last 500 games avg score: 55.25, avg of successful moves: 8.722, loss 56.90291213989258
CNN made 4989 moves. Successful were 403
Games: 30000, last 500 games avg score: 55.496, avg of successful moves: 8.716, loss 7.052910327911377
CNN made 4906 moves. Successful were 401
Games: 30500, last 500 games avg score: 54.866, avg of successful moves: 8.652, loss 11.29290771484375
CNN made 4978 moves. Successful 

Games: 56000, last 500 games avg score: 55.996, avg of successful moves: 8.674, loss 24.961971282958984
CNN made 4926 moves. Successful were 370
Games: 56500, last 500 games avg score: 56.244, avg of successful moves: 8.568, loss 7.239782810211182
CNN made 4959 moves. Successful were 391
Games: 57000, last 500 games avg score: 56.614, avg of successful moves: 8.71, loss 20.184162139892578
CNN made 5026 moves. Successful were 400
Games: 57500, last 500 games avg score: 57.18, avg of successful moves: 8.778, loss 12.329473495483398
CNN made 5060 moves. Successful were 364
Games: 58000, last 500 games avg score: 53.714, avg of successful moves: 8.622, loss 9.869783401489258
CNN made 4974 moves. Successful were 359
Games: 58500, last 500 games avg score: 54.164, avg of successful moves: 8.392, loss 10.709158897399902
CNN made 4993 moves. Successful were 364
Games: 59000, last 500 games avg score: 53.884, avg of successful moves: 8.432, loss 15.418221473693848
CNN made 4983 moves. Successfu

Games: 84500, last 500 games avg score: 54.944, avg of successful moves: 8.638, loss 10.141658782958984
CNN made 4971 moves. Successful were 395
Games: 85000, last 500 games avg score: 54.834, avg of successful moves: 8.634, loss 6.382908821105957
CNN made 5105 moves. Successful were 361
Games: 85500, last 500 games avg score: 55.682, avg of successful moves: 8.566, loss 26.851966857910156
CNN made 5101 moves. Successful were 394
Games: 86000, last 500 games avg score: 54.176, avg of successful moves: 8.646, loss 8.87415885925293
CNN made 4974 moves. Successful were 352
Games: 86500, last 500 games avg score: 55.124, avg of successful moves: 8.53, loss 25.591655731201172
CNN made 5020 moves. Successful were 351
Games: 87000, last 500 games avg score: 58.122, avg of successful moves: 8.966, loss 12.482908248901367
CNN made 4973 moves. Successful were 400
Games: 87500, last 500 games avg score: 54.578, avg of successful moves: 8.432, loss 11.969783782958984
CNN made 4986 moves. Successfu

Games: 113000, last 500 games avg score: 56.696, avg of successful moves: 8.748, loss 14.246347427368164
CNN made 5027 moves. Successful were 362
Games: 113500, last 500 games avg score: 54.608, avg of successful moves: 8.516, loss 6.800095558166504
CNN made 4965 moves. Successful were 403
Games: 114000, last 500 games avg score: 53.656, avg of successful moves: 8.392, loss 5.679470062255859
CNN made 5006 moves. Successful were 380
Games: 114500, last 500 games avg score: 55.112, avg of successful moves: 8.504, loss 19.49478530883789
CNN made 4840 moves. Successful were 339
Games: 115000, last 500 games avg score: 54.542, avg of successful moves: 8.578, loss 9.685721397399902
CNN made 5012 moves. Successful were 391
Games: 115500, last 500 games avg score: 56.404, avg of successful moves: 8.878, loss 22.612281799316406
CNN made 5016 moves. Successful were 379
Games: 116000, last 500 games avg score: 55.85, avg of successful moves: 8.544, loss 12.922597885131836
CNN made 5027 moves. Suc

In [None]:
# tensorboard --logdir=./logs --bind_all &

print(MAXIMUM_SCORE)

## Test part

In [None]:
#field = np.zeros((7, 6))

df = pd.DataFrame(updated_field)
qgrid_widget = qgrid.show_grid(df, show_toolbar=True)
qgrid_widget

In [None]:
updated_df = qgrid_widget.get_changed_df()
updated_field = updated_df.values
visualize_field(updated_field)

In [None]:
get_sets(updated_field)

In [None]:
score, new_field = calculate_score_v2(updated_field, (6, 3), (5, 3))
print(score)
visualize_field(updated_field)

In [None]:
#updated_field_2 = fill_field(updated_field, colors)
#visualize_field(updated_field_2)

### Now the CNN has been trained.
### Start the long reinforcement-learning cycle

In [None]:
successful_moves

In [None]:
new_field = make_move(field, move)
print(new_field)

In [None]:
temp_field = calculate_score(new_field)

In [None]:
new_field = np.multiply(new_field, 1.0 - temp_field)

In [None]:
fill_field(new_field, colors)

In [None]:
#
# Save model
#
# v1: 20190329, trained on len(replay_memory) = 294912
#aero_cnn.save("Aero_CNN_v1")

## Backup

In [None]:
#
# Create the moves dictionary
#
moves = {}

for i in range(1, 143):
    old_row, old_column, old_direction = process_move_142(i)
    
    start_row = old_row - 1
    start_col = old_column - 1
    
    if (old_direction == "down"):
        end_row = start_row + 1
        end_col = start_col
    elif (old_direction == "up"):
        end_row = start_row - 1
        end_col = start_col
    elif (old_direction == "right"):
        end_row = start_row
        end_col = start_col + 1
    else:
        end_row = start_row
        end_col = start_col - 1
        
    moves[i] = ((start_row, start_col), (end_row, end_col))
    
print(moves)