## Use bonus plates (length of 4)
## Important notes:
### - new color model: number of channels = 4, one channel for each color  
### - plate value encoded with a number: 0.1, 0.4, 1.0  

In [58]:
import pandas as pd
import random as rd
import numpy as np
#import qgrid

import tensorflow as tf
import environment as ae

from collections import deque

#import matplotlib.pyplot as plt
#import matplotlib.patches as mpatches
#%pylab inline

## Constants

In [59]:
# CNN
LEARNING_RATE = 0.005
UPDATE_TARGET_NET = 1000

# Definitions
GAMES_TO_PLAY = 131072
REPLAY_MEMORY_SIZE = 131072
DYNAMIC_LEARNING_EPOCHS = 5
MINIBATCH_SIZE = 64
NUMBER_OF_MOVES_IN_GAME = 50
GAMMA = 0.99
ACTIONS_DIMENSION = 142

# Variables
MAXIMUM_SCORE = 0
TOTAL_SCORE_500 = 0.0
AVG_SCORE_HIST = []
TOTAL_SUCCESSFUL_MOVES_500 = 0.0
AVG_SUCC_MOVES_HIST = []
CNN_MOVE_PROB = 0.1
CNN_MOVES_COUNT = 0
CNN_SUCCESSFUL_PREDICTION = 0

# Initialize replay_memory
replay_memory = []

### Replay Memory Buffer

In [60]:
#
# Replay memory buffer
#
class ExperienceBuffer():
    '''
    Experience Replay Buffer
    Inspired by Andrea Lonza
    '''

    def __init__(self, buffer_size, gamma):
        # Constants
        self.gamma = gamma
        
        # Main Replay Memory buffer parts
        self.states_before = deque(maxlen=buffer_size)
        self.actions = deque(maxlen=buffer_size)
        self.total_rewards = deque(maxlen=buffer_size)
        self.states_after = deque(maxlen=buffer_size)
        self.last_moves = deque(maxlen=buffer_size)
   
    
    def add(self, state_before, action, reward, state_after, last_move):
        # Add certain items to corresponding buffers
        self.states_before.append(state_before)
        self.actions.append(action)
        self.total_rewards.append(reward)
        self.states_after.append(state_after)
        self.last_moves.append(last_move)
    
    
    def sample_minibatch(self, minibatch_size):
        '''
        Sample a minibatch of size batch_size
        Note1: always add the most recent completed move
        '''
        indices = rd.sample(range(len(self.states_before) - 1), minibatch_size - 1)
        # Add the most recent completed move index
        indices.append(len(self.states_before) - 1)
        
        minibatch_states_before = np.array([self.states_before[i] for i in indices]) 
        minibatch_actions = np.array([self.actions[i] for i in indices]) 
        minibatch_total_rewards = np.array([self.total_rewards[i] for i in indices]) 
        minibatch_states_after = np.array([self.states_after[i] for i in indices])  
        minibatch_last_moves = np.array([self.last_moves[i] for i in indices])   
        
        return minibatch_states_before, minibatch_actions, minibatch_total_rewards, minibatch_states_after, minibatch_last_moves
    
    
    def __len__(self):
        '''
        Return length of the current replay memory buffer
        Relevant for the first *minibatch_size* moves.
        '''
        return len(self.states_before)
    
    

## Deep Q-Nework

### Initialize Replay Buffer

In [61]:
replay_memory = ExperienceBuffer(REPLAY_MEMORY_SIZE, GAMMA)

### Initialize Online CNN and Target CNN

In [62]:
# Online CNN
Online_CNN = tf.keras.models.Sequential()
Online_CNN.add(tf.keras.layers.Conv2D(128, kernel_size=3, strides = (1, 1), padding='same', activation=tf.keras.activations.tanh, data_format = 'channels_last', input_shape=(7, 6, 4)))
Online_CNN.add(tf.keras.layers.Conv2D(192, kernel_size=3, strides = (1, 1), padding='same', activation=tf.keras.activations.tanh))    
Online_CNN.add(tf.keras.layers.Flatten())                      
Online_CNN.add(tf.keras.layers.Dense(ae.ACTIONS_DIMENSION, activation=tf.keras.activations.relu, kernel_initializer='RandomNormal'))

# Target CNN
Target_CNN = tf.keras.models.Sequential()
Target_CNN.add(tf.keras.layers.Conv2D(128, kernel_size=3, strides = (1, 1), padding='same', activation=tf.keras.activations.tanh, data_format = 'channels_last', input_shape=(7, 6, 4)))
Target_CNN.add(tf.keras.layers.Conv2D(192, kernel_size=3, strides = (1, 1), padding='same', activation=tf.keras.activations.tanh))    
Target_CNN.add(tf.keras.layers.Flatten())                      
Target_CNN.add(tf.keras.layers.Dense(ae.ACTIONS_DIMENSION, activation=tf.keras.activations.relu, kernel_initializer='RandomNormal'))

# Initialize optimizer
optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)

# Set weights equal
Target_CNN.set_weights(Online_CNN.get_weights())

In [63]:
# Number of moves made to follow the target CNN update strategy
total_moves = 1

In [64]:
def loss(model, X, y_true, A):
    prediction = model(X)
    selected_action_values = tf.math.reduce_sum(prediction*A, axis=1)  
    return tf.keras.losses.MSE(y_true, selected_action_values)


def grad(model, inputs, targets, actions):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, actions)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)

## Dynamic Learning

In [None]:
%%time
                                                      
for game in range(GAMES_TO_PLAY):
    # Start one game
    game_score = 0
    successful_moves = 0

    # Initialize the game field
    field = np.zeros((7, 6, 4))
    field = ae.initialize_field_3D(field)

    for m in range(NUMBER_OF_MOVES_IN_GAME):
        # Total score of one move
        reward = 0

        # Whether CNN made the move
        cnn_made_move_flag = False
        
        # If replay_memory has less than 64 moves, then make a random move
        if ((len(replay_memory) < MINIBATCH_SIZE) or (rd.random() > CNN_MOVE_PROB)):
            move = rd.randint(1, ae.ACTIONS_DIMENSION)
        else:
            # CNN selects a move
            cnn_made_move_flag = True
            CNN_MOVES_COUNT = CNN_MOVES_COUNT + 1
            X_data = np.expand_dims(np.copy(field), axis=0)
            move = Target_CNN.predict(X_data).argmax() + 1

        # Make the move
        new_field, plate_a, plate_b = ae.make_move_v2_3D(field.copy(), move, ae.MOVES)

        # Calculate the score and update the field
        score, new_field = ae.calculate_score_v2_3D(new_field, plate_a, plate_b)
        
        # If the move is successful, then update the field and check if we have new sets        
        successful_move_flag = False

        # While we have new sets (thus the score is greater than 0), process them, calculate score and move plates
        while (score > 0.):
            if (not successful_move_flag):
                successful_moves = successful_moves + 1
                successful_move_flag = True

            # Add new points to the total score of the move
            reward = reward + score

            # Move plates downward, fill the upper row so, that it doesn't have "easy" sets of three
            # Start from the left lower corner (in order to reuse color_fits())
            new_field = ae.fill_field_3D(new_field)

            # Calculate score and check whether we have new sets
            score, new_field = ae.calculate_score_v2_3D(new_field, (-1, -1), (-1, -1))

        # Increase the score of the whole game
        game_score = game_score + reward
        
        # Check whether it's the last move of the current game
        last_move = m == NUMBER_OF_MOVES_IN_GAME - 1
        
        # Add new move to the replay memory
        replay_memory.add(field, move, reward, new_field, last_move)
        
        # Update CNN move statistics
        if (successful_move_flag) and (cnn_made_move_flag):
            CNN_SUCCESSFUL_PREDICTION = CNN_SUCCESSFUL_PREDICTION + 1               
        
        #
        # Train CNN based on the score
        #
        if (len(replay_memory) >= MINIBATCH_SIZE):
            # Select random MINIBATCH_SIZE moves from replay memory buffer
            samples = replay_memory.sample_minibatch(MINIBATCH_SIZE)

            # Prepare some things for training
            s_before = samples[0]
            actions = samples[1]
            rewards = samples[2]
            s_after = samples[3] 
            dones = samples[4]
            
            rewards_next = np.max(Target_CNN(s_after), axis=1)
            actual_values = np.where(dones, rewards, rewards + GAMMA*rewards_next)
            selected_actions = tf.one_hot(actions - 1, ae.ACTIONS_DIMENSION)
            
            # Update online CNN weights: training step
            for _ in range(DYNAMIC_LEARNING_EPOCHS):             
                loss_value, grads = grad(Online_CNN, s_before, actual_values, selected_actions)
                optimizer.apply_gradients(zip(grads, Online_CNN.trainable_variables))

        # If move is successful, update the play field
        if (successful_move_flag):
            field = np.copy(new_field)
            
        # After each UPDATE_TARGET_NET moves update target CNN
        if (total_moves % UPDATE_TARGET_NET == 0):
            Target_CNN.set_weights(Online_CNN.get_weights())
            
        total_moves = total_moves + 1

    #
    # Calculate and display overall stats
    #
    # Check whether we have new maximum score
    if (game_score > MAXIMUM_SCORE):
        print(f"New maximum: {game_score}, after {game} games.")
        MAXIMUM_SCORE = game_score
        
    # After each 500 games output average game score, average number of successful moves per game
    TOTAL_SCORE_500 = TOTAL_SCORE_500 + game_score
    TOTAL_SUCCESSFUL_MOVES_500 = TOTAL_SUCCESSFUL_MOVES_500 + successful_moves
    
    if ((game % 500 == 0) and (game > 0)):
        avg_score = TOTAL_SCORE_500 / 500
        TOTAL_SCORE_500 = 0.0
        
        avg_succ_moves = TOTAL_SUCCESSFUL_MOVES_500 / 500
        TOTAL_SUCCESSFUL_MOVES_500 = 0.0

        print(f"Games: {game}, last 500 games avg score: {avg_score}, avg of successful moves: {avg_succ_moves}, loss {loss_value.numpy()}")        
        print(f"CNN made {CNN_MOVES_COUNT} moves. Successful were {CNN_SUCCESSFUL_PREDICTION}")
        
        if (CNN_SUCCESSFUL_PREDICTION / CNN_MOVES_COUNT >= CNN_MOVE_PROB):
            CNN_MOVE_PROB = CNN_MOVE_PROB + 0.1
            
        CNN_MOVES_COUNT = 0
        CNN_SUCCESSFUL_PREDICTION = 0


New maximum: 50, after 0 games.
New maximum: 78, after 1 games.
New maximum: 98, after 4 games.
New maximum: 131, after 7 games.
New maximum: 147, after 8 games.
New maximum: 184, after 44 games.
Games: 500, last 500 games avg score: 58.97, avg of successful moves: 8.948, loss 19.484375
CNN made 2557 moves. Successful were 152
Games: 1000, last 500 games avg score: 58.538, avg of successful moves: 8.972, loss 5.515625
CNN made 2559 moves. Successful were 144
Games: 1500, last 500 games avg score: 58.956, avg of successful moves: 9.026, loss 12.796875
CNN made 2481 moves. Successful were 156
New maximum: 189, after 1701 games.
New maximum: 206, after 1921 games.
Games: 2000, last 500 games avg score: 58.908, avg of successful moves: 9.026, loss 19.046875
CNN made 2539 moves. Successful were 144
Games: 2500, last 500 games avg score: 61.652, avg of successful moves: 9.332, loss 9.765625
CNN made 2462 moves. Successful were 151
Games: 3000, last 500 games avg score: 58.178, avg of success

Games: 30000, last 500 games avg score: 60.38, avg of successful moves: 9.178, loss 9.875
CNN made 2441 moves. Successful were 142
Games: 30500, last 500 games avg score: 59.166, avg of successful moves: 8.938, loss 14.421875
CNN made 2542 moves. Successful were 131
Games: 31000, last 500 games avg score: 57.924, avg of successful moves: 9.094, loss 6.796875
CNN made 2521 moves. Successful were 157
Games: 31500, last 500 games avg score: 58.674, avg of successful moves: 9.2, loss 11.296875
CNN made 2440 moves. Successful were 146
Games: 32000, last 500 games avg score: 59.18, avg of successful moves: 8.986, loss 6.09375
CNN made 2564 moves. Successful were 135
Games: 32500, last 500 games avg score: 59.81, avg of successful moves: 9.214, loss 4.140625
CNN made 2496 moves. Successful were 161
Games: 33000, last 500 games avg score: 60.91, avg of successful moves: 9.36, loss 1.9375
CNN made 2486 moves. Successful were 137
Games: 33500, last 500 games avg score: 60.996, avg of successful 

Games: 60500, last 500 games avg score: 60.968, avg of successful moves: 9.402, loss 13.46875
CNN made 2497 moves. Successful were 150
Games: 61000, last 500 games avg score: 58.318, avg of successful moves: 9.054, loss 6.8125
CNN made 2544 moves. Successful were 152
Games: 61500, last 500 games avg score: 58.974, avg of successful moves: 9.274, loss 8.515625
CNN made 2398 moves. Successful were 139
Games: 62000, last 500 games avg score: 57.956, avg of successful moves: 9.068, loss 50.5
CNN made 2456 moves. Successful were 139
Games: 62500, last 500 games avg score: 59.756, avg of successful moves: 9.136, loss 13.296875
CNN made 2491 moves. Successful were 147
New maximum: 246, after 62638 games.
Games: 63000, last 500 games avg score: 59.394, avg of successful moves: 9.152, loss 7.1875
CNN made 2459 moves. Successful were 124
Games: 63500, last 500 games avg score: 59.03, avg of successful moves: 9.0, loss 2.875
CNN made 2427 moves. Successful were 134
Games: 64000, last 500 games av

In [None]:
# tensorboard --logdir=./logs --bind_all &

print(MAXIMUM_SCORE)

## Test part

In [None]:
#field = np.zeros((7, 6))

df = pd.DataFrame(updated_field)
qgrid_widget = qgrid.show_grid(df, show_toolbar=True)
qgrid_widget

In [None]:
updated_df = qgrid_widget.get_changed_df()
updated_field = updated_df.values
visualize_field(updated_field)

In [None]:
get_sets(updated_field)

In [None]:
score, new_field = calculate_score_v2(updated_field, (6, 3), (5, 3))
print(score)
visualize_field(updated_field)

In [None]:
#updated_field_2 = fill_field(updated_field, colors)
#visualize_field(updated_field_2)

In [None]:
def loss(model, X, y_true, A):
    # Модель предсказывает награду для каждого из 142 ходов
    prediction = model(X)
    
    # Выбираем предсказания только для реально сделанных ходов
    selected_action_values = tf.math.reduce_sum(prediction*A, axis=1) 
    
    
    return tf.keras.losses.MSE(y_true, selected_action_values)


def grad(model, inputs, targets, actions):
    with tf.GradientTape() as tape:
        loss_value = loss(model, inputs, targets, actions)
    return loss_value, tape.gradient(loss_value, model.trainable_variables)



# Моедль предсказывает результаты как бы следующего хода за текущим
rewards_next = np.max(Target_CNN(s_after), axis=1)

actual_values = np.where(dones, rewards, rewards + GAMMA*rewards_next)

# Кодируем только реально сделанные сейчас ходы
selected_actions = tf.one_hot(actions, ae.ACTIONS_DIMENSION)


# Update online CNN weights: training step
for _ in range(DYNAMIC_LEARNING_EPOCHS):             
    loss_value, grads = grad(Online_CNN, s_before, actual_values, selected_actions)
    optimizer.apply_gradients(zip(grads, Online_CNN.trainable_variables))

In [60]:
actual_values.shape

(64,)

In [59]:
tf.one_hot(actions - 1, ae.ACTIONS_DIMENSION)[0]

<tf.Tensor: shape=(142,), dtype=float32, numpy=
array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.], dtype=float32)>

In [None]:
new_field = make_move(field, move)
print(new_field)

In [None]:
temp_field = calculate_score(new_field)

In [None]:
new_field = np.multiply(new_field, 1.0 - temp_field)

In [None]:
fill_field(new_field, colors)

In [30]:
??ae