**IMPORTS**

In [25]:
import aux_functions
import importlib

importlib.reload(aux_functions)
from aux_functions import *


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch
#from torchvision import datasets, transforms
from torch.utils.data import DataLoader, ConcatDataset, SubsetRandomSampler
# For masking
from torch.masked import masked_tensor

import numpy as np
import chess
from datetime import datetime
import sklearn
from sklearn.model_selection import KFold

**DATA PROCESSING**

- Importing the pgn data
- Transforming the data to sparce tensors 
- Splitting the data into training and testing

In [26]:
TEST_PERCENT = 0.25

# Load pgn paths
pgns = import_data(5)

# Convert pgns to tensors
board_tensors, next_moves = parse_pgn_to_tensors(pgns)

# Converting the dataset into a custom pytorch one
dataset = ChessDataset(board_tensors, next_moves)

torch.manual_seed(0)
# Splitting the data into train and test
train_dataset, test_data = torch.utils.data.random_split(dataset, [1-TEST_PERCENT, TEST_PERCENT])

print(len(test_data))  # Number of states
print(train_dataset.indices)

845
[414, 235, 1355, 1009, 437, 1064, 2105, 260, 2175, 81, 291, 79, 728, 2653, 2508, 1510, 2518, 2965, 2428, 1621, 2124, 763, 1293, 797, 1454, 1299, 443, 686, 967, 2616, 231, 283, 2045, 49, 2573, 717, 2412, 2931, 1205, 376, 678, 3326, 2258, 986, 1175, 861, 929, 1622, 3254, 3260, 3038, 408, 705, 1465, 949, 1752, 3331, 3113, 222, 365, 879, 128, 3065, 1309, 1241, 3233, 284, 2849, 2422, 2786, 1131, 1511, 92, 1568, 114, 2332, 2917, 1498, 1287, 1329, 126, 2037, 1326, 1383, 995, 1641, 3007, 3150, 293, 671, 2797, 206, 974, 3096, 2249, 229, 2462, 4, 2707, 1108, 35, 1747, 3098, 3055, 442, 1006, 2830, 2053, 393, 2667, 2633, 883, 1056, 1983, 1274, 1399, 3317, 100, 2969, 1744, 3337, 62, 1575, 2861, 1433, 715, 719, 2141, 2795, 2809, 446, 2584, 2959, 2831, 470, 1093, 2034, 381, 2264, 1448, 683, 1388, 1432, 1123, 1047, 941, 2257, 976, 959, 699, 790, 767, 2064, 2994, 2568, 2367, 833, 3168, 1400, 1325, 2240, 277, 2949, 1449, 2513, 3061, 1336, 1401, 3335, 290, 3339, 1367, 2817, 3114, 1415, 787, 2087, 230

**NEURAL NETWORK DESIGN**
- 2 Convolutional layers
- 2 Fully connected hidden layers

In [27]:
# Whether to do the operations on the cpu or gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class PieceToMoveNet(nn.Module):
    def __init__(self):
        super().__init__()

        # Takes as input a tensor of 14 channels (8x8 board)
        self.conv1 = nn.Conv2d(14, 6, 3)  # 6 filters, 3x3 kernel
        # self.pool = nn.MaxPool2d(2, 2)    # Max pooling with 2x2 window
        self.conv2 = nn.Conv2d(6, 16, 3)  # 16 filters, 3x3 kernel

        # Using droput to reduce overfitting
        self.dropout = nn.Dropout(p=0.3)
        # Using batch normalization to make training faster and more stable
        self.bn1 = nn.BatchNorm1d(240)  # For the 1st layer
        self.bn2 = nn.BatchNorm1d(120)   # For the 2nd layer
        
        # Output from conv2 will be (16 channels, 1x1 feature maps)
        self.fc1 = nn.Linear(16 * 1 * 1, 240)
        self.fc2 = nn.Linear(240, 120)
        # Predicts the tile to move the piece from (64 possible tiles on the board)
        self.fc3 = nn.Linear(120, 64)

    def forward(self, x):
        # x = self.pool(F.relu(self.conv1(x)))  # Apply first conv + pooling
        x = F.reul(self.conv1(x))
        x = F.relu(self.conv2(x))             # Apply second conv to get (16 x 1 x 1)
        x = torch.flatten(x, 1)               # Flatten all dimensions except batch size
        x = F.relu(self.bn1(self.fc1(x)))     # Fully connected layer 1 and batch normalization
        x = self.dropout(x)                   # Dropout of some first layer neurons
        x = F.relu(self.bn2(self.fc2(x)))     # Fully connected layer 2
        x = self.fc3(x)                       # Output layer (no activation, logits for classification)
        return x


# Initializing the network
piece_to_move_net = PieceToMoveNet()
# Move the network to gpu/cpu befor initializing the optimizer
piece_to_move_net.to(device)

# Adam optimizer will be used due to its versatility
optimizer = optim.Adam(piece_to_move_net.parameters(), lr=1e-4, weight_decay=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

**TRAINING LOOP**

In [28]:
def generate_mask(tensor)-> list:
    """Generates a mask which contains the position of the pieces that can move"""

    # Contain a 2D representation of the board
    # board_mask = torch.zeros((8,8))

    # If layer 12 has any 1 it will be whites turn
    if  torch.any(tensor[12] == 1):
        # White pieces are in layers 0 to 5, apply a mask which will be 1 when there is a one
        mask = torch.sum(tensor[0:6], dim = 0).int() # for summing across the layers


    # If layer 13 has any 1 it will be blacks turn
    elif torch.any(tensor[13] == 1):
        # Black pieces are in range 6 to 11
        # Apply a mask, if there is a piece it will be a 1
        mask = torch.sum(tensor[6:12], dim = 0).int()

    # A position wion zero value means a piece is present
    # board_mask[mask > 0] = 1

    # Flatten the board to a 1D array
    # board_mask = board_mask.flatten().tolist()

    return mask.flatten().tolist()



In [29]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")


def train_epoch(model, optimizer, train_loader, loss_fn, train_sampler_size): 
    """
    Trains the model for one epoch and returns the average training loss and accuracy.
    """

    running_loss = 0.  
    running_correct = 0.

    # Looping through all samples in a batch
    for i, data in enumerate(train_loader):
        # Extracting the board tensor
        inputs = data[0]
        # Extracting the tile of the piece to move
        labels = data[1]

        # Resetting the gradients
        optimizer.zero_grad()

        # Calculating model's output
        mask = [generate_mask(pos) for pos in inputs]
        mask = torch.tensor(mask)
        mask = mask.to(device)

        """print(inputs[0])
        print(labels[0])
        print(mask[0])"""

        # Moving inputs and labels to the gpu/cpu
        inputs = inputs.to(device)
        labels = labels.to(device)

        logits = model(inputs)
        outputs = logits * mask.float()

        # Calculating the sample loss
        loss = loss_fn(outputs, labels)

        # Calculating the gradient
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Updating model parameters
        optimizer.step()

        # Adding the last loss to the running loss
        running_loss += loss.item()

        # Calculate number of correct predictions
        _, predictions = torch.max(outputs.data, 1)
        running_correct += (predictions == labels).sum().item()

    # Averaging the loss for all samples in the batch
    running_loss /= (i + 1)

    # Calculate accuracy based on the total samples in the fold (train_sampler_size)
    train_accuracy = running_correct / train_sampler_size

    return running_loss, train_accuracy


def validation_epoch(model, validation_loader, loss_fn, val_sampler_size):
    """
    Validates the model for one epoch and returns the average validation loss and accuracy.
    """

    running_vloss = 0.
    running_vcorrect = 0.

    # Set model to evaluation mode
    model.eval()

    # Disable gradient calculations for validation set
    with torch.no_grad():
        # Looping through all batches in the validation set
        for i, v_data in enumerate(validation_loader):
            # Getting the tensors of the validation data
            vinputs = v_data[0]
            vlabels = v_data[1] 

            # Calculating the output of the model
            mask = [generate_mask(pos) for pos in vinputs]
            mask = torch.tensor(mask)
            mask = mask.to(device)

            # Moving inputs and labels to the gpu/cpu
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device)

            logits = model(vinputs)
            voutputs = logits * mask.float()

            # Calculating the loss of the model in the validation sample
            vloss = loss_fn(voutputs, vlabels)
            # Adding this sample's loss to the total loss
            running_vloss += vloss.item()

            # Calculate number of correct predictions
            _, predictions = torch.max(voutputs.data, 1)
            running_vcorrect += (predictions == vlabels).sum().item()

    # Averaging the loss for all samples in the validation set
    running_vloss /= (i + 1)

    # Calculate accuracy based on the total samples in the fold (val_sampler_size)
    validation_accuracy = running_vcorrect / val_sampler_size

    return running_vloss, validation_accuracy


def train_multiple_folds(n_epochs, n_folds, batch_size, splits, writer, optimizer_class, optimizer_params, loss_fn):

    best_vloss = 1_000

    epochs_tloss = [0 for _ in range(n_epochs)]
    epochs_tacc = [0 for _ in range(n_epochs)]
    epochs_vloss = [0 for _ in range(n_epochs)]
    epochs_vacc = [0 for _ in range(n_epochs)]

    for fold, (train_idx, val_idx) in enumerate(splits.split(np.arange(len(train_dataset)))):
        print(f"FOLD {fold+1}")

        train_sampler = SubsetRandomSampler(train_idx)
        val_sampler = SubsetRandomSampler(val_idx)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
        val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler)

        model = PieceToMoveNet()

        optimizer = optimizer_class(model.parameters(), **optimizer_params)
        model.to(device)

        train_sampler_size = len(train_sampler)
        val_sampler_size = len(val_sampler)
        avg_vloss = 0.

        for epoch in range(n_epochs): 
            train_loss, train_correct = train_epoch(model, optimizer, train_loader, loss_fn, train_sampler_size)
            val_loss, val_correct = validation_epoch(model, val_loader, loss_fn, val_sampler_size)

            avg_vloss += val_loss
        
            epochs_tloss[epoch] += train_loss
            epochs_tacc[epoch] += train_correct
            epochs_vloss[epoch] += val_loss
            epochs_vacc[epoch] += val_correct

            print(f"Epoch: {epoch} Train Loss: {train_loss}, Valid Loss: {val_loss} | Train Acc: {train_correct}, Valid Acc: {val_correct}")

        avg_vloss /= (epoch + 1)

        # Saving the model if the loss on the validation is lower than the best one
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = f"models/piece_to_move_net_{timestamp}_{fold}"
            torch.save(model.state_dict(), model_path)

    for i in range(n_epochs):
        epochs_tloss[i] /= (n_folds)
        epochs_tacc[i] /= (n_folds)
        epochs_vloss[i] /= (n_folds)
        epochs_vacc[i] /= (n_folds)

    for i in range(n_epochs):
        # Adding insights
        writer.add_scalars("Loss", {"Training": epochs_tloss[i], "Validation": epochs_vloss[i]}, i + 1)
        writer.add_scalars("Accuracy", {"Training": epochs_tacc[i], "Validation": epochs_vacc[i]}, i + 1)
        writer.flush()


In [30]:
EPOCHS = 25
BATCH_SIZE = 32
K = 3
# Logs training statistics for TensorBoard visualization
writer = SummaryWriter(f"runs/piece_to_move_{timestamp}")  
splits = KFold(n_splits=K, shuffle=True, random_state=42)

optimizer_class = optim.Adam
optimizer_params = {
    "lr": 1e-4,
    "weight_decay": 1e-5
}

train_multiple_folds(EPOCHS, K, BATCH_SIZE, splits, writer, optimizer_class, optimizer_params, loss_fn)

FOLD 1
Epoch: 0 Train Loss: 4.110798169981758, Valid Loss: 4.131084865993923 | Train Acc: 0.10999408633944412, Valid Acc: 0.0851063829787234
Epoch: 1 Train Loss: 4.054035785063258, Valid Loss: 3.999702533086141 | Train Acc: 0.11945594322885866, Valid Acc: 0.12293144208037825
Epoch: 2 Train Loss: 3.87433042166368, Valid Loss: 3.788986320848818 | Train Acc: 0.15493790656416323, Valid Acc: 0.1453900709219858
Epoch: 3 Train Loss: 3.5317449254809685, Valid Loss: 3.3594006609033653 | Train Acc: 0.17031342400946187, Valid Acc: 0.17257683215130024
Epoch: 4 Train Loss: 3.0377993223802098, Valid Loss: 2.89467273818122 | Train Acc: 0.1862803075103489, Valid Acc: 0.18085106382978725
Epoch: 5 Train Loss: 2.6316502679069087, Valid Loss: 2.54697377593429 | Train Acc: 0.19396806623299823, Valid Acc: 0.18439716312056736
Epoch: 6 Train Loss: 2.407414305884883, Valid Loss: 2.406557701252125 | Train Acc: 0.20697811945594322, Valid Acc: 0.20212765957446807
Epoch: 7 Train Loss: 2.312703146124786, Valid Loss