**IMPORTS**

In [39]:
import aux_functions
import importlib

importlib.reload(aux_functions)
from aux_functions import *


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
import torch
#from torchvision import datasets, transforms
from torch.utils.data import DataLoader, ConcatDataset, SubsetRandomSampler
# For masking
from torch.masked import masked_tensor

import numpy as np
import chess
from datetime import datetime
import sklearn
from sklearn.model_selection import KFold

**DATA PROCESSING**

- Importing the pgn data
- Transforming the data to sparce tensors 
- Splitting the data into training and testing

In [40]:
TEST_PERCENT = 0.25

# Load pgn paths
pgns = import_data(1)

# Convert pgns to tensors
board_tensors, next_moves = parse_pgn_to_tensors(pgns)

# Converting the dataset into a custom pytorch one
dataset = ChessDataset(board_tensors, next_moves)

torch.manual_seed(0)
# Splitting the data into train and test
train_dataset, test_data = torch.utils.data.random_split(dataset, [1-TEST_PERCENT, TEST_PERCENT])

print(len(test_data))  # Number of states
print(train_dataset.indices)

147
[509, 580, 420, 75, 519, 102, 373, 494, 278, 366, 383, 97, 348, 56, 83, 359, 47, 341, 344, 260, 242, 590, 146, 141, 455, 340, 60, 524, 558, 311, 217, 563, 99, 237, 235, 100, 583, 176, 214, 487, 577, 84, 409, 208, 178, 13, 554, 514, 185, 552, 295, 440, 181, 400, 407, 575, 584, 553, 263, 110, 64, 397, 142, 561, 408, 42, 65, 115, 121, 559, 226, 276, 4, 14, 73, 560, 470, 331, 11, 68, 370, 51, 253, 316, 177, 254, 445, 53, 17, 513, 589, 174, 108, 169, 288, 399, 427, 8, 259, 36, 193, 127, 157, 106, 488, 255, 234, 398, 551, 35, 82, 475, 390, 285, 145, 385, 585, 339, 175, 67, 159, 484, 508, 292, 451, 207, 250, 244, 120, 130, 422, 342, 229, 231, 143, 256, 301, 90, 456, 15, 432, 568, 305, 569, 95, 549, 34, 536, 482, 424, 567, 109, 522, 466, 134, 16, 467, 544, 265, 81, 550, 537, 395, 469, 122, 468, 372, 86, 43, 191, 379, 57, 160, 303, 306, 201, 542, 299, 528, 417, 116, 349, 382, 393, 94, 22, 539, 118, 461, 435, 168, 0, 71, 196, 165, 571, 418, 586, 345, 351, 576, 389, 156, 273, 471, 352, 79, 55

**NEURAL NETWORK DESIGN**
- 2 Convolutional layers
- 2 Fully connected hidden layers

In [41]:
# Whether to do the operations on the cpu or gpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class PieceToMoveNet(nn.Module):
    def __init__(self):
        super().__init__()

        # Takes as input a tensor of 14 channels (8x8 board)
        self.conv1 = nn.Conv2d(14, 6, 3)  # 6 filters, 3x3 kernel
        self.pool = nn.MaxPool2d(2, 2)    # Max pooling with 2x2 window
        self.conv2 = nn.Conv2d(6, 16, 3)  # 16 filters, 3x3 kernel

        # Using droput to reduce overfitting
        self.dropout = nn.Dropout(p=0.3)
        # Using batch normalization to make training faster and more stable
        self.bn1 = nn.BatchNorm1d(120)  # For the 1st layer
        self.bn2 = nn.BatchNorm1d(84)   # For the 2nd layer
        
        # Output from conv2 will be (16 channels, 1x1 feature maps)
        self.fc1 = nn.Linear(16 * 1 * 1, 120)
        self.fc2 = nn.Linear(120, 84)
        # Predicts the tile to move the piece from (64 possible tiles on the board)
        self.fc3 = nn.Linear(84, 64)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # Apply first conv + pooling
        x = F.relu(self.conv2(x))             # Apply second conv to get (16 x 1 x 1)
        x = torch.flatten(x, 1)               # Flatten all dimensions except batch size
        x = F.relu(self.bn1(self.fc1(x)))     # Fully connected layer 1 and batch normalization
        x = self.dropout(x)                   # Dropout of some first layer neurons
        x = F.relu(self.bn2(self.fc2(x)))     # Fully connected layer 2
        x = self.fc3(x)                       # Output layer (no activation, logits for classification)
        return x


# Initializing the network
piece_to_move_net = PieceToMoveNet()
# Move the network to gpu/cpu befor initializing the optimizer
piece_to_move_net.to(device)

# Adam optimizer will be used due to its versatility
optimizer = optim.Adam(piece_to_move_net.parameters(), lr=1e-4, weight_decay=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()

**TRAINING LOOP**

In [45]:
def generate_mask(tensor)-> list:
    """Generates a mask which contains the position of the pieces that can move"""

    # Contain a 2D representation of the board
    # board_mask = torch.zeros((8,8))

    # If layer 12 has any 1 it will be whites turn
    if  torch.any(tensor[12] == 1):
        # White pieces are in layers 0 to 5, apply a mask which will be 1 when there is a one
        mask = torch.sum(tensor[0:6], dim = 0) # for summing across the layers


    # If layer 13 has any 1 it will be blacks turn
    elif torch.any(tensor[13] == 1):
        # Black pieces are in range 6 to 11
        # Apply a mask, if there is a piece it will be a 1
        mask = torch.sum(tensor[6:12], dim = 0)


    # A position wion zero value means a piece is present
    # board_mask[mask > 0] = 1

    # Flatten the board to a 1D array
    # board_mask = board_mask.flatten().tolist()

    return mask.flatten().tolist()



In [46]:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")


def train_epoch(model, optimizer, train_loader, loss_fn, train_sampler_size): 
    """
    Trains the model for one epoch and returns the average training loss and accuracy.
    """

    running_loss = 0.  
    running_correct = 0.

    # Looping through all samples in a batch
    for i, data in enumerate(train_loader):
        # Extracting the board tensor
        inputs = data[0]
        # Extracting the tile of the piece to move
        labels = data[1]

        # Resetting the gradients
        optimizer.zero_grad()

        # Calculating model's output
        mask = [generate_mask(pos) for pos in inputs]
        mask = torch.tensor(mask)
        mask = mask.to(device)

        """print(inputs[0])
        print(labels[0])
        print(mask[0])"""

        # Batch size 
        for j in range(len(mask)):
            for h in range(len(labels)):
                piece = mask[j, labels[h]]
                if piece == 0:
                    print(inputs[j])
                    print(labels[h])
                    print(mask[j])


        # Moving inputs and labels to the gpu/cpu
        inputs = inputs.to(device)
        labels = labels.to(device)

        logits = model(inputs)
        outputs = logits * mask.float()

        # Calculating the sample loss
        loss = loss_fn(outputs, labels)

        # Calculating the gradient
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        # Updating model parameters
        optimizer.step()

        # Adding the last loss to the running loss
        running_loss += loss.item()

        # Calculate number of correct predictions
        _, predictions = torch.max(outputs.data, 1)
        running_correct += (predictions == labels).sum().item()

    # Averaging the loss for all samples in the batch
    running_loss /= (i + 1)

    # Calculate accuracy based on the total samples in the fold (train_sampler_size)
    train_accuracy = running_correct / train_sampler_size

    return running_loss, train_accuracy


def validation_epoch(model, validation_loader, loss_fn, val_sampler_size):
    """
    Validates the model for one epoch and returns the average validation loss and accuracy.
    """

    running_vloss = 0.
    running_vcorrect = 0.

    # Set model to evaluation mode
    model.eval()

    # Disable gradient calculations for validation set
    with torch.no_grad():
        # Looping through all batches in the validation set
        for i, v_data in enumerate(validation_loader):
            # Getting the tensors of the validation data
            vinputs = v_data[0]
            vlabels = v_data[1] 

            # Calculating the output of the model
            mask = [generate_mask(pos) for pos in vinputs]
            mask = torch.tensor(mask)
            mask = mask.to(device)

            # Moving inputs and labels to the gpu/cpu
            vinputs = vinputs.to(device)
            vlabels = vlabels.to(device)

            logits = model(vinputs)
            voutputs = logits * mask.float()

            # Calculating the loss of the model in the validation sample
            vloss = loss_fn(voutputs, vlabels)
            # Adding this sample's loss to the total loss
            running_vloss += vloss.item()

            # Calculate number of correct predictions
            _, predictions = torch.max(voutputs.data, 1)
            running_vcorrect += (predictions == vlabels).sum().item()

    # Averaging the loss for all samples in the validation set
    running_vloss /= (i + 1)

    # Calculate accuracy based on the total samples in the fold (val_sampler_size)
    validation_accuracy = running_vcorrect / val_sampler_size

    return running_vloss, validation_accuracy


def train_multiple_folds(n_epochs, batch_size, model, splits, writer, optimizer, loss_fn):

    best_fold_vloss = 1_000

    for fold, (train_idx, val_idx) in enumerate(splits.split(np.arange(len(train_dataset)))):
        print(f"FOLD {fold+1}")

        avg_tloss = 0.
        avg_tacc = 0.
        avg_vloss = 0.
        avg_vacc = 0.

        train_sampler = SubsetRandomSampler(train_idx)
        val_sampler = SubsetRandomSampler(val_idx)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=train_sampler)
        val_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=val_sampler)

        model.to(device)

        train_sampler_size = len(train_sampler)
        val_sampler_size = len(val_sampler)

        for epoch in range(n_epochs): 
            train_loss, train_correct = train_epoch(piece_to_move_net, optimizer, train_loader, loss_fn, train_sampler_size)
            val_loss, val_correct = validation_epoch(piece_to_move_net, val_loader, loss_fn, val_sampler_size)

            avg_tloss += train_loss
            avg_tacc += train_correct
            avg_vloss += val_loss
            avg_vacc += val_correct

            print(f"Epoch: {epoch} Train Loss: {train_loss}, Valid Loss: {val_loss} | Train Acc: {train_correct}, Valid Acc: {val_correct}")


        avg_tloss /= (epoch + 1)
        avg_tacc /= (epoch + 1)
        avg_vloss /= (epoch + 1)
        avg_vacc /= (epoch + 1)

        # Adding insights
        writer.add_scalars("Loss", {"Training": avg_tloss, "Validation": avg_vloss}, fold + 1)
        writer.add_scalars("Accuracy", {"Training": avg_tacc, "Validation": avg_vacc}, fold + 1)
        writer.flush()

        # Saving the model if the loss on the validation is lower than the best one
        if avg_vloss < best_fold_vloss:
            best_fold_vloss = avg_vloss
            model_path = f"models/piece_to_move_net_{timestamp}_{fold}"
            torch.save(model.state_dict(), model_path)


EPOCHS = 10
BATCH_SIZE = 32
K = 5
# Logs training statistics for TensorBoard visualization
writer = SummaryWriter(f"runs/piece_to_move_{timestamp}")  
splits = KFold(n_splits=K, shuffle=True, random_state=42)

train_multiple_folds(EPOCHS, BATCH_SIZE, piece_to_move_net, splits, writer, optimizer, loss_fn)


FOLD 1


UnboundLocalError: cannot access local variable 'mask' where it is not associated with a value