In [None]:
import pandas as pd
import numpy as np
import os

**Splitting the 13-million-line file into smaller files**

In [None]:

def split_csv(input_file, output_folder, num_files):
    """Split a CSV file into ``num_files`` smaller CSV files.

    The whole input is read into memory and cut into consecutive row ranges
    of near-equal size. Every input row is written to exactly one output
    file: when the row count is not divisible by ``num_files`` the first
    ``len(df) % num_files`` chunks each receive one extra row, so no
    trailing rows are lost.

    Output files are named ``split1.csv`` ... ``split<num_files>.csv`` and
    are written without the DataFrame index.

    Args:
        input_file: Path of the CSV file to split.
        output_folder: Directory for the output files; created if missing.
        num_files: Number of output files to produce (must be >= 1).

    Raises:
        ValueError: If ``num_files`` is smaller than 1.
    """
    if num_files < 1:
        raise ValueError(f'num_files must be at least 1, got {num_files}')

    os.makedirs(output_folder, exist_ok=True)

    df = pd.read_csv(input_file)

    # Spread the remainder over the first chunks instead of discarding it.
    base_size, remainder = divmod(len(df), num_files)

    start = 0
    for i in range(num_files):
        stop = start + base_size + (1 if i < remainder else 0)
        output_file = os.path.join(output_folder, f'split{i+1}.csv')
        df.iloc[start:stop].to_csv(output_file, index=False)
        print(f'Chunk {i+1} written to {output_file}')
        start = stop

**Data pre-processing**

**Converting FEN to 1D Integer Array**

In [None]:
def castling_rights_to_integer(fen_castling):
    """Pack the FEN castling field into a 4-bit integer.

    Bit layout, most- to least-significant: ``K``, ``Q``, ``k``, ``q``.
    Characters other than those four (including ``-``) contribute nothing,
    so a field with no castling rights encodes as 0.
    """
    flag_bits = (
        ('K', 0b1000),
        ('Q', 0b0100),
        ('k', 0b0010),
        ('q', 0b0001),
    )
    # The bits are disjoint, so summing them is the same as OR-ing them.
    return sum(bit for symbol, bit in flag_bits if symbol in fen_castling)

def en_passant_to_integer(fen_en_passant):
    """Encode the FEN en-passant field as a small integer.

    Returns:
        -1 when the field is ``'-'`` (no en-passant square). Otherwise a
        4-bit value: the low three bits hold the file index (``a`` = 0 ...
        ``h`` = 7) and bit 3 (``0b1000``) is set when the field contains
        ``'3'``; for rank ``'6'`` it stays clear.

    A field with no recognised file letter encodes as file 0, the same as
    file ``a``.
    """
    if fen_en_passant == '-':
        return -1

    # First matching letter in a..h order wins.
    file_bits = 0
    for index, letter in enumerate('abcdefgh'):
        if letter in fen_en_passant:
            file_bits = index
            break

    rank_bit = 0b1000 if '3' in fen_en_passant else 0b0000

    return file_bits | rank_bit

def fen_to_encoded(fen):
    """Encode a FEN string as a flat integer vector of length 69.

    Layout of the returned array:
        * 0-63: the board, in the order the FEN lists it (first rank field
          first, files left to right). 0 is an empty square; lowercase
          pieces map to 1..6 (pawn, knight, bishop, rook, queen, king) and
          uppercase pieces to the corresponding negative values.
        * 64: side to move, 1 for ``"w"`` and -1 otherwise.
        * 65: castling rights from ``castling_rights_to_integer``.
        * 66: en-passant square from ``en_passant_to_integer``.
        * 67: halfmove clock.
        * 68: fullmove number.

    Args:
        fen: FEN string with all six space-separated fields.

    Returns:
        1-D ``numpy`` integer array with 69 entries.

    Raises:
        ValueError: If the FEN does not have exactly six fields or a clock
            field is not an integer.
        KeyError: If the board field contains an unknown piece character.
        IndexError: If the board field describes more than 8 ranks or files.
    """
    piece_mapping = {
        'p': 1, 'P': -1,  # pawn
        'n': 2, 'N': -2,  # knight
        'b': 3, 'B': -3,  # bishop
        'r': 4, 'R': -4,  # rook
        'q': 5, 'Q': -5,  # queen
        'k': 6, 'K': -6,  # king
        '.': 0,  # empty square (not standard FEN; kept for compatibility)
    }

    fields = fen.split()
    if len(fields) != 6:
        raise ValueError(
            f'expected 6 space-separated FEN fields, got {len(fields)}: {fen!r}'
        )
    board_state, turn, castling, en_passant, halfmove_clock, fullmove_number = fields

    encoded_board = np.zeros((8, 8), dtype=int)
    rank_index = 0
    file_index = 0

    for char in board_state:
        if char == '/':
            # Rank separator: move to the next row, back to the first file.
            rank_index += 1
            file_index = 0
        elif char.isdigit():
            # A digit is a run of empty squares; they are already 0.
            file_index += int(char)
        else:
            encoded_board[rank_index, file_index] = piece_mapping[char]
            file_index += 1

    # Convert the clocks explicitly so the vector never passes through a
    # string dtype, and build the result with a single concatenation.
    trailer = np.array(
        [
            1 if turn == "w" else -1,
            castling_rights_to_integer(castling),
            en_passant_to_integer(en_passant),
            int(halfmove_clock),
            int(fullmove_number),
        ],
        dtype=int,
    )
    return np.concatenate([encoded_board.ravel(), trailer])
    

 **Load Dataset**

In [None]:
def load_dataset(filepath):
    """Read an evaluations CSV and return its FEN and Evaluation columns.

    The shape of the loaded frame is printed as a quick sanity check.

    Args:
        filepath: Path of a CSV file with ``FEN`` and ``Evaluation`` columns.

    Returns:
        Tuple ``(fens, evaluations)`` of pandas Series.
    """
    frame = pd.read_csv(filepath)
    print(frame.shape)

    fens, evaluations = (frame[column] for column in ("FEN", "Evaluation"))
    return fens, evaluations

**Replacing checkmate evaluations with the minimum and maximum possible evaluations**

In [None]:
import re

def process_evaluations(y_chess_data, min_eval=-7658, max_eval=7881):
    """Convert raw evaluation values to a list of integers.

    Forced-mate entries are replaced by fixed extreme scores: a value
    containing ``'#-'`` becomes ``min_eval`` and one containing ``'#+'``
    becomes ``max_eval``. Every other value is parsed as the first signed
    integer found in its text, so a leading ``'+'`` or stray non-digit
    characters are ignored.

    The check is done per element. Using ``"#-" in series`` would test the
    Series *index labels* rather than its values and never match.

    Args:
        y_chess_data: Iterable of evaluations (e.g. a pandas Series).
            Elements may be strings or already-numeric values.
        min_eval: Score substituted for ``'#-'`` entries.
        max_eval: Score substituted for ``'#+'`` entries.

    Returns:
        List of Python ints, one per input element, in input order.

    Raises:
        ValueError: If an element is not a mate entry and contains no digits.
    """
    processed = []
    for raw in y_chess_data:
        # A chunk without mate entries may be parsed as numeric by read_csv.
        text = str(raw)
        if '#-' in text:
            processed.append(min_eval)
        elif '#+' in text:
            processed.append(max_eval)
        else:
            match = re.search(r'-?\d+', text)
            if match is None:
                raise ValueError(f'no integer found in evaluation {raw!r}')
            processed.append(int(match.group()))
    return processed


**Applying the FEN conversion functions**

In [None]:


def process_row(row):
    """Encode one FEN string and wrap the resulting vector in a pandas Series."""
    return pd.Series(fen_to_encoded(row))


def process_FEN(X_chess_data):
    """Encode every FEN in a Series into one row of an integer DataFrame.

    Each element is passed to ``fen_to_encoded`` and the resulting vectors
    are stacked once into a 2-D array. This avoids creating a separate
    pandas Series per row, which is slow on large inputs.

    Args:
        X_chess_data: pandas Series of FEN strings.

    Returns:
        DataFrame with one row per input element, the same index as the
        input, and integer columns numbered from 0. An empty input yields
        an empty DataFrame carrying the input's index.
    """
    if len(X_chess_data) == 0:
        return pd.DataFrame(index=X_chess_data.index)

    encoded_rows = np.vstack([fen_to_encoded(fen) for fen in X_chess_data])
    return pd.DataFrame(encoded_rows, index=X_chess_data.index)



**Saving the data into a file**

In [None]:
def save_data_into_file(X_chess_data_processed,y_chess_data_processed,output_path):
    """Write encoded features and their evaluations side by side to a CSV.

    Both inputs get a fresh 0..n-1 index so they align row by row. Every
    feature column is labelled ``FEN`` and the final column ``Evaluation``.
    The number of ``FEN`` labels follows the width of the feature frame
    rather than a fixed count.

    Args:
        X_chess_data_processed: DataFrame of encoded features.
        y_chess_data_processed: One-dimensional evaluations (list, array
            or Series) with one entry per feature row.
        output_path: Destination CSV path; its parent directory is created
            if it does not exist.
    """
    x_file = X_chess_data_processed.reset_index(drop=True)
    y_file = pd.DataFrame(y_chess_data_processed).reset_index(drop=True)

    to_file = pd.concat([x_file, y_file], axis=1, ignore_index=False)
    to_file.columns = ['FEN'] * x_file.shape[1] + ['Evaluation']

    # to_csv does not create missing directories, so make sure it exists.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    to_file.to_csv(output_path, index=False, header=True)

In [None]:
import gc
    
# Pipeline configuration: the raw data file, the folder for the intermediate
# split files, the folder for the encoded output, and the number of splits.
input_file = 'data/chessData.csv'
output_folder_for_split_files = 'split_files'
output_folder = 'processed_data'
num_split_files = 13

split_csv(input_file, output_folder_for_split_files, num_split_files)

# DataFrame.to_csv does not create missing directories, so create the output
# folder up front rather than failing after the first chunk is processed.
os.makedirs(output_folder, exist_ok=True)

# Sort for a deterministic order and skip anything that is not a CSV file
# (the split folder may contain unrelated entries such as checkpoint folders).
for filename in sorted(os.listdir(output_folder_for_split_files)):
    if not filename.endswith('.csv'):
        continue

    # Release memory left over from the previous chunk before loading the next.
    gc.collect()

    filepath = os.path.join(output_folder_for_split_files,filename)

    X_chess_data,y_chess_data = load_dataset(filepath)

    X_chess_data_processed = process_FEN(X_chess_data)
    y_chess_data_processed = process_evaluations(y_chess_data)

    output_path = os.path.join(output_folder,filename)
    save_data_into_file(X_chess_data_processed,y_chess_data_processed,output_path)
    