### Chess modeling dataset EDA

In [28]:
# How many different tasks there are and what are their counts?
import json
from collections import Counter
from tqdm import tqdm

# List your 8 jsonl file names here
jsonl_files = [
    'chess_modeling-data.jsonl-00000-of-00008',
    'chess_modeling-data.jsonl-00001-of-00008',
    'chess_modeling-data.jsonl-00002-of-00008',
    'chess_modeling-data.jsonl-00003-of-00008',
    'chess_modeling-data.jsonl-00004-of-00008',
    'chess_modeling-data.jsonl-00005-of-00008',
    'chess_modeling-data.jsonl-00006-of-00008',
    'chess_modeling-data.jsonl-00007-of-00008'
]

# Set to store unique descriptions
unique_descriptions_cnt = Counter()

for filename in tqdm(jsonl_files):
    with open(filename, 'r') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                data = json.loads(line)
                # Extract the description from metadata (if available)
                description = data.get("metadata", {}).get("description", "")
                unique_descriptions_cnt[description] += 1

print("Number of unique descriptions:", unique_descriptions_cnt)
print("Total number of samples: ", sum(unique_descriptions_cnt.values()))

100%|██████████| 8/8 [00:07<00:00,  1.14it/s]

Number of unique descriptions: Counter({'Generate FEN given PGN': 293146, 'Generate FEN given UCI-format move list': 286832, 'Generate all legal moves in UCI format given the board PGN': 148308, 'Generate all legal moves in SAN format given the board PGN': 148301, 'Generate SAN move given UCI-format move and FEN': 147654, 'Generate UCI move given SAN-format move and FEN': 147649, 'Generate next FEN given FEN and SAN format move': 147629, 'Generate next FEN given FEN and UCI format move': 147589, 'Draw chess board given FEN': 146980, 'Generate all legal moves in SAN format given the board FEN': 143764, 'Generate all legal moves given a board FEN': 143694})
Total number of samples:  1901546





### Per question-type Chess modeling dataset preprocessing

In [27]:
# 1. Generate FEN given PGN
example = \
{
"metadata": 
    {"description": "Generate FEN given PGN"}, 
"text": 
    "Generate the FEN representation given the PGN of chess game: 1. e4 g6 2. d4 Bg7 3. c4 c5 4. Nc3 cxd4 5. Nd5 Nc6 6. Nf3 e6 7. Nf4 e5 8. Nd5 h6 9. Bd3 Nge7 10. O-O O-O 11. Bd2 d6 12. Qc1 Kh7 13. Ne1. The FEN is r1bq1r2/pp2npbk/2np2pp/3Np3/2PpP3/3B4/PP1B1PPP/R1Q1NRK1 b - - 3 13.", 
"pipeline_key": 
    "function_puzzle_v2.jsonl.zst-1122107"
}

def generate_fen_given_pgn(text: str):
    text = example['text']
    sentences = [sentence.strip() for sentence in text.split('.') if sentence.strip()]

    input_text = '. '.join(sentences[:-1])
    input_text += '.'
    last_sentence = sentences[-1]
    last_sentence += '.'

    assert text == input_text + " " + last_sentence
    
    return input_text, last_sentence


In [30]:
import chess

# 2. Generate FEN given UCI-format move list
# Remarks: Changed it so that we generate "SAN" notations, not UCI notations
example = \
{
"metadata": 
    {"description": 
        "Generate FEN given UCI-format move list"}, 
"text": 
    "It would be great if you could produce the FEN representation of the chess game based on the provided SAN based move list: e2e4 e7e5 d2d4 e5d4 c2c3 b8c6 f1c4 g8f6 d1f3 d7d6 g1e2 c8g4 f3d3 g4e2 d3e2 f8e7 e1g1 e8g8 c4d3 c6e5 c3d4 e5d3 e2d3 f8e8 b1c3 c7c6 f1e1 d6d5 e4e5 f6d7 d3g3 e7d6 c1h6 d6e5 d4e5 g7g6 h6g5 d8b6 h2h4 b6b2 a1b1 b2c2 e1e2 c2f5 b1b7 d7e5 f2f4 e5g4 e2e8. The FEN code obtained is r3R1k1/pR3p1p/2p3p1/3p1qB1/5PnP/2N3Q1/P5P1/6K1 b - - 0 25.", 
"pipeline_key": 
    "function_puzzle_v2.jsonl.zst-1056409"}

def convert_uci_to_san_type2(uci_moves_str):
    """
    Converts a space-separated string of UCI moves into a list of SAN moves.
    
    Parameters:
        uci_moves_str (str): A string of UCI moves separated by spaces.
        
    Returns:
        list: A list of moves in SAN notation.
    """
    board = chess.Board()  # Start with the initial board position
    uci_moves = uci_moves_str.split()
    san_moves = []
    for move_str in uci_moves:
        move = chess.Move.from_uci(move_str)
        # Convert the move to SAN notation based on the current board state
        san = board.san(move)
        san_moves.append(san)
        board.push(move)  # Update the board with the move
    return san_moves

def generate_fen_given_san_move_list(text: str):
    """
    Processes the provided text by:
      - Checking for a colon in the first sentence to extract and convert the UCI move list to SAN notation.
      - Using all but the last sentence as the human input.
      - Using the final sentence as the model response.
      
    If the text contains only one sentence, it is assumed to be the model response and the human input is left empty.
    
    Parameters:
        text (str): The input text containing the prompt with UCI moves and the FEN output.
    
    Returns:
        tuple: (human_input, model_response)
    """
    # Split the text into sentences (filtering out any empty fragments)
    sentences = [sentence.strip() for sentence in text.split('.') if sentence.strip()]
    if not sentences:
        return "", ""
    
    # Process the first sentence: if it contains a colon, extract the UCI move list.
    first_sentence = sentences[0]
    if ':' in first_sentence:
        colon_index = first_sentence.find(':')
        prefix = first_sentence[:colon_index+1]  # Up to and including the colon
        uci_moves_str = first_sentence[colon_index+1:].strip()
        san_moves = convert_uci_to_san_type2(uci_moves_str)
        san_moves_str = " ".join(san_moves)
        new_first_sentence = f"{prefix} {san_moves_str}"
    else:
        new_first_sentence = first_sentence
    
    # If there's only one sentence, treat it as the model response.
    if len(sentences) == 1:
        human_input = ""
        model_response = new_first_sentence
    else:
        # For multiple sentences: all but the last are human input.
        human_sentences = [new_first_sentence] + sentences[1:-1]
        human_input = ". ".join(human_sentences) + "."
        model_response = sentences[-1]
        if not model_response.endswith('.'):
            model_response += '.'
    
    return human_input, model_response

po, op = generate_fen_given_san_move_list(example['text'])

In [2]:
import chess
import re

# 3. Generate all legal moves in UCI format given the board PGN
# Remarks: Changed it so that we generate "SAN" notations, not UCI notations
example = \
{"metadata": {"description": "Generate all legal moves in UCI format given the board PGN"}, "text": "Can you generate all legal moves in UCI format for the provided PGN of the chess game: 1. d4 Nf6 2. Nf3 g6 3. c3 Bg7 4. g3 O-O 5. Bg2 d6 6. O-O Nc6 7. Nbd2 a6 8. e4 e5 9. d5 Ne7 10. Qc2 Nh5 11. b3 f5 12. exf5 gxf5 13. Bb2 e4 14. Nh4 Nxd5 15. c4 Ndf6 16. Rad1 Qe7 17. Nb1 Ng4 18. Bxg7 Qxg7 19. Nc3 Bd7 20. Nd5 Rac8 21. h3 Ne5 22. a4 Nf6 23. Nxf6+ Qxf6 24. Qc1 Nd3 25. Qe3 c6 26. Kh2 d5 27. Qb6 Rb8 28. cxd5 cxd5 29. Qc7 Bc6 30. Qa5 f4 31. Qb6 Qg5 32. Qc7 Rbe8 33. Bh1 Re7 34. Qb6 Rg7 35. Rg1 Nxf2 36. Qxf2 e3 37. Qe1 fxg3+ 38. Rxg3 Rf2+ 39. Qxf2 exf2 40. Rxg5 Rxg5 41. Rf1 Rh5 42. Nf3 taking into consideration that the king might be in check? The resulting moves are g8h8 g8f8 g8g7 g8f7 c6e8 c6d7 c6b5 c6a4 h5h6 h5g5 h5f5 h5e5 h5h4 h5h3 h7h6 b7b6 a6a5 d5d4 b7b5.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1848827"}
#{"metadata": {"description": "Generate all legal moves in UCI format given the board PGN"}, "text": "Generate all legal moves in UCI format for the given PGN of chess game: 1. e4 d5 2. Nf3 c6 3. exd5 cxd5 4. d4 Nc6 5. c3 e6 6. Nbd2 Bd6 7. Nb3 Nge7 8. Bg5 h6 9. Be3 O-O 10. Bd3 b6 11. Qd2 Nf5 12. Bxf5 exf5 13. Bxh6 Qf6 14. Bg5 Qg6 15. h4 f6 16. Be3 Re8 17. O-O-O Ba6 18. h5 Qf7 19. Nh4 Bc8 20. Ng6 Qc7 21. h6 Kf7 22. hxg7 Kxg7. Note that it considers the king in check situation. The legal moves in UCI format are: e8h8 e8g8 e8f8 e8d8 e8e7 e8e6 e8e5 e8e4 e8e3 c8d7 c8b7 c8e6 c8a6 a8b8 f7g8 f7g7 f7g6 f7e6 c7d8 c7b8 c7e7 c7d7 c7b7 d6f8 d6e7 d6e5 d6c5 d6f4 d6b4 d6g3 d6a3 d6h2 c6d8 c6b8 c6e7 c6e5 c6a5 c6d4 c6b4 a7a6 b6b5 f5f4 a7a5.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1834014"}
#
#{"metadata": {"description": "Generate all legal moves in UCI format given the board PGN"}, "text": "Can you please provide me with a list of all legal moves in UCI format for the given PGN of the chess game: 1. d4 Nf6 2. c4 e5 3. dxe5 Ng4 4. Nf3 Bc5 5. e3 Nc6 6. Be2 O-O 7. O-O Re8 8. Bd2 Ngxe5 9. Bc3 d6 10. Nbd2 Bf5 11. a3 a5 12. Nd4 Bg6 13. Nxc6 Nxc6 14. Nf3 a4 15. Bd3 Qe7 16. Bxg6 hxg6 17. Nd4 Ne5 18. Qe2 Qh4 19. Nf3 Qh5 20. Rfe1 Qg4 21. Nxe5 Qxe2 22. Rxe2 dxe5 23. Rd1 c6 24. Red2 f6 25. Kf1 while also considering the king's check condition? The resulting moves are c3e5 c3a5 c3d4 c3b4 d2d8 d2d7 d2d6 d2d5 d2d4 d2d3 d2e2 d2c2 g1h1 g1f1 d1f1 d1e1 d1c1 d1b1 d1a1 e3e4 h2h3 g2g3 f2f3 b2b3 h2h4 g2g4 f2f4 b2b4.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1872281"}


def generate_all_legal_moves_in_uci(text: str):
    """
    Processes the provided text by:
      - Extracting the PGN from the prompt (everything after the first colon, up to an optional marker).
      - Removing move numbers and extra commentary.
      - Setting up the board position by playing the PGN moves in order.
      - Generating all legal moves from the final position in UCI notation.
      
    Returns:
        dict: An alpaca format dictionary with the instruction, input, and output.
    """
    # Split the text into sentences. We assume sentences end with a period, question mark, or exclamation mark.
    sentences = re.split(r'(?<=[.?!])\s+', text.strip())
    if len(sentences) < 2:
        raise ValueError("The text should contain at least two sentences.")
    
    # All sentences except the last are considered the input prompt.
    prompt_part = " ".join(sentences[:-1])
    # The final sentence is available as the expected result, but is not used further.
    expected_result_part = sentences[-1]
    
    # Extract the PGN notation from the prompt text
    # First, get the part after the colon which should contain the PGN
    if ":" not in prompt_part:
        raise ValueError("No colon found in prompt for PGN extraction.")
    
    pgn_str = prompt_part.split(":", 1)[1].strip()
    
    # Extract all numbered moves with a reliable regex
    # This pattern captures move pairs OR single moves with a number prefix
    # Handles cases like "1. e4 e5" as well as "42. Nf3"
    move_pattern = r'\d+\.\s*([A-Za-z0-9\-x=+#]+)(?:\s+([A-Za-z0-9\-x=+#]+))?'
    move_matches = re.findall(move_pattern, pgn_str)
    
    # Flatten the list of tuples and filter out empty strings
    pgn_moves = []
    for match in move_matches:
        for move in match:
            if move:  # Only add non-empty moves
                pgn_moves.append(move)
    
    # Print debug info to verify correct extraction
    print(f"Extracted {len(pgn_moves)} moves: {pgn_moves}, (Its from type3)")
    
    # Set up the board by playing the PGN moves.
    board = chess.Board()
    for move in pgn_moves:
        try:
            board.push_san(move)
        except Exception as e:
            print(f"Error processing PGN move '{move}': {e}")
    
    # Generate all legal moves from the final board position in UCI notation.
    legal_moves = list(board.legal_moves)
    san_moves = [board.san(move) for move in legal_moves]
    san_moves_str = " ".join(san_moves)
    
    # Build the model response.
    model_response = f"The resulting moves are {san_moves_str}."
    human_input = prompt_part
    return human_input, model_response


# Run the function with the example text
human_input, model_response = generate_all_legal_moves_in_uci(example['text'])
print("Human Input:")
print(human_input)
print("\nModel Response:")
print(model_response)

Extracted 84 moves: ['d4', 'Nf6', 'Nf3', 'g6', 'c3', 'Bg7', 'g3', 'O-O', 'Bg2', 'd6', 'O-O', 'Nc6', 'Nbd2', 'a6', 'e4', 'e5', 'd5', 'Ne7', 'Qc2', 'Nh5', 'b3', 'f5', 'exf5', 'gxf5', 'Bb2', 'e4', 'Nh4', 'Nxd5', 'c4', 'Ndf6', 'Rad1', 'Qe7', 'Nb1', 'Ng4', 'Bxg7', 'Qxg7', 'Nc3', 'Bd7', 'Nd5', 'Rac8', 'h3', 'Ne5', 'a4', 'Nf6', 'Nxf6+', 'Qxf6', 'Qc1', 'Nd3', 'Qe3', 'c6', 'Kh2', 'd5', 'Qb6', 'Rb8', 'cxd5', 'cxd5', 'Qc7', 'Bc6', 'Qa5', 'f4', 'Qb6', 'Qg5', 'Qc7', 'Rbe8', 'Bh1', 'Re7', 'Qb6', 'Rg7', 'Rg1', 'Nxf2', 'Qxf2', 'e3', 'Qe1', 'fxg3+', 'Rxg3', 'Rf2+', 'Qxf2', 'exf2', 'Rxg5', 'Rxg5', 'Rf1', 'Rh5', 'Nf3', 'taking']
Error processing PGN move 'taking': invalid san: 'taking'
Human Input:
Can you generate all legal moves in UCI format for the provided PGN of the chess game: 1. d4 Nf6 2. Nf3 g6 3. c3 Bg7 4. g3 O-O 5. Bg2 d6 6. O-O Nc6 7. Nbd2 a6 8. e4 e5 9. d5 Ne7 10. Qc2 Nh5 11. b3 f5 12. exf5 gxf5 13. Bb2 e4 14. Nh4 Nxd5 15. c4 Ndf6 16. Rad1 Qe7 17. Nb1 Ng4 18. Bxg7 Qxg7 19. Nc3 Bd7 20. Nd5 Ra

In [61]:
len(pgn_moves)

NameError: name 'pgn_moves' is not defined

### Full Chess modeling dataset preprocessing

In [12]:
import json

input_filename = "chess_modeling-data.jsonl-00000-of-00008"
output_filename= "chess_modeling_instruct.json"

alpaca_data_list = []

with open(input_filename, 'rb') as fin:
    for line in fin:
        if not line.strip():
            continue # Skip empty lines
        data = json.loads(line)

        system_prompt = data['metadata']['description']

        input_response = data['text'] # data['text'] contains both user input and response string

        # Split based in first question mark
        qmark_idx = input_response.find('?')
        if qmark_idx != -1:
            human_input = input_response[:qmark_idx+1].strip()
            answer = input_response[qmark_idx+1:].strip()
        else:
            human_input = ""
            answer = ""
        
        alpaca_entry = {
            # 'instruction': system_prompt, # TODO: Need to figure out how instruction and input pair works
            'input': human_input,
            'output': answer,
            'system': system_prompt,
        }

        alpaca_data_list.append(alpaca_entry)

# Write the entire list of objects to a JSON file
with open(output_filename, 'w') as fout:
    json.dump(alpaca_data_list, fout, indent=2)

print("Preprocessing complete. Data saved to:", output_filename)

Preprocessing complete. Data saved to: chess_modeling_instruct.json


In [7]:
line.strip()

b'{"metadata": {"description": "Generate UCI move given SAN-format move and FEN"}, "text": "Taking the FEN configuration q3k1r1/p3np2/1p1Q2pp/8/8/8/6PP/3R2K1 w - - 0 31 and the move expressed in SAN notation Qd7+, could you generate the related UCI move? The resultant move is d6d7.", "pipeline_key": "function_puzzle_v2.jsonl.zst-278847"}'