### Chess modeling dataset EDA

In [1]:
# How many different tasks there are and what are their counts?
import json
from collections import Counter
from tqdm import tqdm

# List your 8 jsonl file names here
jsonl_files = [
    'chess_modeling-data.jsonl-00000-of-00008',
    'chess_modeling-data.jsonl-00001-of-00008',
    'chess_modeling-data.jsonl-00002-of-00008',
    'chess_modeling-data.jsonl-00003-of-00008',
    'chess_modeling-data.jsonl-00004-of-00008',
    'chess_modeling-data.jsonl-00005-of-00008',
    'chess_modeling-data.jsonl-00006-of-00008',
    'chess_modeling-data.jsonl-00007-of-00008'
]

# Set to store unique descriptions
unique_descriptions_cnt = Counter()

for filename in tqdm(jsonl_files):
    with open(filename, 'r') as f:
        for line in f:
            if line.strip():  # Skip empty lines
                data = json.loads(line)
                # Extract the description from metadata (if available)
                description = data.get("metadata", {}).get("description", "")
                unique_descriptions_cnt[description] += 1

print("Number of unique descriptions:", unique_descriptions_cnt)
print("Total number of samples: ", sum(unique_descriptions_cnt.values()))

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [00:06<00:00,  1.17it/s]

Number of unique descriptions: Counter({'Generate FEN given PGN': 293146, 'Generate FEN given UCI-format move list': 286832, 'Generate all legal moves in UCI format given the board PGN': 148308, 'Generate all legal moves in SAN format given the board PGN': 148301, 'Generate SAN move given UCI-format move and FEN': 147654, 'Generate UCI move given SAN-format move and FEN': 147649, 'Generate next FEN given FEN and SAN format move': 147629, 'Generate next FEN given FEN and UCI format move': 147589, 'Draw chess board given FEN': 146980, 'Generate all legal moves in SAN format given the board FEN': 143764, 'Generate all legal moves given a board FEN': 143694})
Total number of samples:  1901546





### Per question-type Chess modeling dataset preprocessing

### Unused

In [None]:
import chess

# 2. Generate FEN given UCI-format move list
# Remarks: Changed it so that we generate "SAN" notations, not UCI notations
example = \
{
"metadata": 
    {"description": 
        "Generate FEN given UCI-format move list"}, 
"text": 
    "It would be great if you could produce the FEN representation of the chess game based on the provided SAN based move list: e2e4 e7e5 d2d4 e5d4 c2c3 b8c6 f1c4 g8f6 d1f3 d7d6 g1e2 c8g4 f3d3 g4e2 d3e2 f8e7 e1g1 e8g8 c4d3 c6e5 c3d4 e5d3 e2d3 f8e8 b1c3 c7c6 f1e1 d6d5 e4e5 f6d7 d3g3 e7d6 c1h6 d6e5 d4e5 g7g6 h6g5 d8b6 h2h4 b6b2 a1b1 b2c2 e1e2 c2f5 b1b7 d7e5 f2f4 e5g4 e2e8. The FEN code obtained is r3R1k1/pR3p1p/2p3p1/3p1qB1/5PnP/2N3Q1/P5P1/6K1 b - - 0 25.", 
"pipeline_key": 
    "function_puzzle_v2.jsonl.zst-1056409"}

def convert_uci_to_san_type2(uci_moves_str):
    """
    Converts a space-separated string of UCI moves into a list of SAN moves.
    
    Parameters:
        uci_moves_str (str): A string of UCI moves separated by spaces.
        
    Returns:
        list: A list of moves in SAN notation.
    """
    board = chess.Board()  # Start with the initial board position
    uci_moves = uci_moves_str.split()
    san_moves = []
    for move_str in uci_moves:
        move = chess.Move.from_uci(move_str)
        # Convert the move to SAN notation based on the current board state
        san = board.san(move)
        san_moves.append(san)
        board.push(move)  # Update the board with the move
    return san_moves

def generate_fen_given_san_move_list(text: str):
    """
    Processes the provided text by:
      - Checking for a colon in the first sentence to extract and convert the UCI move list to SAN notation.
      - Using all but the last sentence as the human input.
      - Using the final sentence as the model response.
      
    If the text contains only one sentence, it is assumed to be the model response and the human input is left empty.
    
    Parameters:
        text (str): The input text containing the prompt with UCI moves and the FEN output.
    
    Returns:
        tuple: (human_input, model_response)
    """
    # Split the text into sentences (filtering out any empty fragments)
    sentences = [sentence.strip() for sentence in text.split('.') if sentence.strip()]
    if not sentences:
        return "", ""
    
    # Process the first sentence: if it contains a colon, extract the UCI move list.
    first_sentence = sentences[0]
    if ':' in first_sentence:
        colon_index = first_sentence.find(':')
        prefix = first_sentence[:colon_index+1]  # Up to and including the colon
        uci_moves_str = first_sentence[colon_index+1:].strip()
        san_moves = convert_uci_to_san_type2(uci_moves_str)
        san_moves_str = " ".join(san_moves)
        new_first_sentence = f"{prefix} {san_moves_str}"
    else:
        new_first_sentence = first_sentence
    
    # If there's only one sentence, treat it as the model response.
    if len(sentences) == 1:
        human_input = ""
        model_response = new_first_sentence
    else:
        # For multiple sentences: all but the last are human input.
        human_sentences = [new_first_sentence] + sentences[1:-1]
        human_input = ". ".join(human_sentences) + "."
        model_response = sentences[-1]
        if not model_response.endswith('.'):
            model_response += '.'
    
    return human_input, model_response

po, op = generate_fen_given_san_move_list(example['text'])

In [None]:
import chess
import re

# 3. Generate all legal moves in UCI format given the board PGN
# Remarks: Changed it so that we generate "SAN" notations, not UCI notations
example = \
{"metadata": {"description": "Generate all legal moves in UCI format given the board PGN"}, "text": "Can you generate all legal moves in UCI format for the provided PGN of the chess game: 1. d4 Nf6 2. Nf3 g6 3. c3 Bg7 4. g3 O-O 5. Bg2 d6 6. O-O Nc6 7. Nbd2 a6 8. e4 e5 9. d5 Ne7 10. Qc2 Nh5 11. b3 f5 12. exf5 gxf5 13. Bb2 e4 14. Nh4 Nxd5 15. c4 Ndf6 16. Rad1 Qe7 17. Nb1 Ng4 18. Bxg7 Qxg7 19. Nc3 Bd7 20. Nd5 Rac8 21. h3 Ne5 22. a4 Nf6 23. Nxf6+ Qxf6 24. Qc1 Nd3 25. Qe3 c6 26. Kh2 d5 27. Qb6 Rb8 28. cxd5 cxd5 29. Qc7 Bc6 30. Qa5 f4 31. Qb6 Qg5 32. Qc7 Rbe8 33. Bh1 Re7 34. Qb6 Rg7 35. Rg1 Nxf2 36. Qxf2 e3 37. Qe1 fxg3+ 38. Rxg3 Rf2+ 39. Qxf2 exf2 40. Rxg5 Rxg5 41. Rf1 Rh5 42. Nf3 taking into consideration that the king might be in check? The resulting moves are g8h8 g8f8 g8g7 g8f7 c6e8 c6d7 c6b5 c6a4 h5h6 h5g5 h5f5 h5e5 h5h4 h5h3 h7h6 b7b6 a6a5 d5d4 b7b5.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1848827"}
#{"metadata": {"description": "Generate all legal moves in UCI format given the board PGN"}, "text": "Generate all legal moves in UCI format for the given PGN of chess game: 1. e4 d5 2. Nf3 c6 3. exd5 cxd5 4. d4 Nc6 5. c3 e6 6. Nbd2 Bd6 7. Nb3 Nge7 8. Bg5 h6 9. Be3 O-O 10. Bd3 b6 11. Qd2 Nf5 12. Bxf5 exf5 13. Bxh6 Qf6 14. Bg5 Qg6 15. h4 f6 16. Be3 Re8 17. O-O-O Ba6 18. h5 Qf7 19. Nh4 Bc8 20. Ng6 Qc7 21. h6 Kf7 22. hxg7 Kxg7. Note that it considers the king in check situation. The legal moves in UCI format are: e8h8 e8g8 e8f8 e8d8 e8e7 e8e6 e8e5 e8e4 e8e3 c8d7 c8b7 c8e6 c8a6 a8b8 f7g8 f7g7 f7g6 f7e6 c7d8 c7b8 c7e7 c7d7 c7b7 d6f8 d6e7 d6e5 d6c5 d6f4 d6b4 d6g3 d6a3 d6h2 c6d8 c6b8 c6e7 c6e5 c6a5 c6d4 c6b4 a7a6 b6b5 f5f4 a7a5.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1834014"}
#
#{"metadata": {"description": "Generate all legal moves in UCI format given the board PGN"}, "text": "Can you please provide me with a list of all legal moves in UCI format for the given PGN of the chess game: 1. d4 Nf6 2. c4 e5 3. dxe5 Ng4 4. Nf3 Bc5 5. e3 Nc6 6. Be2 O-O 7. O-O Re8 8. Bd2 Ngxe5 9. Bc3 d6 10. Nbd2 Bf5 11. a3 a5 12. Nd4 Bg6 13. Nxc6 Nxc6 14. Nf3 a4 15. Bd3 Qe7 16. Bxg6 hxg6 17. Nd4 Ne5 18. Qe2 Qh4 19. Nf3 Qh5 20. Rfe1 Qg4 21. Nxe5 Qxe2 22. Rxe2 dxe5 23. Rd1 c6 24. Red2 f6 25. Kf1 while also considering the king's check condition? The resulting moves are c3e5 c3a5 c3d4 c3b4 d2d8 d2d7 d2d6 d2d5 d2d4 d2d3 d2e2 d2c2 g1h1 g1f1 d1f1 d1e1 d1c1 d1b1 d1a1 e3e4 h2h3 g2g3 f2f3 b2b3 h2h4 g2g4 f2f4 b2b4.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1872281"}


def generate_all_legal_moves_in_uci(text: str):
    """
    Processes the provided text by:
      - Extracting the PGN from the prompt (everything after the first colon, up to an optional marker).
      - Removing move numbers and extra commentary.
      - Setting up the board position by playing the PGN moves in order.
      - Generating all legal moves from the final position in UCI notation.
      
    Returns:
        dict: An alpaca format dictionary with the instruction, input, and output.
    """
    # Split the text into sentences. We assume sentences end with a period, question mark, or exclamation mark.
    sentences = re.split(r'(?<=[.?!])\s+', text.strip())
    if len(sentences) < 2:
        raise ValueError("The text should contain at least two sentences.")
    
    # All sentences except the last are considered the input prompt.
    prompt_part = " ".join(sentences[:-1])
    # The final sentence is available as the expected result, but is not used further.
    expected_result_part = sentences[-1]
    
    # Extract the PGN notation from the prompt text
    # First, get the part after the colon which should contain the PGN
    if ":" not in prompt_part:
        raise ValueError("No colon found in prompt for PGN extraction.")
    
    pgn_str = prompt_part.split(":", 1)[1].strip()
    
    # Extract all numbered moves with a reliable regex
    # This pattern captures move pairs OR single moves with a number prefix
    # Handles cases like "1. e4 e5" as well as "42. Nf3"
    move_pattern = r'\d+\.\s*([A-Za-z0-9\-x=+#]+)(?:\s+([A-Za-z0-9\-x=+#]+))?'
    move_matches = re.findall(move_pattern, pgn_str)
    
    # Flatten the list of tuples and filter out empty strings
    pgn_moves = []
    for match in move_matches:
        for move in match:
            if move:  # Only add non-empty moves
                pgn_moves.append(move)
    
    # Print debug info to verify correct extraction
    print(f"Extracted {len(pgn_moves)} moves: {pgn_moves}, (Its from type3)")
    
    # Set up the board by playing the PGN moves.
    board = chess.Board()
    for move in pgn_moves:
        try:
            board.push_san(move)
        except Exception as e:
            print(f"Error processing PGN move '{move}': {e}")
    
    # Generate all legal moves from the final board position in UCI notation.
    legal_moves = list(board.legal_moves)
    san_moves = [board.san(move) for move in legal_moves]
    san_moves_str = " ".join(san_moves)
    
    # Build the model response.
    model_response = f"The resulting moves are {san_moves_str}."
    human_input = prompt_part
    return human_input, model_response


# Run the function with the example text
human_input, model_response = generate_all_legal_moves_in_uci(example['text'])
print("Human Input:")
print(human_input)
print("\nModel Response:")
print(model_response)

### 1. Generate FEN given PGN

In [27]:
# 1. Generate FEN given PGN
example = \
{
"metadata": 
    {"description": "Generate FEN given PGN"}, 
"text": 
    "Generate the FEN representation given the PGN of chess game: 1. e4 g6 2. d4 Bg7 3. c4 c5 4. Nc3 cxd4 5. Nd5 Nc6 6. Nf3 e6 7. Nf4 e5 8. Nd5 h6 9. Bd3 Nge7 10. O-O O-O 11. Bd2 d6 12. Qc1 Kh7 13. Ne1. The FEN is r1bq1r2/pp2npbk/2np2pp/3Np3/2PpP3/3B4/PP1B1PPP/R1Q1NRK1 b - - 3 13.", 
"pipeline_key": 
    "function_puzzle_v2.jsonl.zst-1122107"
}

def generate_fen_given_pgn(text: str):
    text = example['text']
    sentences = [sentence.strip() for sentence in text.split('.') if sentence.strip()]

    input_text = '. '.join(sentences[:-1])
    input_text += '.'
    last_sentence = sentences[-1]
    last_sentence += '.'

    assert text == input_text + " " + last_sentence
    
    return input_text, last_sentence


In [51]:
# Develop on this so that we make <think> </think> style response
example = \
{
"metadata": 
    {"description": "Generate FEN given PGN"}, 
"text": 
    "Generate the FEN representation given the PGN of chess game: 1. e4 g6 2. d4 Bg7 3. c4 c5 4. Nc3 cxd4 5. Nd5 Nc6 6. Nf3 e6 7. Nf4 e5 8. Nd5 h6 9. Bd3 Nge7 10. O-O O-O 11. Bd2 d6 12. Qc1 Kh7 13. Ne1. The FEN is r1bq1r2/pp2npbk/2np2pp/3Np3/2PpP3/3B4/PP1B1PPP/R1Q1NRK1 b - - 3 13.", 
"pipeline_key": 
    "function_puzzle_v2.jsonl.zst-1122107"
}
import random

def simulate_pgn_segment(pgn_text: str):
    """
    Given a PGN string of a chess game, this function:
      1. Extracts all moves (in SAN format) from the PGN.
      2. Randomly selects a starting index among the moves.
      3. Plays the game from the start until that point and computes the intermediate FEN.
      4. Randomly selects between 1 and 5 moves (subject to moves remaining) starting from that index,
         and then applies them to the board.
      5. Generates a natural chain-of-thought reasoning trace that explains, step by step, how
         each move changes the board. For the first move, the current FEN is explicitly mentioned;
         for subsequent moves, the explanation is more concise.
         The reasoning is enclosed in <think>...</think> tags.
      6. Returns:
          - A prompt string (which includes the starting FEN and summary of applied moves),
          - The list of applied SAN moves,
          - The resulting FEN after applying those moves,
          - The natural-language chain-of-thought reasoning trace.
    
    Parameters:
        pgn_text (str): The PGN string including move numbers 
                        (e.g., "1. e4 g6 2. d4 Bg7 3. c4 c5 4. Nc3 cxd4 ...").
    
    Returns:
        tuple: (prompt, applied_moves, result_fen, reasoning_trace)
    """
    
    system_prompt = "Generate the resulting FEN position given the following SAN moves, starting from the initial FEN."

    # Define a regex pattern to capture moves.
    move_pattern = r'\d+\.\s*([^\s]+)(?:\s+([^\s]+))?'
    move_matches = re.findall(move_pattern, pgn_text)
    
    # Flatten the matches into a list of moves (stripping any trailing period)
    pgn_moves = []
    for first, second in move_matches:
        if first:
            pgn_moves.append(first.rstrip('.'))
        if second:
            pgn_moves.append(second.rstrip('.'))
    
    if not pgn_moves:
        raise ValueError("No moves extracted from the PGN text.")
    
    # Optionally remove a trailing move that might be an annotation/result.
    pgn_moves = pgn_moves[:-1]
    
    # Create a board instance (using the standard initial position)
    board = chess.Board()
    
    # Randomly choose a starting index among all moves.
    start_index = random.randint(0, len(pgn_moves) - 1)
    
    # Play all moves up to the starting index to obtain the intermediate FEN.
    for move in pgn_moves[:start_index]:
        try:
            board.push_san(move)
        except Exception as e:
            raise ValueError(f"Error applying move '{move}' at index {pgn_moves.index(move)}: {e}")
    
    intermediate_fen = board.fen()
    starting_fen_str = f"The current FEN is {intermediate_fen}."
    
    # Determine how many moves we can apply from the remaining moves.
    remaining_moves = len(pgn_moves) - start_index
    num_moves_to_apply = random.randint(1, min(5, remaining_moves))
    
    # Extract the moves to apply.
    moves_to_apply = pgn_moves[start_index : start_index + num_moves_to_apply]
    
    # Build a natural chain-of-thought reasoning trace.
    reasoning_lines = []
    board_trace = board.copy()  # Copy the board for a step-by-step trace.

    # Define templates for the first move (7 examples) without using "I".
    first_move_templates = [
        "Starting from FEN {current_fen}, the move '{move}' is played, resulting in {new_fen}.",
        "From FEN {current_fen}, move '{move}' leads to {new_fen}.",
        "At FEN {current_fen}, the move '{move}' produces {new_fen}.",
        "Beginning at FEN {current_fen}, playing '{move}' updates the board to {new_fen}.",
        "FEN {current_fen} changes to {new_fen} after move '{move}' is executed.",
        "With the starting FEN at {current_fen}, move '{move}' transforms the board into {new_fen}.",
        "From the initial FEN {current_fen}, move '{move}' yields {new_fen}."
    ]

    # Define templates for subsequent moves (7 examples) without using "I".
    subsequent_move_templates = [
        "Then, move '{move}' is applied, resulting in {new_fen}.",
        "Following that, '{move}' changes the position to {new_fen}.",
        "Next, the move '{move}' updates the board to {new_fen}.",
        "Afterwards, '{move}' is executed, producing {new_fen}.",
        "Subsequently, move '{move}' yields {new_fen}.",
        "After move '{move}', the board becomes {new_fen}.",
        "Finally, applying '{move}' gives a new FEN: {new_fen}."
    ]
    
    # Build the reasoning trace move by move.
    for idx, move in enumerate(moves_to_apply):
        current_fen = board_trace.fen()
        try:
            board_trace.push_san(move)
        except Exception as e:
            raise ValueError(f"Error applying move '{move}' during trace: {e}")
        new_fen = board_trace.fen()
        if idx == 0:
            template = random.choice(first_move_templates)
            line = template.format(current_fen=current_fen, move=move, new_fen=new_fen)
        else:
            template = random.choice(subsequent_move_templates)
            line = template.format(move=move, new_fen=new_fen)
        reasoning_lines.append(line)
    
    # Wrap the reasoning trace in <think> tags.
    reasoning_trace = "<think>" + " ".join(reasoning_lines) + "</think>"
    
    moves_summary = ", ".join(moves_to_apply)
    moves_summary_full = f" And the next applied SAN moves are: {moves_summary}."

    think_action_tags = f"Use <think>...</think> tags to explain your reasoning and <answer>...</answer> tags to give the final FEN."
    # Build the system prompt string.
    user_prompt = f"{starting_fen_str}{moves_summary_full} {think_action_tags}"
    
    # Now, apply the selected moves to the original board (which is at the intermediate state) to get the final FEN.
    for move in moves_to_apply:
        try:
            board.push_san(move)
        except Exception as e:
            raise ValueError(f"Error applying move '{move}' from index {start_index}: {e}")
    result_fen = board.fen()
    answer_response = f" <answer>{result_fen}</answer>"

    assistant_prompt = reasoning_trace + model_response
    return system_prompt, user_prompt, assistant_prompt

system_prompt, user_prompt, assistant_prompt = simulate_pgn_segment(example['text'])

print("System Prompt:")
print(system_prompt)
print("User Prompt:")
print(user_prompt)
print("Assistant Prompt:")
print(assistant_prompt)

System Prompt:
Generate the resulting FEN position given the following SAN moves, starting from the initial FEN.
User Prompt:
The current FEN is rnbqk1nr/ppppppbp/6p1/8/2PPP3/8/PP3PPP/RNBQKBNR b KQkq - 0 3. And the next applied SAN moves are: c5, Nc3. Use <think>...</think> tags to explain your reasoning and <answer>...</answer> tags to give the final FEN.
Assistant Prompt:
<think>At FEN rnbqk1nr/ppppppbp/6p1/8/2PPP3/8/PP3PPP/RNBQKBNR b KQkq - 0 3, the move 'c5' produces rnbqk1nr/pp1pppbp/6p1/2p5/2PPP3/8/PP3PPP/RNBQKBNR w KQkq - 0 4. Following that, 'Nc3' changes the position to rnbqk1nr/pp1pppbp/6p1/2p5/2PPP3/2N5/PP3PPP/R1BQKBNR b KQkq - 1 4.</think><think>Starting from FEN r1bqk1nr/pp1p1pbp/2n1p1p1/8/2PpPN2/5N2/PP3PPP/R1BQKB1R b KQkq - 1 7, the move 'e5' is played, resulting in r1bqk1nr/pp1p1pbp/2n3p1/4p3/2PpPN2/5N2/PP3PPP/R1BQKB1R w KQkq - 0 8. Following that, 'Nd5' changes the position to r1bqk1nr/pp1p1pbp/2n3p1/3Np3/2PpP3/5N2/PP3PPP/R1BQKB1R b KQkq - 1 8. Finally, applying 'h6' gi

### 2. Generate next FEN given FEN and SAN format move

In [2]:
import re

example = \
{"metadata": 
    {"description": "Generate next FEN given FEN and SAN format move"}, 
"text": "Provided the chess board's FEN rnb1kb1r/ppq2ppp/6n1/8/2B5/1QN5/PP3PPP/R1B1K1NR b KQkq - 0 9, after making the SAN format move Bd6, what does the next board's FEN look like? The modified FEN is rnb1k2r/ppq2ppp/3b2n1/8/2B5/1QN5/PP3PPP/R1B1K1NR w KQkq - 1 10.", 
"pipeline_key": "function_puzzle_v2.jsonl.zst-772395"}
#{"metadata": 
#   {"description": "Generate next FEN given FEN and SAN format move"}, 
# "text": "With the chess board represented by FEN r1br2k1/4n1pp/p3Pp2/qpP2P2/8/2PB2QP/P1P5/2KR3R b - - 0 22 and a move specified in SAN format Qxc3, what is the FEN of the subsequent board? The resulting FEN is r1br2k1/4n1pp/p3Pp2/1pP2P2/8/2qB2QP/P1P5/2KR3R w - - 0 23.", "pipeline_key": "function_puzzle_v2.jsonl.zst-367137"}
#{"metadata": 
#   {"description": "Generate next FEN given FEN and SAN format move"}, 
# "text": "Starting from the chess board FEN rnbqk2r/pp2bppp/3p1n2/2pP4/4P3/2N2N2/PP3PPP/R1BQKB1R b KQkq - 4 7, and executing the SAN format move Bg4, what is the FEN representation of the following board? The derived FEN is rn1qk2r/pp2bppp/3p1n2/2pP4/4P1b1/2N2N2/PP3PPP/R1BQKB1R w KQkq - 5 8.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1375542"}

def generate_fen_given_fen_and_san(text: str):
    """
    Splits the provided text into sentences so that:
      - All sentences except the final one are treated as the input prompt.
      - The final sentence is treated as the response.
    
    Returns:
        tuple: A tuple (prompt_text, response_text), where prompt_text is the concatenation
        of all sentences except the final one and response_text is the last sentence.
    """
    text = text.strip()
    # Split the text into sentences, taking into account '.', '?' and '!' as sentence terminators.
    sentences = re.split(r'(?<=[.?!])\s+', text)
    if len(sentences) < 2:
        raise ValueError("Input must contain at least two sentences.")
    
    # The prompt is all sentences except the last one.
    prompt_text = " ".join(sentences[:-1]).strip()
    # The final sentence is taken as the response.
    response_text = sentences[-1].strip()
    
    return prompt_text, response_text

# Run the function with the example text
human_input, model_response = generate_fen_given_fen_and_san(example['text'])
print("Human Input:")
print(human_input)
print("\nModel Response:")
print(model_response)

Human Input:
Provided the chess board's FEN rnb1kb1r/ppq2ppp/6n1/8/2B5/1QN5/PP3PPP/R1B1K1NR b KQkq - 0 9, after making the SAN format move Bd6, what does the next board's FEN look like?

Model Response:
The modified FEN is rnb1k2r/ppq2ppp/3b2n1/8/2B5/1QN5/PP3PPP/R1B1K1NR w KQkq - 1 10.


### 3. Draw chess board given FEN (Change it to ours style. I think we need the file and rank notation)

In [70]:
import re

example = \
{"metadata": {"description": "Draw chess board given FEN"}, "text": "Considering the FEN input provided r1bqk2r/pppp1ppp/2n2n2/2b1p3/2B1P3/2PP1N2/PP3PPP/RNBQK2R b KQkq - 2 7, craft a chessboard image using only letter characters. Gaze upon the masterpiece here: r . b q k . . r\np p p p . p p p\n. . n . . n . .\n. . b . p . . .\n. . B . P . . .\n. . P P . N . .\nP P . . . P P P\nR N B Q K . . R.", "pipeline_key": "function_puzzle_v2.jsonl.zst-44114"}
#{"metadata": {"description": "Draw chess board given FEN"}, "text": "Given the chess board FEN 6rk/2p2p1p/2p2p2/pp6/4P3/1PPbNP2/P2r2PP/R2BR1K1 w - - 3 21. Can you draw it using only letters? Here is the draw . . . . . . r k\n. . p . . p . p\n. . p . . p . .\np p . . . . . .\n. . . . P . . .\n. P P b N P . .\nP . . r . . P P\nR . . B R . K ..", "pipeline_key": "function_puzzle_v2.jsonl.zst-171148"}
#{"metadata": {"description": "Draw chess board given FEN"}, "text": "Taking into account the FEN input r4rk1/6p1/1R1p4/p1pPp3/P1P2p2/1Q3P1b/4K2R/2q5 b - - 1 32, could you design a chessboard picture using only letter characters? Observe the outcome here: r . . . . r k .\n. . . . . . p .\n. R . p . . . .\np . p P p . . .\nP . P . . p . .\n. Q . . . P . b\n. . . . K . . R\n. . q . . . . ..", "pipeline_key": "function_puzzle_v2.jsonl.zst-1077346"}

def draw_board_given_fen(text: str):
    """
    Processes the provided text by:
      - Extracting the FEN string from the text.
      - Splitting the provided text into sentences so that:
          - All sentences except the final one are treated as the input prompt.
          - The final sentence is treated as the response.
    
    Returns:
        tuple: A tuple (fen, prompt_text, response_text), where 'fen' is the extracted FEN string,
               'prompt_text' is the concatenation of all sentences except the final one, and
               'response_text' is the final sentence.
    """

    system_prompt = "Given a FEN string, draw the current chessboard position."

    # First, extract the FEN.
    # The FEN is expected to follow the standard structure:
    # [piece placement] [side to move] [castling availability] [en passant target square] [halfmove clock] [fullmove number]
    text = text.strip()
    
    # Robust FEN pattern:
    #   - The board part must have exactly 8 ranks separated by 7 slashes.
    #   - Followed by side to move (w or b), castling availability, en passant target, halfmove clock, and fullmove number.
    fen_pattern = (r'((?:[rnbqkpRNBQKP1-8]+/){7}[rnbqkpRNBQKP1-8]+)\s+'
                   r'([wb])\s+'
                   r'([-KQkq]+)\s+'
                   r'((?:[a-h][36])|-)\s+'
                   r'(\d+)\s+'
                   r'(\d+)')
    fen_match = re.search(fen_pattern, text)
    if not fen_match:
        raise ValueError("No valid FEN string found in the input text.")
    
    fen = fen_match.group(1)

    def textify_chess_board(fen: str):
        board = chess.Board(fen)
        # Create a piece table from the updated board.
        piece_table = []
        board_rows = str(board).splitlines()
        for row_index, board_row in enumerate(board_rows):
            rank = 8 - row_index
            squares = board_row.split()
            for col_index, square in enumerate(squares):
                file = chr(ord('a') + col_index)
                piece = square if square != '.' else '-'  # Replace '.' with '-' for empty squares
                piece_table.append(f"{file}{rank}:{piece}")
        piece_table = " ".join(piece_table)
        return piece_table

    new_chess_board_str = textify_chess_board(fen)
    
    # Extract the substring between the period and the colon.
    user_prompt = f"Current FEN string is {fen}."

    assistant_prompt = f"{new_chess_board_str}."

    return system_prompt, user_prompt, assistant_prompt


system_prompt, user_prompt, assistant_prompt = draw_board_given_fen(example['text'])

print("System Prompt:")
print(system_prompt)
print("User Prompt:")
print(user_prompt)
print("Assistant Prompt:")
print(assistant_prompt)

System Prompt:
Given a FEN string, draw the current chessboard position.
User Prompt:
Current FEN string is r1bqk2r/pppp1ppp/2n2n2/2b1p3/2B1P3/2PP1N2/PP3PPP/RNBQK2R.
Assistant Prompt:
a8:r b8:- c8:b d8:q e8:k f8:- g8:- h8:r a7:p b7:p c7:p d7:p e7:- f7:p g7:p h7:p a6:- b6:- c6:n d6:- e6:- f6:n g6:- h6:- a5:- b5:- c5:b d5:- e5:p f5:- g5:- h5:- a4:- b4:- c4:B d4:- e4:P f4:- g4:- h4:- a3:- b3:- c3:P d3:P e3:- f3:N g3:- h3:- a2:P b2:P c2:- d2:- e2:- f2:P g2:P h2:P a1:R b1:N c1:B d1:Q e1:K f1:- g1:- h1:R.


### 4. Generate all legal moves in SAN format given the board FEN

In [68]:
import re

example = \
{"metadata": {"description": "Generate all legal moves in SAN format given the board FEN"}, "text": "Generate all legal moves in SAN format for the given FEN of chess game: 5r1k/p1pq2pp/1p1b1n2/3b4/3Pp3/1P3PP1/PBQ3BP/2R2RK1 w - - 0 21. Note that it considers the king in check situation. The legal moves in SAN format are: Bh3 Bh1 Qxc7 Qc6 Qc5 Qxe4 Qc4 Qd3 Qc3 Qf2 Qe2 Qd2 Qd1 Qb1 Bc3 Ba3 Ba1 Kf2 Kh1 Rf2 Rfe1 Rfd1 Rce1 Rcd1 Rb1 Ra1 fxe4 g4 f4 b4 h3 a3 h4 a4.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1564553"}
# {"metadata": {"description": "Generate all legal moves in SAN format given the board FEN"}, "text": "Can you please provide me with a list of all legal moves in SAN format for the given FEN of the chess game: r1bqkbnr/pp1p1pp1/2n1p2p/8/2BNP3/8/PPP2PPP/RNBQK2R w KQkq - 0 6 while also considering the king's check condition? The resulting moves are Nxe6 Nxc6 Nf5 Nb5 Nf3 Nb3 Ne2 Bxe6 Ba6 Bd5 Bb5 Bd3 Bb3 Be2 Bf1 Rg1 Rf1 Ke2 Kd2 Kf1 Qh5 Qg4 Qf3 Qd3 Qe2 Qd2 Bxh6 Bg5 Bf4 Be3 Bd2 Nc3 Na3 Nd2 O-O e5 h3 g3 f3 c3 b3 a3 h4 g4 f4 b4 a4.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1532968"}
# {"metadata": {"description": "Generate all legal moves in SAN format given the board FEN"}, "text": "May I request you to generate all legal moves in SAN format for the given FEN of the chess game: 6k1/p2q1pbp/P1p1p1p1/1rB2n2/2QP4/3P1NPP/5P2/R4K2 b - - 0 32 while also considering the possibility of the king being in check? The resulting moves are Kh8 Bh8 Bf8 Bh6 Bf6 Be5 Bxd4 Qe8 Qd8 Qc8 Qe7 Qc7 Qb7 Qd6 Qd5 Qxd4 Ne7 Nh6 Nd6 Nh4 Nxd4 Nxg3+ Ne3+ Rb8 Rb7 Rb6 Rxc5 Ra5 Rb4 Rb3 Rb2 Rb1+ h6 f6 g5 e5 h5.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1593663"}
# {"metadata": {"description": "Generate all legal moves in SAN format given the board FEN"}, "text": "Could you produce a list of all legal moves in SAN format for the given FEN of the chess game: r4rk1/p5Np/bnp2ppB/4q3/2p4Q/1P6/P2N1PPP/R2R2K1 w - - 0 18 while also considering the possibility of the king being in check? The resulting moves are Ne8 Ne6 Nh5 Nf5 Bg5 Bf4 Be3 Qxf6 Qh5 Qg5 Qg4 Qf4 Qe4 Qd4 Qxc4+ Qh3 Qg3 Ne4 Nxc4 Nf3 Nf1 Nb1 Kh1 Kf1 Rf1 Re1 Rdc1 Rdb1 Rac1 Rab1 bxc4 b4 h3 g3 f3 a3 g4 f4 a4.", "pipeline_key": "function_puzzle_v2.jsonl.zst-1587325"}


def generate_legal_moves_given_fen(text: str):
    """
    Splits the provided text into sentences so that:
      - All sentences except the final one are treated as the input prompt.
      - The final sentence is treated as the response.
    
    Returns:
        tuple: A tuple (prompt_text, response_text), where prompt_text is the concatenation
        of all sentences except the final one and response_text is the last sentence.
    """
    system_prompt = "Generate all legal moves in SAN format for the given FEN board state."
    
    text = text.strip()
    
    # Use regex to find the FEN substring.
    fen_pattern = r"([rnbqkpRNBQKP1-8/]+\s+[wb]\s+[-KQkq]+\s+(?:[a-h][36]|-)\s+\d+\s+\d+)"
    fen_match = re.search(fen_pattern, text)
    if fen_match:
        # Reconstruct the FEN using all captured groups.
        fen_str = fen_match.group(1).strip()
    else:
        raise ValueError("No valid FEN found in the provided text.")
    
    # Create the user prompt using only the current FEN board state.
    user_prompt = f"Current FEN board state is {fen_str}."
    
    # Split the text into sentences by looking for punctuation that ends a sentence.
    sentences = re.split(r'(?<=[.?!])\s+', text)
    if len(sentences) < 2:
        raise ValueError("Input must contain at least two sentences.")
    
    # We assume the final sentence is the assistant's prompt.
    assistant_prompt = sentences[-1].strip()
    
    return system_prompt, user_prompt, assistant_prompt

# Run the function with the example text
system_prompt, user_prompt, assistant_prompt = generate_legal_moves_given_fen(example['text'])

print("System Prompt:")
print(system_prompt)
print("User Prompt:")
print(user_prompt)
print("Assistant Prompt:")
print(assistant_prompt)

System Prompt:
Generate all legal moves in SAN format for the given FEN board state.
User Prompt:
Current FEN board state is 5r1k/p1pq2pp/1p1b1n2/3b4/3Pp3/1P3PP1/PBQ3BP/2R2RK1 w - - 0 21.
Assistant Prompt:
The legal moves in SAN format are: Bh3 Bh1 Qxc7 Qc6 Qc5 Qxe4 Qc4 Qd3 Qc3 Qf2 Qe2 Qd2 Qd1 Qb1 Bc3 Ba3 Ba1 Kf2 Kh1 Rf2 Rfe1 Rfd1 Rce1 Rcd1 Rb1 Ra1 fxe4 g4 f4 b4 h3 a3 h4 a4.


### Full Chess modeling dataset preprocessing

In [12]:
import json

input_filename = "chess_modeling-data.jsonl-00000-of-00008"
output_filename= "chess_modeling_instruct.json"

alpaca_data_list = []

with open(input_filename, 'rb') as fin:
    for line in fin:
        if not line.strip():
            continue # Skip empty lines
        data = json.loads(line)

        system_prompt = data['metadata']['description']

        input_response = data['text'] # data['text'] contains both user input and response string

        # Split based in first question mark
        qmark_idx = input_response.find('?')
        if qmark_idx != -1:
            human_input = input_response[:qmark_idx+1].strip()
            answer = input_response[qmark_idx+1:].strip()
        else:
            human_input = ""
            answer = ""
        
        alpaca_entry = {
            # 'instruction': system_prompt, # TODO: Need to figure out how instruction and input pair works
            'input': human_input,
            'output': answer,
            'system': system_prompt,
        }

        alpaca_data_list.append(alpaca_entry)

# Write the entire list of objects to a JSON file
with open(output_filename, 'w') as fout:
    json.dump(alpaca_data_list, fout, indent=2)

print("Preprocessing complete. Data saved to:", output_filename)

Preprocessing complete. Data saved to: chess_modeling_instruct.json


In [7]:
line.strip()

b'{"metadata": {"description": "Generate UCI move given SAN-format move and FEN"}, "text": "Taking the FEN configuration q3k1r1/p3np2/1p1Q2pp/8/8/8/6PP/3R2K1 w - - 0 31 and the move expressed in SAN notation Qd7+, could you generate the related UCI move? The resultant move is d6d7.", "pipeline_key": "function_puzzle_v2.jsonl.zst-278847"}'