<a href="https://colab.research.google.com/github/jorahn/chess-to-text/blob/main/chess_language_transformation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install chess datasets



In [2]:
import random
import chess
from datasets import load_dataset

In [3]:
dataset = load_dataset("laion/strategic_game_chess",
    data_files="chess_game_0001.parquet") # load just one 528mb file with ~2m games initially

dataset

DatasetDict({
    train: Dataset({
        features: ['Moves', 'Termination', 'Result'],
        num_rows: 1999939
    })
})

In [4]:
example = dataset["train"][0]
example

{'Moves': ['d2d4',
  'f7f5',
  'g2g3',
  'g7g6',
  'f1g2',
  'f8g7',
  'g1f3',
  'd7d6',
  'c2c3',
  'e7e6',
  'a2a4',
  'g8f6',
  'd1c2',
  'd8e7',
  'b1d2',
  'e6e5',
  'd4e5',
  'd6e5',
  'e2e4',
  'b8c6',
  'e1g1',
  'f5e4',
  'd2e4',
  'c8f5',
  'f3d2',
  'e8c8',
  'b2b4',
  'g7h6',
  'f1e1',
  'h6d2',
  'c1d2',
  'f6e4',
  'g2e4',
  'e7e6',
  'd2g5',
  'd8d6',
  'a1d1',
  'd6d1',
  'e1d1',
  'h7h6',
  'g5e3',
  'a7a5',
  'c2b1',
  'h6h5',
  'b4b5',
  'c6e7',
  'e3g5',
  'h8e8',
  'h2h4',
  'e6c4',
  'd1e1',
  'f5e4',
  'e1e4',
  'c4e6',
  'g5f4',
  'e6f5',
  'f4e5',
  'e7d5',
  'b1e1',
  'd5b6',
  'f2f4',
  'b6d7',
  'e1e2',
  'b7b6',
  'e4e3',
  'e8e7',
  'e3e4',
  'd7c5',
  'e4d4',
  'e7d7',
  'g1g2',
  'c8d8',
  'g2h2',
  'd8c8',
  'e2g2',
  'c8b8',
  'g2a2',
  'b8a7',
  'a2g2',
  'a7b8',
  'g2e2',
  'b8c8',
  'e2f3',
  'c8b8',
  'f3d1',
  'b8c8',
  'd1e2',
  'c8b8',
  'e2d1',
  'b8b7',
  'd4d7',
  'c5d7',
  'e5d4',
  'd7c5',
  'h2g2',
  'f5d5',
  'g2g1',
  'd5f5',
  'd4c5',
 

# Transformations
Source data is in UCI-PGN (https://en.wikipedia.org/wiki/Universal_Chess_Interface) format with ~200 engine moves per game. Includes result and reason for termination (like repetition, insufficient material, checkmate).  
Transformation targets:
- UCI-PGN Moves -> SAN-PGN Moves
- UCI-PGN Moves -> LAN-PGN Moves
- UCI-PGN Moves -> FEN Position

In [5]:
def uci_to_san(uci_moves, max_half_moves=None):
    # https://en.wikipedia.org/wiki/Algebraic_notation_(chess)#Formatting
    board = chess.Board()
    san_moves = []
    if max_half_moves: uci_moves = uci_moves[:max_half_moves]
    for move_uci in uci_moves:
        move = chess.Move.from_uci(move_uci)
        san_moves.append(board.san(move))
        board.push(move)
    return san_moves

def uci_to_lan(uci_moves, max_half_moves=None):
    # https://en.wikipedia.org/wiki/Algebraic_notation_(chess)#Long_algebraic_notation
    board = chess.Board()
    lan_moves = []
    if max_half_moves: uci_moves = uci_moves[:max_half_moves]
    for move_uci in uci_moves:
        move = chess.Move.from_uci(move_uci)
        lan_moves.append(board.lan(move))
        board.push(move)
    return lan_moves

def uci_to_fen(uci_moves, max_half_moves=None):
    # https://en.wikipedia.org/wiki/Forsyth%E2%80%93Edwards_Notation
    board = chess.Board()
    if max_half_moves: uci_moves = uci_moves[:max_half_moves]
    for move in uci_moves:
        board.push_uci(move)
    return board.fen()

def uci_to_bitboard(uci_moves, max_half_moves=None):
    board = chess.Board()
    if max_half_moves: uci_moves = uci_moves[:max_half_moves]
    for move in uci_moves:
        board.push_uci(move)
    return str(board)

In [6]:
uci_to_san(example["Moves"], max_half_moves=10)

['d4', 'f5', 'g3', 'g6', 'Bg2', 'Bg7', 'Nf3', 'd6', 'c3', 'e6']

In [7]:
uci_to_lan(example["Moves"], max_half_moves=15)

['d2-d4',
 'f7-f5',
 'g2-g3',
 'g7-g6',
 'Bf1-g2',
 'Bf8-g7',
 'Ng1-f3',
 'd7-d6',
 'c2-c3',
 'e7-e6',
 'a2-a4',
 'Ng8-f6',
 'Qd1-c2',
 'Qd8-e7',
 'Nb1-d2']

In [8]:
uci_to_fen(example["Moves"], max_half_moves=10), uci_to_fen(example["Moves"])

('rnbqk1nr/ppp3bp/3pp1p1/5p2/3P4/2P2NP1/PP2PPBP/RNBQK2R w KQkq - 0 6',
 '8/k1p5/1p6/7Q/7P/1KP3P1/8/1q6 w - - 35 100')

In [9]:
uci_to_bitboard(example["Moves"])

'. . . . . . . .\nk . p . . . . .\n. p . . . . . .\n. . . . . . . Q\n. . . . . . . P\n. K P . . . P .\n. . . . . . . .\n. q . . . . . .'

# Lanugage Templates
Insert transformed moves into templates for natural language.

In [10]:
templates_moves = [
    "{previous_moves} {next_move}",
    "{previous_moves} and then {next_move}.",
    "{previous_moves} followed by {next_move}.",
    "After '{previous_moves}' play {next_move}.",
    "Continue '{previous_moves}' with {next_move}",
    "In a game of chess, after '{previous_moves}' play {next_move}.",
    "In the game of chess, a good move after '{previous_moves}' is {next_move}.",
    "In chess, after the sequence '{previous_moves}' a good move is {next_move}.",
    "In chess after the moves '{previous_moves}', {next_move} is a good move.",
]

templates_fen_move = [
    "In the position '{position}' play {next_move}.",
    "Given the chess position '{position}', a good next move is {next_move}",
    "Starting from the FEN '{position}', a good next move is {next_move}",
]

templates_bitboard_move = [
    "In the position '{position}' play {next_move}.",
    "Given the chess position '{position}', a good next move is {next_move}",
    "Starting from the bitboard position '{position}', a good move is {next_move}",
]

templates_moves_fen = [
    "After '{previous_moves}' the position is '{position}'.",
    "Generate the FEN representation given the PGN of chess game: '{previous_moves}'. The FEN is '{position}'.",
    "Can you produce the FEN code that corresponds to the provided PGN for the chess game: '{previous_moves}'? The FEN notation obtained is '{position}'.",
    "I would appreciate it if you could generate the Forsyth–Edwards Notation (FEN) that corresponds to the given PGN for the chess game: '{previous_moves}'. The FEN code generated is '{position}'.",
    "Please generate the FEN notation for the chess game using the provided PGN: '{previous_moves}'. The obtained FEN representation is '{position}'.",
    "Would you mind generating the FEN code that corresponds to the given PGN for the chess game? The PGN is '{previous_moves}'. The FEN notation generated is '{position}'.",
    "I request you to generate the FEN notation for the given PGN of the chess game: '{previous_moves}'. The FEN code obtained is '{position}'.",
    "Kindly generate the FEN representation that corresponds to the provided PGN of the chess game '{previous_moves}'. The obtained FEN notation is '{position}'.",
    "Could you generate the FEN code for the chess game using the provided PGN: '{previous_moves}'? The FEN notation generated is '{position}'.",
    "It would be great if you could generate the Forsyth–Edwards Notation (FEN) for the chess game using the given PGN '{previous_moves}'. The FEN representation obtained is '{position}'.",
    "May I request you to produce the FEN notation for the provided PGN of the chess game: '{previous_moves}'? The FEN code obtained is '{position}'.",
    "Please generate the FEN code for the chess game based on the provided PGN: '{previous_moves}'. The FEN notation obtained is '{position}'.",
]

templates_moves_bitboard = [
    "After '{previous_moves}' the bitboard position is '{position}'.",
    "Generate the bitboard representation given the PGN of chess game: '{previous_moves}'. The bitboard is '{position}'.",
    "Can you produce the bitboard code that corresponds to the provided PGN for the chess game: '{previous_moves}'? The bitboard obtained is '{position}'.",
    "I would appreciate it if you could generate the bitboard that corresponds to the given PGN for the chess game: '{previous_moves}'. The bitboard code generated is '{position}'.",
    "Please generate the bitboard for the chess game using the provided PGN: '{previous_moves}'. The obtained bitboard representation is '{position}'.",
    "Would you mind generating the bitboard code that corresponds to the given PGN for the chess game? The PGN is '{previous_moves}'. The bitboard generated is '{position}'.",
    "I request you to generate the bitboard for the given PGN of the chess game: '{previous_moves}'. The bitboard obtained is '{position}'.",
    "Kindly generate the bitboard representation that corresponds to the provided PGN of the chess game '{previous_moves}'. The obtained bitboard notation is '{position}'.",
    "Could you generate the bitboard code for the chess game using the provided PGN: '{previous_moves}'? The bitboard generated is '{position}'.",
    "It would be great if you could generate the bitboard for the chess game using the given PGN '{previous_moves}'. The bitboard representation obtained is '{position}'.",
    "May I request you to produce the bitboard for the provided PGN of the chess game: '{previous_moves}'? The bitboard code obtained is '{position}'.",
    "Please generate the bitboard code for the chess game based on the provided PGN: '{previous_moves}'. The bitboard obtained is '{position}'.",
]

templates_moves_result = [
    "After '{moves}' the result is {result}.",
    "A game of chess with the moves '{moves}' ends with the result {result}.",
]

templates_position_move_result = [
    "Given the chess position '{position}', after the move {next_move} the result is {result}.",
    "Starting from the position '{position}' the move {next_move} is played. The game ends with the result {result}.",
]

templates_moves_result_termination = [
    "After '{moves}' the game ends due to {termination} with the result {result}.",
    "A game of chess with the moves '{moves}' ends with the result {result} due to {termination}.",
]

templates_position_move_result_termination = [
    "Given the chess position '{position}', after the move {next_move} the result is {result} due to {termination}.",
    "Starting from the position '{position}' the move {next_move} is played. The game ends due to {termination} with the result {result}.",
]


# Load Data, apply random Transformations & Templates

In [11]:
choices = [
    "uci_nextmove",
    "san_nextmove",
    "lan_nextmove",
    "lan_nextmove_piece",
    "fen_nextmove",
    "fen_position",
    "bitboard_nextmove",
    "bitboard_position",
    "uci_result",
    "san_result",
    "lan_result",
    "uci_result_termination",
    "san_result_termination",
    "lan_result_termination",
    "fen_move_result",
    "bitboard_move_result",
    "fen_move_result_termination",
    "bitboard_move_result_termination",
]

In [12]:
def transform_example(example, fixed_tfm=None):
    tfm = fixed_tfm if fixed_tfm else random.choice(choices)

    if tfm == "uci_nextmove":
        half_moves = random.randint(1, len(example["Moves"])-2)
        previous_moves = " ".join(example["Moves"][:half_moves])
        next_move = example["Moves"][half_moves]

        template = random.choice(templates_moves)
        result = template.format(previous_moves=previous_moves, next_move=next_move)

    elif tfm == "san_nextmove":
        half_moves = random.randint(1, len(example["Moves"])-2)
        moves = uci_to_san(example["Moves"], max_half_moves=half_moves)
        previous_moves = " ".join(moves[:-1])
        next_move = moves[-1]

        template = random.choice(templates_moves)
        result = template.format(previous_moves=previous_moves, next_move=next_move)

    elif tfm == "lan_nextmove":
        half_moves = random.randint(1, len(example["Moves"])-2)
        moves = uci_to_lan(example["Moves"], max_half_moves=half_moves)
        previous_moves = " ".join(moves[:-1])
        next_move = moves[-1]

        template = random.choice(templates_moves)
        result = template.format(previous_moves=previous_moves, next_move=next_move)

    elif tfm == "lan_nextmove_piece":
        half_moves = random.randint(1, len(example["Moves"])-2)
        moves = uci_to_lan(example["Moves"], max_half_moves=half_moves)
        previous_moves = " ".join(moves[:-1])
        next_move = moves[-1]
        if next_move.startswith("N"):
            next_move_piece = f"Night {next_move[1:]}"
        elif next_move.startswith("B"):
            next_move_piece = f"Bishop {next_move[1:]}"
        elif next_move.startswith("R"):
            next_move_piece = f"Rook {next_move[1:]}"
        elif next_move.startswith("Q"):
            next_move_piece = f"Queen {next_move[1:]}"
        elif next_move.startswith("K"):
            next_move_piece = f"King {next_move[1:]}"
        else:
            next_move_piece = f"Pawn {next_move[1:]}"
        next_move_piece = next_move_piece.replace("-", " to ")
        next_move_piece = next_move_piece.replace("x", " captures ")

        template = random.choice(templates_moves)
        result = template.format(previous_moves=previous_moves, next_move=next_move_piece)

    elif tfm == "fen_nextmove":
        half_moves = random.randint(1, len(example["Moves"])-2)
        position = uci_to_fen(example["Moves"], max_half_moves=half_moves)
        next_move = example["Moves"][-1]

        template = random.choice(templates_fen_move)
        result = template.format(position=position, next_move=next_move)

    elif tfm == "fen_position":
        half_moves = random.randint(1, len(example["Moves"])-2)
        position = uci_to_fen(example["Moves"], max_half_moves=half_moves)
        previous_moves = " ".join(example["Moves"][:half_moves])

        template = random.choice(templates_moves_fen)
        result = template.format(position=position, previous_moves=previous_moves)

    elif tfm == "bitboard_nextmove":
        half_moves = random.randint(1, len(example["Moves"])-2)
        position = uci_to_bitboard(example["Moves"], max_half_moves=half_moves)
        next_move = example["Moves"][-1]

        template = random.choice(templates_bitboard_move)
        result = template.format(position=position, next_move=next_move)

    elif tfm == "bitboard_position":
        half_moves = random.randint(1, len(example["Moves"])-2)
        position = uci_to_bitboard(example["Moves"], max_half_moves=half_moves)
        previous_moves = " ".join(example["Moves"][:half_moves])

        template = random.choice(templates_moves_bitboard)
        result = template.format(position=position, previous_moves=previous_moves)

    elif tfm == "uci_result":
        moves = " ".join(example["Moves"])
        result = example["Result"].lower()

        template = random.choice(templates_moves_result)
        result = template.format(moves=moves, result=result)

    elif tfm == "san_result":
        moves = " ".join(uci_to_san(example["Moves"]))
        result = example["Result"].lower()

        template = random.choice(templates_moves_result)
        result = template.format(moves=moves, result=result)

    elif tfm == "lan_result":
        moves = " ".join(uci_to_lan(example["Moves"]))
        result = example["Result"].lower()

        template = random.choice(templates_moves_result)
        result = template.format(moves=moves, result=result)

    elif tfm == "uci_result_termination":
        moves = " ".join(example["Moves"])
        result = example["Result"].lower()
        termination = example["Termination"].lower().replace("_", " ")

        template = random.choice(templates_moves_result_termination)
        result = template.format(moves=moves, result=result, termination=termination)

    elif tfm == "san_result_termination":
        moves = " ".join(uci_to_san(example["Moves"]))
        result = example["Result"].lower()
        termination = example["Termination"].lower().replace("_", " ")

        template = random.choice(templates_moves_result_termination)
        result = template.format(moves=moves, result=result, termination=termination)

    elif tfm == "lan_result_termination":
        moves = " ".join(uci_to_lan(example["Moves"]))
        result = example["Result"].lower()
        termination = example["Termination"].lower().replace("_", " ")

        template = random.choice(templates_moves_result_termination)
        result = template.format(moves=moves, result=result, termination=termination)

    elif tfm == "fen_move_result":
        half_moves = len(example["Moves"]) - 1
        position = uci_to_fen(example["Moves"], max_half_moves=half_moves)
        next_move = example["Moves"][-1]
        result = example["Result"].lower()

        template = random.choice(templates_position_move_result)
        result = template.format(position=position, next_move=next_move, result=result)

    elif tfm == "bitboard_move_result":
        half_moves = len(example["Moves"]) - 1
        position = uci_to_bitboard(example["Moves"], max_half_moves=half_moves)
        next_move = example["Moves"][-1]
        result = example["Result"].lower()

        template = random.choice(templates_position_move_result)
        result = template.format(position=position, next_move=next_move, result=result)

    elif tfm == "fen_move_result_termination":
        half_moves = len(example["Moves"]) - 1
        position = uci_to_fen(example["Moves"], max_half_moves=half_moves)
        next_move = example["Moves"][-1]
        result = example["Result"].lower()
        termination = example["Termination"].lower().replace("_", " ")

        template = random.choice(templates_position_move_result_termination)
        result = template.format(position=position, next_move=next_move, result=result, termination=termination)


    elif tfm == "bitboard_move_result_termination":
        half_moves = len(example["Moves"]) - 1
        position = uci_to_bitboard(example["Moves"], max_half_moves=half_moves)
        next_move = example["Moves"][-1]
        result = example["Result"].lower()
        termination = example["Termination"].lower().replace("_", " ")

        template = random.choice(templates_position_move_result_termination)
        result = template.format(position=position, next_move=next_move, result=result, termination=termination)

    else:
        raise NotImplementedError(f"Transformation {tfm} is not implemented.")

    return result


In [13]:
transform_example(example, fixed_tfm="bitboard_move_result_termination")

"Starting from the position '. . . . . . . .\nk . p . . . . .\n. p . . . . . .\n. . . . . . . Q\n. . . . q . . P\n. K P . . . P .\n. . . . . . . .\n. . . . . . . .' the move e4b1 is played. The game ends due to fivefold repetition with the result 1/2-1/2."

In [14]:
for i in range(50):
    print(transform_example(dataset["train"][i]))
    print("---")

In chess after the moves 'd2-d4 f7-f5 g2-g3 g7-g6 Bf1-g2 Bf8-g7 Ng1-f3 d7-d6 c2-c3 e7-e6 a2-a4 Ng8-f6 Qd1-c2 Qd8-e7 Nb1-d2 e6-e5 d4xe5 d6xe5 e2-e4 Nb8-c6 O-O f5xe4', Nd2xe4 is a good move.
---
Starting from the FEN '6k1/1p3q2/8/p1P2b2/3r1B1p/2P2Q1P/P1P3PK/8 w - - 0 37', a good next move is g5g3
---
After 'd2d4 g8f6 c2c4 e7e5 d4e5 f6g4 c1f4 b8c6 g1f3 f8b4 b1d2 d7d6 e5d6 b4d6 f4d6 d8d6 e2e3 c8f5 f1e2 g4e5 e1g1 e8c8 d1a4 d6g6 f3h4 g6e6 h4f5 d8d2 f5g3 h8d8 b2b4 h7h5 e2h5 e6c4 a2a3 g7g6 a1c1 c4d3 h5d1 e5c4 a4b3 b7b5 a3a4 a7a6 b3d3 d8d3 d1g4 f7f5 a4b5 a6b5 g3f5 g6f5 g4f5 c8b7 f5d3 d2d3 g2g4 c6b4 f1d1 c4e5 g4g5 e5f3 g1f1 f3g5 h2h4 g5f7 d1d3 b4d3 c1b1 b7c6 f1e2 d3c5 f2f3 c5d7 h4h5 d7f6 b1h1 c6d6 h5h6 f6h7 e3e4 b5b4 f3f4 b4b3 e2d3 d6e6 h1g1 h7f6 e4e5 f6d5 d3e4 d5c3 e4d3 c3d5 h6h7 d5f4 d3d2 f7h8 g1g7 c7c5 g7c7 e6e5 c7c5 e5f6 c5c8 f6g7 c8h8 b3b2 d2c2 f4d5 c2b2 d5f6 h8g8 g7h7 g8g7 h7h6 g7g6 h6g6' the game ends due to insufficient material with the result 1/2-1/2.
---
In a game of chess, after 'c2c