In [26]:
import chess
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
import tqdm
from chess import Board


In [27]:
board = Board()
moves = set()
turn = chess.WHITE
for piece_type in chess.PIECE_TYPES:
    if piece_type != chess.PAWN:
        piece = chess.Piece(piece_type, turn)
        for i in range(64):
            board.set_piece_map({i: piece})
            board.turn = turn
            for move in board.generate_pseudo_legal_moves():
                moves.add(board.lan(move))

In [28]:
len(moves)

3668

In [29]:
board = Board()
pawn_moves = set()
turn = chess.WHITE
turn2 = chess.BLACK

piece = chess.Piece(chess.PAWN, turn)
for i in range(8,56, 8):
    board.set_piece_map(dict((j, piece) for j in range(i, i+8)))
    board.turn = turn
    #display(board)
    for move in board.generate_pseudo_legal_moves():
        pawn_moves.add(board.lan(move))

piece2 = chess.Piece(chess.ROOK, turn2)
for i in range(8,56, 8):
    board.set_piece_map(dict((j, piece) for j in range(i, i+8)) | dict((j, piece2) for j in range(i+8, i+16)))
    board.turn = turn
    #display(board)
    for move in board.generate_pseudo_legal_moves():
        pawn_moves.add(board.lan(move))

piece = chess.Piece(chess.PAWN, turn2)
for i in range(8,56, 8):
    board.set_piece_map(dict((j, piece) for j in range(i, i+8)))
    board.turn = turn2
    #display(board)
    for move in board.generate_pseudo_legal_moves():
        pawn_moves.add(board.lan(move))

piece2 = chess.Piece(chess.ROOK, turn)
for i in range(0,48, 8):
    board.set_piece_map(dict((j, piece2) for j in range(i, i+8)) | dict((j, piece) for j in range(i+8, i+16)))
    board.turn = turn2
    #display(board)
    for move in board.generate_pseudo_legal_moves():
        pawn_moves.add(board.lan(move))

In [30]:
len(pawn_moves)

412

In [31]:
all_moves = list()
takes = ['-', 'x']
specials = ['', '+', "#"]
for move in moves:
    for take in takes:
        for special in specials:
            all_moves.append(move.replace("-", take) + special)

for move in pawn_moves:
    for special in specials:
        all_moves.append(move + special)

castles = ['O-O', 'O-O+', 'O-O#', 'O-O-O', 'O-O-O+', 'O-O-O#']
all_moves += castles

In [32]:
all_moves

['Bd2-b4',
 'Bd2-b4+',
 'Bd2-b4#',
 'Bd2xb4',
 'Bd2xb4+',
 'Bd2xb4#',
 'Rg6-g1',
 'Rg6-g1+',
 'Rg6-g1#',
 'Rg6xg1',
 'Rg6xg1+',
 'Rg6xg1#',
 'Qe5-a5',
 'Qe5-a5+',
 'Qe5-a5#',
 'Qe5xa5',
 'Qe5xa5+',
 'Qe5xa5#',
 'Qd7-e6',
 'Qd7-e6+',
 'Qd7-e6#',
 'Qd7xe6',
 'Qd7xe6+',
 'Qd7xe6#',
 'Rd2-d1',
 'Rd2-d1+',
 'Rd2-d1#',
 'Rd2xd1',
 'Rd2xd1+',
 'Rd2xd1#',
 'Bc8-a6',
 'Bc8-a6+',
 'Bc8-a6#',
 'Bc8xa6',
 'Bc8xa6+',
 'Bc8xa6#',
 'Kg2-f2',
 'Kg2-f2+',
 'Kg2-f2#',
 'Kg2xf2',
 'Kg2xf2+',
 'Kg2xf2#',
 'Bc5-a7',
 'Bc5-a7+',
 'Bc5-a7#',
 'Bc5xa7',
 'Bc5xa7+',
 'Bc5xa7#',
 'Qf3-f4',
 'Qf3-f4+',
 'Qf3-f4#',
 'Qf3xf4',
 'Qf3xf4+',
 'Qf3xf4#',
 'Bc4-b3',
 'Bc4-b3+',
 'Bc4-b3#',
 'Bc4xb3',
 'Bc4xb3+',
 'Bc4xb3#',
 'Be7-f8',
 'Be7-f8+',
 'Be7-f8#',
 'Be7xf8',
 'Be7xf8+',
 'Be7xf8#',
 'Kb7-b8',
 'Kb7-b8+',
 'Kb7-b8#',
 'Kb7xb8',
 'Kb7xb8+',
 'Kb7xb8#',
 'Nd1-b2',
 'Nd1-b2+',
 'Nd1-b2#',
 'Nd1xb2',
 'Nd1xb2+',
 'Nd1xb2#',
 'Bf5-c8',
 'Bf5-c8+',
 'Bf5-c8#',
 'Bf5xc8',
 'Bf5xc8+',
 'Bf5xc8#',
 'Kc5-d5',
 'Kc5-d5+

In [33]:
len(all_moves)

23250

In [37]:
with open("moves.txt", 'w') as movef:
    for move in all_moves:
        movef.write(move + "\n")

In [52]:
tokens = ["<w>", "<b>", "<eos>"] + all_moves

In [53]:
tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))

In [54]:
tokenizer.add_special_tokens(tokens)

23253

In [55]:
tokenizer.save("model/tokenizer.model")