In [1]:
import os

import chess
import tqdm
from chess import Board
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)


In [2]:
# Enumerate the long-algebraic (LAN) string of every pseudo-legal move that a
# lone non-pawn piece can make from each of the 64 squares.
board = Board()
moves = []
turn = chess.WHITE
non_pawn_types = [kind for kind in chess.PIECE_TYPES if kind != chess.PAWN]
for piece_type in non_pawn_types:
    piece = chess.Piece(piece_type, turn)
    for square in chess.SQUARES:
        # The board holds nothing but this single piece.
        board.set_piece_map({square: piece})
        board.turn = turn
        moves.extend(
            board.lan(candidate)
            for candidate in board.generate_pseudo_legal_moves()
        )

In [3]:
# Number of LAN strings collected for the non-pawn pieces.
len(moves)

3668

In [4]:
def _rank_of(piece, first_square):
    """Map the eight consecutive squares starting at ``first_square`` to ``piece``."""
    return {square: piece for square in range(first_square, first_square + 8)}


def _lan_moves(board, piece_map, side_to_move):
    """Set ``piece_map`` on ``board`` and return the LAN string of every
    pseudo-legal move available to ``side_to_move``."""
    board.set_piece_map(piece_map)
    # Re-assert the side to move for every position before generating moves.
    board.turn = side_to_move
    return [board.lan(move) for move in board.generate_pseudo_legal_moves()]


board = Board()
turn = chess.WHITE
turn2 = chess.BLACK

# One explicit name per piece instead of rebinding `piece` / `piece2`.
white_pawn = chess.Piece(chess.PAWN, turn)
black_pawn = chess.Piece(chess.PAWN, turn2)
white_rook = chess.Piece(chess.ROOK, turn)
black_rook = chess.Piece(chess.ROOK, turn2)

pawn_moves = []

# White pawns filling one rank at a time (squares 8..55): non-capturing moves.
for first_square in range(8, 56, 8):
    pawn_moves += _lan_moves(board, _rank_of(white_pawn, first_square), turn)

# Same white pawn ranks with a full rank of black rooks directly in front,
# which gives every pawn something to capture.
for first_square in range(8, 56, 8):
    position = _rank_of(white_pawn, first_square) | _rank_of(black_rook, first_square + 8)
    pawn_moves += _lan_moves(board, position, turn)

# Black pawns filling one rank at a time (squares 8..55): non-capturing moves.
for first_square in range(8, 56, 8):
    pawn_moves += _lan_moves(board, _rank_of(black_pawn, first_square), turn2)

# Black pawn ranks with a full rank of white rooks directly below them
# (rooks start at squares 0..47, pawns sit one rank higher).
for first_square in range(0, 48, 8):
    position = _rank_of(white_rook, first_square) | _rank_of(black_pawn, first_square + 8)
    pawn_moves += _lan_moves(board, position, turn2)

In [5]:
# Number of LAN strings collected for pawns of both colours.
len(pawn_moves)

412

In [6]:
# Expand the raw LAN strings into the full vocabulary:
#   * piece moves get both a quiet ("-") and a capture ("x") form,
#   * every move gets a plain, check ("+") and mate ("#") variant,
#   * castling strings are appended verbatim at the end.
takes = ['-', 'x']
specials = ['', '+', "#"]

piece_variants = [
    base.replace("-", separator) + suffix
    for base in moves
    for separator in takes
    for suffix in specials
]

# Pawn strings keep the separator they were generated with.
pawn_variants = [base + suffix for base in pawn_moves for suffix in specials]

castles = ['O-O', 'O-O+', 'O-O#', 'O-O-O', 'O-O-O+', 'O-O-O#']

all_moves = piece_variants + pawn_variants + castles

In [7]:
# Display the assembled vocabulary (the rendered output below is cut off).
all_moves

['Na1-b3',
 'Na1-b3+',
 'Na1-b3#',
 'Na1xb3',
 'Na1xb3+',
 'Na1xb3#',
 'Na1-c2',
 'Na1-c2+',
 'Na1-c2#',
 'Na1xc2',
 'Na1xc2+',
 'Na1xc2#',
 'Nb1-c3',
 'Nb1-c3+',
 'Nb1-c3#',
 'Nb1xc3',
 'Nb1xc3+',
 'Nb1xc3#',
 'Nb1-a3',
 'Nb1-a3+',
 'Nb1-a3#',
 'Nb1xa3',
 'Nb1xa3+',
 'Nb1xa3#',
 'Nb1-d2',
 'Nb1-d2+',
 'Nb1-d2#',
 'Nb1xd2',
 'Nb1xd2+',
 'Nb1xd2#',
 'Nc1-d3',
 'Nc1-d3+',
 'Nc1-d3#',
 'Nc1xd3',
 'Nc1xd3+',
 'Nc1xd3#',
 'Nc1-b3',
 'Nc1-b3+',
 'Nc1-b3#',
 'Nc1xb3',
 'Nc1xb3+',
 'Nc1xb3#',
 'Nc1-e2',
 'Nc1-e2+',
 'Nc1-e2#',
 'Nc1xe2',
 'Nc1xe2+',
 'Nc1xe2#',
 'Nc1-a2',
 'Nc1-a2+',
 'Nc1-a2#',
 'Nc1xa2',
 'Nc1xa2+',
 'Nc1xa2#',
 'Nd1-e3',
 'Nd1-e3+',
 'Nd1-e3#',
 'Nd1xe3',
 'Nd1xe3+',
 'Nd1xe3#',
 'Nd1-c3',
 'Nd1-c3+',
 'Nd1-c3#',
 'Nd1xc3',
 'Nd1xc3+',
 'Nd1xc3#',
 'Nd1-f2',
 'Nd1-f2+',
 'Nd1-f2#',
 'Nd1xf2',
 'Nd1xf2+',
 'Nd1xf2#',
 'Nd1-b2',
 'Nd1-b2+',
 'Nd1-b2#',
 'Nd1xb2',
 'Nd1xb2+',
 'Nd1xb2#',
 'Ne1-f3',
 'Ne1-f3+',
 'Ne1-f3#',
 'Ne1xf3',
 'Ne1xf3+',
 'Ne1xf3#',
 'Ne1-d3',
 'Ne1-d3+

In [8]:
# Total number of move strings, before any special tokens are added.
len(all_moves)

23250

In [9]:
# Dump the vocabulary to disk, one move string per line.
with open("moves.txt", 'w') as out_file:
    out_file.writelines(entry + "\n" for entry in all_moves)

In [10]:
# Every generated move string becomes one vocabulary entry (this is an alias
# of `all_moves`, not a copy).
tokens =  all_moves
# Control tokens registered separately from the move strings.
# NOTE(review): "<w>" / "<b>" presumably mark the side to move and "<eos>" the
# end of a sequence — confirm against the code that consumes this tokenizer.
special_tokens = ["<w>", "<b>", "<eos>"]

In [11]:
# import json

# with open("vocab.json", 'w') as outfile:
#     json.dump(vocab, outfile)

In [18]:
# Word-level model created without an initial vocabulary; the move strings are
# attached afterwards through the tokenizer's add-token calls.
# NOTE(review): "<unk>" is declared as the unknown token, but no visible cell
# adds it to the vocabulary — confirm how unknown input should be handled.
tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))

In [19]:
# Special tokens are registered first so they take the lowest ids, followed by
# one id per move string.
tokenizer.add_special_tokens(special_tokens)
num_added = tokenizer.add_tokens(tokens)

# Register "<pad>" last so it receives the next free id, then look that id up
# instead of hard-coding it. With the current vocabulary this is still 23253
# (3 special tokens + 23250 moves), but it now stays correct if the vocabulary
# changes, and the id maps to a real token.
tokenizer.add_special_tokens(["<pad>"])
tokenizer.enable_padding(pad_token="<pad>", pad_id=tokenizer.token_to_id("<pad>"))

# Keep showing how many move tokens were added as the cell output.
num_added

23250

In [20]:
# Make sure the target directory exists so the notebook also runs from a
# fresh checkout, then persist the tokenizer.
os.makedirs("model", exist_ok=True)
tokenizer.save("model/tokenizer.model")

In [21]:
# Reload from disk to verify the saved file is usable. `from_file` is a static
# constructor, so call it on the class rather than through the old instance.
tokenizer = Tokenizer.from_file("model/tokenizer.model")

In [22]:
# Sanity check on the reloaded tokenizer: a castling move immediately followed
# by the "<eos>" special token should encode to exactly two ids.
tokenizer.encode("O-O<eos>").ids

[23247, 2]