In [1]:
import chess
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
import tqdm
from chess import Board


In [2]:
board = Board()
moves = set()
turn = chess.WHITE
for piece_type in chess.PIECE_TYPES:
    if piece_type != chess.PAWN:
        piece = chess.Piece(piece_type, turn)
        for i in range(64):
            board.set_piece_map({i: piece})
            board.turn = turn
            for move in board.generate_pseudo_legal_moves():
                moves.add(board.lan(move))

In [3]:
len(moves)

3668

In [4]:
board = Board()
pawn_moves = set()
turn = chess.WHITE
turn2 = chess.BLACK

piece = chess.Piece(chess.PAWN, turn)
for i in range(8,56, 8):
    board.set_piece_map(dict((j, piece) for j in range(i, i+8)))
    board.turn = turn
    #display(board)
    for move in board.generate_pseudo_legal_moves():
        pawn_moves.add(board.lan(move))

piece2 = chess.Piece(chess.ROOK, turn2)
for i in range(8,56, 8):
    board.set_piece_map(dict((j, piece) for j in range(i, i+8)) | dict((j, piece2) for j in range(i+8, i+16)))
    board.turn = turn
    #display(board)
    for move in board.generate_pseudo_legal_moves():
        pawn_moves.add(board.lan(move))

piece = chess.Piece(chess.PAWN, turn2)
for i in range(8,56, 8):
    board.set_piece_map(dict((j, piece) for j in range(i, i+8)))
    board.turn = turn2
    #display(board)
    for move in board.generate_pseudo_legal_moves():
        pawn_moves.add(board.lan(move))

piece2 = chess.Piece(chess.ROOK, turn)
for i in range(0,48, 8):
    board.set_piece_map(dict((j, piece2) for j in range(i, i+8)) | dict((j, piece) for j in range(i+8, i+16)))
    board.turn = turn2
    #display(board)
    for move in board.generate_pseudo_legal_moves():
        pawn_moves.add(board.lan(move))

In [5]:
len(pawn_moves)

412

In [6]:
all_moves = list()
takes = ['-', 'x']
specials = ['', '+', "#"]
for move in moves:
    for take in takes:
        for special in specials:
            all_moves.append(move.replace("-", take) + special)

for move in pawn_moves:
    for special in specials:
        all_moves.append(move + special)

castles = ['O-O', 'O-O+', 'O-O#', 'O-O-O', 'O-O-O+', 'O-O-O#']
all_moves += castles

In [7]:
all_moves

['Nc8-d6',
 'Nc8-d6+',
 'Nc8-d6#',
 'Nc8xd6',
 'Nc8xd6+',
 'Nc8xd6#',
 'Ra8-b8',
 'Ra8-b8+',
 'Ra8-b8#',
 'Ra8xb8',
 'Ra8xb8+',
 'Ra8xb8#',
 'Rh1-h8',
 'Rh1-h8+',
 'Rh1-h8#',
 'Rh1xh8',
 'Rh1xh8+',
 'Rh1xh8#',
 'Re3-e7',
 'Re3-e7+',
 'Re3-e7#',
 'Re3xe7',
 'Re3xe7+',
 'Re3xe7#',
 'Rb7-b2',
 'Rb7-b2+',
 'Rb7-b2#',
 'Rb7xb2',
 'Rb7xb2+',
 'Rb7xb2#',
 'Ng7-f5',
 'Ng7-f5+',
 'Ng7-f5#',
 'Ng7xf5',
 'Ng7xf5+',
 'Ng7xf5#',
 'Qb2-b1',
 'Qb2-b1+',
 'Qb2-b1#',
 'Qb2xb1',
 'Qb2xb1+',
 'Qb2xb1#',
 'Qc5-d5',
 'Qc5-d5+',
 'Qc5-d5#',
 'Qc5xd5',
 'Qc5xd5+',
 'Qc5xd5#',
 'Qb5-b2',
 'Qb5-b2+',
 'Qb5-b2#',
 'Qb5xb2',
 'Qb5xb2+',
 'Qb5xb2#',
 'Qb7-e7',
 'Qb7-e7+',
 'Qb7-e7#',
 'Qb7xe7',
 'Qb7xe7+',
 'Qb7xe7#',
 'Qh7-h5',
 'Qh7-h5+',
 'Qh7-h5#',
 'Qh7xh5',
 'Qh7xh5+',
 'Qh7xh5#',
 'Ke4-f4',
 'Ke4-f4+',
 'Ke4-f4#',
 'Ke4xf4',
 'Ke4xf4+',
 'Ke4xf4#',
 'Qb1-h7',
 'Qb1-h7+',
 'Qb1-h7#',
 'Qb1xh7',
 'Qb1xh7+',
 'Qb1xh7#',
 'Kb5-c6',
 'Kb5-c6+',
 'Kb5-c6#',
 'Kb5xc6',
 'Kb5xc6+',
 'Kb5xc6#',
 'Kg4-f4',
 'Kg4-f4+

In [8]:
len(all_moves)

23250

In [9]:
with open("moves.txt", 'w') as movef:
    for move in all_moves:
        movef.write(move + "\n")

In [10]:
tokens =  all_moves
special_tokens = ["<w>", "<b>", "<eos>"]

In [15]:
from transformers import Tokenizer

In [12]:
tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))
tokenizer.add_special_tokens(tokens)

23250

In [13]:
vocab = tokenizer.get_vocab()
vocab

{'Rb2xb6#': 5201,
 'Qa5xb6+': 4192,
 'Na3xb1': 13407,
 'Rh8-h1': 14784,
 'Kh5-h6#': 13688,
 'Rc4-f4#': 11294,
 'Rg1-g5+': 14851,
 'c7-c8=B': 22464,
 'Qg5xb5': 21069,
 'Ka7-b6+': 9733,
 'Kb4-a3': 11094,
 'Qc4xe2#': 17723,
 'Re6-e5': 7242,
 'Qa7-a5#': 3920,
 'Bg6-d3+': 8401,
 'Qf7-a7': 10788,
 'c2xb1=R+': 22951,
 'Qd3xd4': 16521,
 'Qg7xa7#': 5015,
 'Ra8xg8+': 12976,
 'g4xh5#': 23117,
 'Rd6-d5+': 12871,
 'Qd1xd5+': 17626,
 'Qh3-e3': 9912,
 'Rb8xf8#': 9197,
 'Qd2xh2+': 15586,
 'Qd7xd2': 4095,
 'Ra5-a3': 888,
 'Qc6-e8+': 12007,
 'e2-e1=N#': 23060,
 'Qg6-g5': 20712,
 'Qa7-h7+': 6343,
 'Bf1-d3+': 8395,
 'f3-f2+': 22567,
 'Qe4-e1': 6690,
 'Qb4-b1#': 9674,
 'Bh1xf3#': 20045,
 'Ra4-a3': 10266,
 'Kh3-g3#': 16004,
 'Nf2-h3+': 19975,
 'Ne3xf1': 20829,
 'Qh1xa1': 4479,
 'Qf7xe7': 1395,
 'Qb1xb8#': 10601,
 'Rg8xa8+': 14212,
 'Qc2-d1': 17544,
 'Kf8xe8+': 2344,
 'Qa1-f6': 10962,
 'Qh1-a1#': 4478,
 'Rf7-g7+': 3721,
 'Qh5xe8+': 15052,
 'Qg6-f6#': 21110,
 'Nd2xe4+': 1138,
 'Rg1xf1': 16683,
 'Qd6-d2': 5346

In [14]:
import json

with open("vocab.json", 'w') as outfile:
    json.dump(vocab, outfile)

In [38]:
tokenizer = Tokenizer(models.WordLevel(vocab=vocab, unk_token="<unk>"))

TypeError: Tokenizer.__new__() got an unexpected keyword argument 'special_tokens'

In [23]:
tokenizer.add_special_tokens(special_tokens)

3

In [25]:
tokenizer.save("model/tokenizer.model")

In [35]:
tokenizer = tokenizer.from_file("model/tokenizer.model")

In [36]:
tokenizer.encode("O-O<eos>").ids

[23244, 23252]