In [1]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
import matplotlib.pyplot as plt
from tqdm import tqdm

In [5]:
# Absolute location of the tokenizer file that SliceTokenizer loads when no
# `vocab_file` argument is given.
# NOTE(review): this is a user-specific home-directory path, so the notebook
# only runs unchanged on the original machine -- consider making it configurable.
SLICE_TOKENIZER_PATH = "/home/so87pot/n0w0f/regression-transformer/slice-assets/slice_vocab.txt"

In [6]:

class SliceTokenizer:
    """Small convenience wrapper around a Hugging Face fast tokenizer.

    The wrapped ``PreTrainedTokenizerFast`` is kept on ``self.tokenizer`` so
    the full Hugging Face API stays reachable; this class itself only exposes
    ``tokenize``.
    """

    def __init__(self, vocab_file=SLICE_TOKENIZER_PATH):
        """Load the tokenizer definition and register the special tokens.

        Args:
            vocab_file: Path handed to ``Tokenizer.from_file``.
                NOTE(review): ``Tokenizer.from_file`` reads a serialized
                tokenizer definition; confirm that the ``.txt`` default is
                stored in that format.
        """
        special_tokens = {
            "unk_token": "[UNK]",
            "pad_token": "[PAD]",
            "cls_token": "[CLS]",
            "sep_token": "[SEP]",
            "mask_token": "[MASK]",
        }
        backend = Tokenizer.from_file(vocab_file)
        self.tokenizer = PreTrainedTokenizerFast(tokenizer_object=backend, **special_tokens)

    def tokenize(self, text):
        """Return the token strings the wrapped tokenizer produces for ``text``."""
        wrapped = self.tokenizer
        return wrapped.tokenize(text)

In [45]:
from structllm.tokenizer.slice_tokenizer import AtomVocabTokenizer

In [47]:
# Example usage: build the AtomVocabTokenizer imported from structllm on the
# extended periodic-table vocabulary, tokenize one sample string, and print the
# tokens together with their count.
# NOTE(review): the vocabulary path is absolute and user-specific.
vocab_file_path = '/home/so87pot/n0w0f/structllm/notebooks/extended_periodic_table_vocab.txt'
tokenizer = AtomVocabTokenizer(vocab_file_path)

# NOTE(review): the segment "0 4 - 1 4" contains a lone "-" that has no
# counterpart in the recorded token list -- presumably text that matches no
# vocabulary entry is dropped without warning; confirm this is intended.
input_string = "F F F F K K Zn 0 6 o o o 0 6 + o o 0 4 - 1 4 - o - 1 5 o o o 1 5 - o o 2 6 + + + 2 4 + + o 2 4 + o o 2 4 o + o 2 4 o o o 2 5 o o o 3 6 o o o 3 4 o o o 3 5 o o o 3 5 o - o 3 5 - o o 3 5 - - o "
tokens = tokenizer.tokenize(input_string)
print("Tokens:", tokens)
print("len of Tokens:", len(tokens))

Tokens: ['F', 'F', 'F', 'F', 'K', 'K', 'Zn', '0', '6', 'o o o', '0', '6', '+ o o', '0', '4', '1', '4', '- o -', '1', '5', 'o o o', '1', '5', '- o o', '2', '6', '+ + +', '2', '4', '+ + o', '2', '4', '+ o o', '2', '4', 'o + o', '2', '4', 'o o o', '2', '5', 'o o o', '3', '6', 'o o o', '3', '4', 'o o o', '3', '5', 'o o o', '3', '5', 'o - o', '3', '5', '- o o', '3', '5', '- - o']
len of Tokens: 60


In [49]:
# Round-trip check: encode the sample string to ids and decode the ids back to
# text. In the recorded output the result equals the input except that the
# trailing space is gone.
tokenizer.decode(tokenizer.encode("Ga Ga Ga Ga Ga Ga Ga Ga 0 4 o o o 1 5 o o o 2 6 o o o 3 7 o o o "))

2024-01-12 16:28:10.816882: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-12 16:28:10.956627: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-12 16:28:13.110270: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


'Ga Ga Ga Ga Ga Ga Ga Ga 0 4 o o o 1 5 o o o 2 6 o o o 3 7 o o o'

In [44]:
from transformers import PreTrainedTokenizer
import os
import re

class AtomVocabTokenizer(PreTrainedTokenizer):
    """Tokenizer that splits text into the entries of a fixed vocabulary file.

    The vocabulary file holds one token per line and a token's id is its
    zero-based line number.  ``tokenize`` scans the text from left to right
    and emits every occurrence of a vocabulary entry, always preferring the
    longest entry that matches at the current position.  Text that matches no
    entry is skipped silently, so tokenisation is lossy for out-of-vocabulary
    input.

    Note that the public ``tokenize`` method is overridden directly, so the
    added-token / special-token splitting that the base class performs inside
    its own ``tokenize`` is not applied.
    """

    def __init__(self, vocab_file, model_max_length=None, **kwargs):
        """Load the vocabulary and initialise the Hugging Face base class.

        Args:
            vocab_file: Path to a UTF-8 text file with one token per line.
            model_max_length: Forwarded unchanged to ``PreTrainedTokenizer``.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer`` (for
                example ``unk_token`` or ``pad_token``).
        """
        # The lookup tables must exist before the base constructor runs:
        # recent transformers releases register special tokens from inside
        # PreTrainedTokenizer.__init__, which calls _add_tokens below.
        self.vocab = self.load_vocab(vocab_file)
        self.ids_to_tokens = {idx: token for token, idx in self.vocab.items()}
        self._vocab_regex = None  # compiled lazily by _get_vocab_regex
        super(AtomVocabTokenizer, self).__init__(model_max_length=model_max_length, **kwargs)

    @property
    def vocab_size(self):
        """Number of entries currently held in the vocabulary."""
        return len(self.vocab)

    def get_vocab(self):
        """Return a copy of the token -> id mapping, including added tokens."""
        return dict(self.vocab, **self.added_tokens_encoder)

    def load_vocab(self, vocab_file):
        """Read ``vocab_file`` and map every line to its zero-based line number.

        If the same token appears on several lines, the last line number wins.
        """
        with open(vocab_file, 'r', encoding='utf-8') as file:
            vocab = file.read().splitlines()
        return {token: idx for idx, token in enumerate(vocab)}

    def _get_vocab_regex(self):
        """Return the compiled alternation of all vocabulary entries (cached)."""
        regex = getattr(self, '_vocab_regex', None)
        if regex is None:
            # Python's "|" takes the first alternative that matches, not the
            # longest, so longer entries go first; otherwise "C" listed before
            # "Cl" would split "Cl" into "C" plus a dropped "l".  Empty entries
            # (blank lines in the file) are left out because an empty
            # alternative matches everywhere and would yield '' tokens.
            candidates = sorted((tok for tok in self.vocab if tok), key=len, reverse=True)
            if candidates:
                source = '|'.join(re.escape(tok) for tok in candidates)
            else:
                source = r'(?!)'  # never matches -> no tokens for an empty vocab
            regex = re.compile(source)
            self._vocab_regex = regex
        return regex

    def tokenize(self, text, **kwargs):
        """Return all vocabulary entries found in ``text``, in order of appearance.

        Args:
            text: String to tokenize; ``None`` or ``""`` gives an empty list.
            **kwargs: Accepted for compatibility with the base-class signature
                and ignored.

        Returns:
            List of token strings.  Parts of ``text`` that match no vocabulary
            entry are omitted.
        """
        if not text:
            return []
        return self._get_vocab_regex().findall(text)

    def convert_tokens_to_string(self, tokens):
        """Join ``tokens`` into a single space-separated string."""
        return ' '.join(tokens)

    def _add_tokens(self, new_tokens, **kwargs):
        """Append tokens that are not known yet to the vocabulary.

        Args:
            new_tokens: Iterable of strings or ``AddedToken``-like objects
                (anything exposing a ``content`` attribute).
            **kwargs: Accepted for compatibility with the base class
                (e.g. ``special_tokens``) and ignored.

        Returns:
            The number of tokens that were actually added.
        """
        added = 0
        for token in new_tokens:
            content = getattr(token, 'content', token)
            if content in self.vocab or content in self.added_tokens_encoder:
                continue
            # max(id) + 1 instead of len(vocab): the two differ when the file
            # contained duplicate lines, and len() could then reuse a live id.
            new_id = max(self.ids_to_tokens, default=-1) + 1
            self.vocab[content] = new_id
            self.ids_to_tokens[new_id] = content
            added += 1
        if added:
            self._vocab_regex = None  # the pattern must include the new entries
        return added

    def _convert_token_to_id(self, token):
        """Return the id of ``token``, the unknown-token id, or ``None``."""
        idx = self.vocab.get(token)
        if idx is None:
            # Only consult unk_token on a miss; reading it for every token is
            # wasted work and can log warnings when no unk_token is configured.
            unk = self.unk_token
            if unk is not None:
                idx = self.vocab.get(unk)
        return idx

    def _convert_id_to_token(self, index):
        """Return the token stored under id ``index``.

        Raises:
            IndexError: If ``index`` is not a known id.
        """
        try:
            return self.ids_to_tokens[index]
        except KeyError:
            raise IndexError('token id out of range: {!r}'.format(index)) from None

    def save_vocabulary(self, vocab_path, filename_prefix=None):
        """Write the vocabulary, one token per line, ordered by id.

        Args:
            vocab_path: Target file, or an existing directory in which
                ``vocab.txt`` (optionally prefixed) is created -- the form
                ``save_pretrained`` uses.
            filename_prefix: Optional prefix for the file name when
                ``vocab_path`` is a directory.

        Returns:
            One-element tuple holding the path of the written file.
        """
        if os.path.isdir(vocab_path):
            file_name = 'vocab.txt'
            if filename_prefix:
                file_name = filename_prefix + '-' + file_name
            vocab_path = os.path.join(vocab_path, file_name)
        ordered = sorted(self.vocab.items(), key=lambda item: item[1])
        with open(vocab_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(token for token, _ in ordered))
        return (vocab_path,)

# Example usage of the AtomVocabTokenizer class defined earlier in this cell:
# load the extended periodic-table vocabulary, tokenize one sample string, and
# print the tokens together with their count.
# NOTE(review): the vocabulary path is absolute and user-specific.
vocab_file_path = '/home/so87pot/n0w0f/structllm/notebooks/extended_periodic_table_vocab.txt'
tokenizer = AtomVocabTokenizer(vocab_file_path)

input_string = "F F F F K K Zn 0 6 o o o 0 6 + o o 0 4 o o - 0 4 o - - 0 5 o o o 0 5 o - o 1 6 o + o 1 6 o o o 1 4 o o - 1 4 - o - 1 5 o o o 1 5 - o o 2 6 + + + 2 4 + + o 2 4 + o o 2 4 o + o 2 4 o o o 2 5 o o o 3 6 o o o 3 4 o o o 3 5 o o o 3 5 o - o 3 5 - o o 3 5 - - o "
tokens = tokenizer.tokenize(input_string)
print("Tokens:", tokens)
print("len of Tokens:", len(tokens))


Tokens: ['F', 'F', 'F', 'F', 'K', 'K', 'Zn', '0', '6', 'o o o', '0', '6', '+ o o', '0', '4', 'o o -', '0', '4', 'o - -', '0', '5', 'o o o', '0', '5', 'o - o', '1', '6', 'o + o', '1', '6', 'o o o', '1', '4', 'o o -', '1', '4', '- o -', '1', '5', 'o o o', '1', '5', '- o o', '2', '6', '+ + +', '2', '4', '+ + o', '2', '4', '+ o o', '2', '4', 'o + o', '2', '4', 'o o o', '2', '5', 'o o o', '3', '6', 'o o o', '3', '4', 'o o o', '3', '5', 'o o o', '3', '5', 'o - o', '3', '5', '- o o', '3', '5', '- - o']
len of Tokens: 79


In [None]:
# Example usage: tokenize one sample string and print the tokens and their count.
# NOTE(review): this unexecuted cell repeats the statements of the previous
# example verbatim and rebinds the same notebook-level names (`tokenizer`,
# `tokens`, ...); consider deleting it or turning the example into a function.
vocab_file_path = '/home/so87pot/n0w0f/structllm/notebooks/extended_periodic_table_vocab.txt'
tokenizer = AtomVocabTokenizer(vocab_file_path)

input_string = "F F F F K K Zn 0 6 o o o 0 6 + o o 0 4 o o - 0 4 o - - 0 5 o o o 0 5 o - o 1 6 o + o 1 6 o o o 1 4 o o - 1 4 - o - 1 5 o o o 1 5 - o o 2 6 + + + 2 4 + + o 2 4 + o o 2 4 o + o 2 4 o o o 2 5 o o o 3 6 o o o 3 4 o o o 3 5 o o o 3 5 o - o 3 5 - o o 3 5 - - o "
tokens = tokenizer.tokenize(input_string)
print("Tokens:", tokens)
print("len of Tokens:", len(tokens))
