In [1]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerFast
import matplotlib.pyplot as plt
from tqdm import tqdm

In [5]:
SLICE_TOKENIZER_PATH = "/home/so87pot/n0w0f/regression-transformer/slice-assets/slice_vocab.txt"

In [6]:

class SliceTokenizer:
    def __init__(self, vocab_file=SLICE_TOKENIZER_PATH):
        _tokenizer = Tokenizer.from_file(vocab_file)
        self.tokenizer = PreTrainedTokenizerFast(
                    tokenizer_object=_tokenizer,
                    unk_token="[UNK]",
                    pad_token="[PAD]",
                    cls_token="[CLS]",
                    sep_token="[SEP]",
                    mask_token="[MASK]",
                )
    def tokenize(self, text):
        return self.tokenizer.tokenize(text)

In [41]:
Ga Ga Ge Ge Te Te 0 5 - - o 0 5 - o - 0 5 o - - 0 2 o o o 1 3 o o o 1 4 o + + 1 4 + o + 1 4 + + o 2 3 - o o 2 3 o - o 2 3 o o - 
Li Li Co Si O O O O 0 4 o - o 0 7 + + o 0 5 o o o 0 6 o - o 1 6 o - o 1 5 o + o 1 7 o + o 1 4 - o o 2 5 o - + 2 4 o + o 2 6 o o o 2 7 o o + 3 7 o - + 3 6 o o o 3 4 - o o 3 5 - - + 
F F F F F F Y Cs Cs Cs 0 6 + o o 0 7 o - - 0 9 o o o 1 6 o + o 1 7 o o o 1 9 - o - 2 6 + o + 2 7 + o + 2 9 o o o 3 6 o + o 3 7 o o - 3 9 o + o 4 6 o o o 4 8 o o + 5 6 + + + 5 8 o o o 

In [1]:
elements = [
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ar',
    'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Ni', 'Co', 'Cu', 'Zn',
    'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo',
    'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
    'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho',
    'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
    'Tl', 'Pb', 'Bi', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es',
    'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn',
    'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og'
]

from itertools import product

# Define symbols
symbols = ['o', '+', '-']

# Generate all combinations of length 3
combinations = [' '.join(combination) for combination in product(symbols, repeat=3)]

# Numbers
numbers = [str(i) for i in range(10)]

# Combine all elements, symbols, and numbers
all_tokens = combinations + elements + numbers

# Path to the vocabulary file
vocab_file_path = 'extended_periodic_table_vocab.txt'

# Write all tokens to the vocabulary file
with open(vocab_file_path, 'w', encoding='utf-8') as file:
    file.write('\n'.join(all_tokens))

print(f"Vocabulary file '{vocab_file_path}' created successfully.")

Vocabulary file 'extended_periodic_table_vocab.txt' created successfully.


In [32]:
from transformers import PreTrainedTokenizer
import os
import re

class AtomVocabTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, model_max_length=None, **kwargs):
        super(AtomVocabTokenizer, self).__init__(model_max_length=model_max_length, **kwargs)
        
        # Load vocabulary from the provided file
        self.vocab = self.load_vocab(vocab_file)
        elements = [
            'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
            'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ar',
            'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Ni', 'Co', 'Cu', 'Zn',
            'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo',
            'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe',
            'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho',
            'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
            'Tl', 'Pb', 'Bi', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es',
            'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn',
            'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og'
        ]

        from itertools import product

        # Define symbols
        symbols = ['o', '+', '-']

        # Generate all combinations of length 3
        combinations = [' '.join(combination) for combination in product(symbols, repeat=3)]

        # Numbers
        numbers = [str(i) for i in range(10)]

        # Combine all elements, symbols, and numbers
        self.all_tokens = combinations + elements + numbers

    def load_vocab(self, vocab_file):
        with open(vocab_file, 'r', encoding='utf-8') as file:
            vocab = file.read().splitlines()
        return {token: idx for idx, token in enumerate(vocab)}
        
        
        
    def tokenize(self, text):
        # List of tokens
        tokens = self.all_tokens

        # Escape special characters in the vocab to ensure they are treated as literals in the regex
        escaped_tokens = [re.escape(token) for token in tokens]

        # Join the escaped vocab terms into a regex pattern
        pattern_str = '|'.join(escaped_tokens)
        pattern = re.compile(pattern_str)

        # Find all matches in the text
        matches = pattern.findall(text)
        return tokens

    def convert_tokens_to_string(self, tokens):
        return ' '.join(tokens)

    def _add_tokens(self, new_tokens, **kwargs):
        # Override _add_tokens to add new tokens to the vocabulary
        for token in new_tokens:
            if token not in self.added_tokens_encoder:
                self.vocab[token] = len(self.vocab)
                self.ids_to_tokens[len(self.ids_to_tokens)] = token

    def _convert_token_to_id(self, token):
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def _convert_id_to_token(self, index):
        return list(self.vocab.keys())[index]

    def save_vocabulary(self, vocab_path):
        with open(vocab_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(self.vocab))

# Example usage
vocab_file_path = '/home/so87pot/n0w0f/structllm/notebooks/extended_periodic_table_vocab.txt'
tokenizer = AtomVocabTokenizer(vocab_file_path)

input_string = "F F F F K K Zn 0 6 o o o 0 6 + o o 0 4 o o - 0 4 o - - 0 5 o o o 0 5 o - o 1 6 o + o 1 6 o o o 1 4 o o - 1 4 - o - 1 5 o o o 1 5 - o o 2 6 + + + 2 4 + + o 2 4 + o o 2 4 o + o 2 4 o o o 2 5 o o o 3 6 o o o 3 4 o o o 3 5 o o o 3 5 o - o 3 5 - o o 3 5 - - o "
tokens = tokenizer.tokenize(input_string)
print("Tokens:", tokens)


Tokens: ['o o o', 'o o +', 'o o -', 'o + o', 'o + +', 'o + -', 'o - o', 'o - +', 'o - -', '+ o o', '+ o +', '+ o -', '+ + o', '+ + +', '+ + -', '+ - o', '+ - +', '+ - -', '- o o', '- o +', '- o -', '- + o', '- + +', '- + -', '- - o', '- - +', '- - -', 'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ar', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Ni', 'Co', 'Cu', 'Zn', 'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl', 'Pb', 'Bi', 'Th', 'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


In [39]:
tokenizer.encode("N F N F N")

[0, 1, 0, 1, 0]

In [37]:
import re
text = "F F F F K K Zn 0 6 o o o 0 6 + o o 0 4 o o - 0 4 o - - 0 5 o o o 0 5 o - o 1 6 o + o 1 6 o o o 1 4 o o - 1 4 - o - 1 5 o o o 1 5 - o o 2 6 + + + 2 4 + + o 2 4 + o o 2 4 o + o 2 4 o o o 2 5 o o o 3 6 o o o 3 4 o o o 3 5 o o o 3 5 o - o 3 5 - o o 3 5 - - o "

# List of tokens
tokens = ['o o o', 'o o +', 'o o -', 'o + o', 'o + +', 'o + -', 'o - o', 'o - +', 'o - -',
          '+ o o', '+ o +', '+ o -', '+ + o', '+ + +', '+ + -', '+ - o', '+ - +', '+ - -',
          '- o o', '- o +', '- o -', '- + o', '- + +', '- + -', '- - o', '- - +', '- - -',
          'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ar', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe']

# Escape special characters in the vocab to ensure they are treated as literals in the regex
escaped_tokens = [re.escape(token) for token in tokens]

# Join the escaped vocab terms into a regex pattern
pattern_str = '|'.join(escaped_tokens)
pattern = re.compile(pattern_str)

# Find all matches in the text
matches = pattern.findall(text)

print(matches)


['F', 'F', 'F', 'F', 'K', 'K', 'o o o', '+ o o', 'o o -', 'o - -', 'o o o', 'o - o', 'o + o', 'o o o', 'o o -', '- o -', 'o o o', '- o o', '+ + +', '+ + o', '+ o o', 'o + o', 'o o o', 'o o o', 'o o o', 'o o o', 'o o o', 'o - o', '- o o', '- - o']


In [31]:
import re

# List of tokens
tokens = ['o o o', 'o o +', 'o o -', 'o + o', 'o + +', 'o + -', 'o - o', 'o - +', 'o - -',
          '+ o o', '+ o +', '+ o -', '+ + o', '+ + +', '+ + -', '+ - o', '+ - +', '+ - -',
          '- o o', '- o +', '- o -', '- + o', '- + +', '- + -', '- - o', '- - +', '- - -',
          'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'K', 'Ar', 'Ca', 'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe']

# Escape special characters in the vocab to ensure they are treated as literals in the regex
escaped_tokens = [re.escape(token) for token in tokens]

# Join the escaped vocab terms into a regex pattern, allowing for spaces
pattern_str = r'(?:' + '|'.join(escaped_tokens) + r')'
pattern = re.compile(pattern_str)

# Test the pattern on a sample text
text = "P P P P P P P P 0 5 - o o 0 6 - o o 0 6 - o + 0 3 o o o 1 2 o o o 3 5 - o o 3 5 - o + 3 6 - o + 4 7 o o o 5 6 o o o"
matches = pattern.findall(text)
print(matches)


['P', 'P', 'P', 'P', 'P', 'P', 'P', 'P', '- o o', '- o o', '- o +', 'o o o', 'o o o', '- o o', '- o +', '- o +', 'o o o', 'o o o']


In [40]:
tokenizer.decode([0, 1, 0, 1, 0])

'N F N F N'

In [15]:
from transformers import PreTrainedTokenizer
import os

class AtomVocabTokenizer(PreTrainedTokenizer):
    def __init__(self, vocab_file, model_max_length=None, **kwargs):
        super(AtomVocabTokenizer, self).__init__(model_max_length=model_max_length, **kwargs)
        
        # Load vocabulary from the provided file
        self.vocab = self.load_vocab(vocab_file)

    def load_vocab(self, vocab_file):
        with open(vocab_file, 'r', encoding='utf-8') as file:
            vocab = file.read().splitlines()
        return vocab

    def tokenize(self, text):
        tokens = []
        for char in text:
            tokens.append(char) if char in self.vocab else tokens.append(self.unk_token)
        return tokens
    
    def _add_tokens(self):
        # Override _add_tokens to prevent NotImplementedError
        pass

    def convert_tokens_to_string(self, tokens):
        return ''.join(tokens)

    def save_vocabulary(self, vocab_path):
        with open(vocab_path, 'w', encoding='utf-8') as file:
            file.write('\n'.join(self.vocab))



In [None]:
decoded_string = tokenizer.convert_tokens_to_string(tokens)
print("Decoded String:", decoded_string)

In [None]:
decoded_string = tokenizer.convert_tokens_to_string(tokens)
print("Decoded String:", decoded_string)

# Save the custom vocabulary
output_vocab_path = 'path/to/your/output/vocab.txt'
tokenizer.save_vocabulary(output_vocab_path)
