In [1]:
from tokenizers import AddedToken
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def extend_tokenizer(tokenizer, years=("2014", "2016", "2018", "2020", "2022", "2024")):
    """Register one special ``<s_{year}>`` token per year on ``tokenizer``.

    Args:
        tokenizer: Object exposing ``add_tokens(tokens, special_tokens=...)``.
            The tokens are added to this object directly; no copy is made.
        years: Iterable of year labels. Each label is formatted into
            ``<s_{year}>``. Defaults to the six years this notebook
            originally hard-coded, so existing calls behave as before.

    Returns:
        The same ``tokenizer`` object that was passed in.
    """
    # One AddedToken per year; the flags are passed through unchanged to
    # the tokenizer when the tokens are registered below.
    tokens = [
        AddedToken(f"<s_{year}>", single_word=True, lstrip=True, rstrip=True)
        for year in years
    ]
    tokenizer.add_tokens(tokens, special_tokens=True)

    print("Added the following tokens:")
    print(tokens)
    return tokenizer

In [4]:
# Load the Mistral-7B tokenizer with use_fast=False and register the
# year tokens on it in a single step.
tokenizer = extend_tokenizer(
    AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False)
)

Added the following tokens:
[AddedToken("<s_2014>", rstrip=True, lstrip=True, single_word=True, normalized=True, special=True), AddedToken("<s_2016>", rstrip=True, lstrip=True, single_word=True, normalized=True, special=True), AddedToken("<s_2018>", rstrip=True, lstrip=True, single_word=True, normalized=True, special=True), AddedToken("<s_2020>", rstrip=True, lstrip=True, single_word=True, normalized=True, special=True), AddedToken("<s_2022>", rstrip=True, lstrip=True, single_word=True, normalized=True, special=True), AddedToken("<s_2024>", rstrip=True, lstrip=True, single_word=True, normalized=True, special=True)]


In [15]:
# Look up the id the vocabulary maps '<s_2014>' to (KeyError if it is absent).
tokenizer.get_vocab()['<s_2014>']

32000

In [17]:
# Build the vocabulary dict once instead of once per year, then map each
# year label to the id of its '<s_{year}>' token.
vocab = tokenizer.get_vocab()
tokens = {y: vocab[f'<s_{y}>'] for y in ["2014", "2016", "2018", "2020", "2022", "2024"]}
tokens

{'2014': 32000,
 '2016': 32001,
 '2018': 32002,
 '2020': 32003,
 '2022': 32004,
 '2024': 32005}