The following is a short demonstration of how to use the [tokenizers](https://pypi.org/project/tokenizers/) package to tokenise a very small set of ADR (adverse drug reaction) terms (or words) into tokens, encode the ADR terms as token IDs, and finally decode these token IDs back into the corresponding ADR terms.

In [None]:
#from tokenizers.models import WordLevel
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers
import sys, datetime
# Record the interpreter version and the run timestamp so the outputs below can be traced to an environment
version_banner = f"Python version used: {sys.version} at {datetime.datetime.now()}"
print(version_banner)

Python version used: 3.12.7 (main, Oct 16 2024, 09:10:10) [Clang 18.1.8 ] at 2025-05-15 16:24:27.156234


In [2]:
## Sample normalizers code to "normalise" texts
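# somehow the normalizer code is not quite working yet... text data in and the same text data out...
# likely cause: StripAccents only removes combining marks, so precomposed characters such as 'ö' must first be
# decomposed with NFD, e.g. normalizers.Sequence([NFD(), StripAccents()]) (NFD is also in tokenizers.normalizers)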

# from tokenizers.models import BPE, WordLevel, WordPiece
# from tokenizers import Tokenizer, normalizers
# from tokenizers.normalizers import StripAccents, Sequence, Replace

# BPE - byte pair encoding
# bpe_tokenizer = Tokenizer(BPE())
# print(bpe_tokenizer.normalizer)
# bpe_tokenizer.normalizer = normalizers.Sequence([StripAccents()])
## normalizer seems to be set already even though code seems not right within the normalizers.Sequence() (?)
# print(bpe_tokenizer.normalizer)

# sentences = ['abdominal_pain', 'Höw aRę ŸõŪ dÔįñg?']

# normalized_sentences = [bpe_tokenizer.normalizer.normalize_str(s) for s in sentences]
# normalized_sentences

In [3]:
# Example text data: the ADRs of bosentan, one of the CYP3A4 substrates.
# The ADR data are already more preprocessed than raw text found elsewhere, so a tokenizer is created directly.
data = ["abnormal_LFT^^, headache^^, RTI^^, hemoglobin_decreased^^, sperm_count_decreased^^, edema^^, hepatic_cirrhosis(pm), liver_failure(pm), jaundice(pm), syncope^, sinusitis^, nasal_congestion^, sinus_congestion^, rhinitis^, oropharyngeal_pain^, epistaxis^, nasopharyngitis^, idiopathic_pulmonary_fibrosis^, anemia^, hematocrit_decreased^, thrombocytopenia(pm), neutropenia(pm), leukopenia(pm), flushing^, hypotension^, palpitation^, orthostatic_hypotension^, unstable_angina^, hot_flush^, gastroesophageal_reflux_disease^, diarrhea^, pruritus^, erythema^, angioedema(pm), DRESS(pm), rash(pm), dermatitis(pm), arthralgia^, joint_swelling^, blurred_vision^, chest_pain^, peripheral_edema^, influenza_like_illness^, vertigo^, fever^, chest_pain^, hypersensitivity_reaction^, anaphylaxis(pm)"]

UNK_TOKEN = '[UNK]'
PAD_TOKEN = '[PAD]'

# Register the unknown token on the model itself. Listing "[UNK]" among the trainer's special
# tokens only puts it in the vocabulary; without `unk_token` the WordLevel model cannot map
# out-of-vocabulary terms to it, and encoding such a term fails instead of yielding "[UNK]".
tokenizer = Tokenizer(models.WordLevel(unk_token=UNK_TOKEN))

# The link below explains how to add special tokens (e.g. the unknown token) to cover different scenarios
# https://huggingface.co/learn/llm-course/chapter6/8?fw=pt#building-a-wordpiece-tokenizer-from-scratch
# The order matters: special tokens receive the first IDs in this order ("[UNK]" -> 0, "[PAD]" -> 1, ...).
special_tokens = [UNK_TOKEN, PAD_TOKEN, "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordLevelTrainer(vocab_size=100000, special_tokens=special_tokens)

# Train the tokenizer.
# The iterator yields one list of words per input string; str.split() only splits on whitespace,
# so each ADR term keeps its trailing punctuation (e.g. "chest_pain^,") as part of the token.
tokenizer.train_from_iterator((text.split() for text in data), trainer=trainer)

# Mapping of every token in the vocabulary (special tokens included) to its ID
tokenizer.get_vocab()

{'hot_flush^,': 26,
 'gastroesophageal_reflux_disease^,': 21,
 'fever^,': 19,
 'joint_swelling^,': 32,
 'anemia^,': 10,
 '[CLS]': 2,
 '[MASK]': 4,
 'anaphylaxis(pm)': 9,
 'oropharyngeal_pain^,': 38,
 'rhinitis^,': 44,
 'RTI^^,': 7,
 'hepatic_cirrhosis(pm),': 25,
 'neutropenia(pm),': 37,
 'peripheral_edema^,': 41,
 'headache^^,': 22,
 'angioedema(pm),': 11,
 'thrombocytopenia(pm),': 49,
 'pruritus^,': 42,
 'hypersensitivity_reaction^,': 27,
 'nasopharyngitis^,': 36,
 'diarrhea^,': 15,
 '[PAD]': 1,
 'hypotension^,': 28,
 'orthostatic_hypotension^,': 39,
 'idiopathic_pulmonary_fibrosis^,': 29,
 'dermatitis(pm),': 14,
 'hemoglobin_decreased^^,': 24,
 'influenza_like_illness^,': 30,
 'flushing^,': 20,
 '[UNK]': 0,
 'hematocrit_decreased^,': 23,
 'erythema^,': 18,
 'abnormal_LFT^^,': 8,
 'DRESS(pm),': 6,
 'liver_failure(pm),': 34,
 'chest_pain^,': 5,
 'sinusitis^,': 46,
 'unstable_angina^,': 50,
 'nasal_congestion^,': 35,
 'leukopenia(pm),': 33,
 'syncope^,': 48,
 'vertigo^,': 51,
 'sinus_co

In [4]:
# str.split() only breaks the text on whitespace, so punctuation such as the commas
# is neither stripped nor split off; it stays attached to each term
for t in data:
    whitespace_split_terms = t.split()
    print(whitespace_split_terms)

['abnormal_LFT^^,', 'headache^^,', 'RTI^^,', 'hemoglobin_decreased^^,', 'sperm_count_decreased^^,', 'edema^^,', 'hepatic_cirrhosis(pm),', 'liver_failure(pm),', 'jaundice(pm),', 'syncope^,', 'sinusitis^,', 'nasal_congestion^,', 'sinus_congestion^,', 'rhinitis^,', 'oropharyngeal_pain^,', 'epistaxis^,', 'nasopharyngitis^,', 'idiopathic_pulmonary_fibrosis^,', 'anemia^,', 'hematocrit_decreased^,', 'thrombocytopenia(pm),', 'neutropenia(pm),', 'leukopenia(pm),', 'flushing^,', 'hypotension^,', 'palpitation^,', 'orthostatic_hypotension^,', 'unstable_angina^,', 'hot_flush^,', 'gastroesophageal_reflux_disease^,', 'diarrhea^,', 'pruritus^,', 'erythema^,', 'angioedema(pm),', 'DRESS(pm),', 'rash(pm),', 'dermatitis(pm),', 'arthralgia^,', 'joint_swelling^,', 'blurred_vision^,', 'chest_pain^,', 'peripheral_edema^,', 'influenza_like_illness^,', 'vertigo^,', 'fever^,', 'chest_pain^,', 'hypersensitivity_reaction^,', 'anaphylaxis(pm)']


In [5]:
# The Whitespace pre-tokenizer splits on whitespace and also separates runs of punctuation from
# word characters. Punctuation is kept as its own piece (it is not removed), and every piece is
# returned together with its (start, end) character offsets in the original string.
pre_tokenizer = pre_tokenizers.Whitespace()
split_data = list(map(pre_tokenizer.pre_tokenize_str, data))
split_data

[[('abnormal_LFT', (0, 12)),
  ('^^,', (12, 15)),
  ('headache', (16, 24)),
  ('^^,', (24, 27)),
  ('RTI', (28, 31)),
  ('^^,', (31, 34)),
  ('hemoglobin_decreased', (35, 55)),
  ('^^,', (55, 58)),
  ('sperm_count_decreased', (59, 80)),
  ('^^,', (80, 83)),
  ('edema', (84, 89)),
  ('^^,', (89, 92)),
  ('hepatic_cirrhosis', (93, 110)),
  ('(', (110, 111)),
  ('pm', (111, 113)),
  ('),', (113, 115)),
  ('liver_failure', (116, 129)),
  ('(', (129, 130)),
  ('pm', (130, 132)),
  ('),', (132, 134)),
  ('jaundice', (135, 143)),
  ('(', (143, 144)),
  ('pm', (144, 146)),
  ('),', (146, 148)),
  ('syncope', (149, 156)),
  ('^,', (156, 158)),
  ('sinusitis', (159, 168)),
  ('^,', (168, 170)),
  ('nasal_congestion', (171, 187)),
  ('^,', (187, 189)),
  ('sinus_congestion', (190, 206)),
  ('^,', (206, 208)),
  ('rhinitis', (209, 217)),
  ('^,', (217, 219)),
  ('oropharyngeal_pain', (220, 238)),
  ('^,', (238, 240)),
  ('epistaxis', (241, 250)),
  ('^,', (250, 252)),
  ('nasopharyngitis', (253, 2

In [6]:
# Show the tokens behind the first ten IDs: the five special tokens come first,
# followed by the first trained terms
first_ten_ids = range(10)
for i in first_ten_ids:
    print(f'ID: {i}, token: {tokenizer.id_to_token(i)}')

ID: 0, token: [UNK]
ID: 1, token: [PAD]
ID: 2, token: [CLS]
ID: 3, token: [SEP]
ID: 4, token: [MASK]
ID: 5, token: chest_pain^,
ID: 6, token: DRESS(pm),
ID: 7, token: RTI^^,
ID: 8, token: abnormal_LFT^^,
ID: 9, token: anaphylaxis(pm)


In [7]:
# Size of the vocabulary: the unique terms seen during training plus the special tokens
# ([UNK], [PAD], [CLS], [SEP], [MASK]); with_added_tokens=True is the default, spelled out here
tokenizer.get_vocab_size(with_added_tokens=True)

52

In [8]:
# Enable padding.
# pad_id has to be passed explicitly: enable_padding() defaults to pad_id=0, which in this
# vocabulary is "[UNK]" ("[PAD]" has ID 1), so padded positions would otherwise carry the wrong ID.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id(PAD_TOKEN), pad_token=PAD_TOKEN)

In [9]:
# Encode two ADR terms in a single call: the first string is the main sequence and the second
# is passed as its `pair`; the resulting encoding holds the IDs of both terms
output = tokenizer.encode(sequence='vertigo^,', pair='chest_pain^,')
print(output.ids)

[51, 5]


In [10]:
# Decode the token IDs back into the ADR terms (skip_special_tokens=True is the default, spelled out here)
tokenizer.decode([51, 5], skip_special_tokens=True)

'vertigo^, chest_pain^,'