In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.functional import one_hot
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
# import datamol as dm
# import rdkit
# from rdkit import Chem
# from rdkit.Chem import rdFingerprintGenerator
# import useful_rdkit_utils as uru
import sys
from matplotlib import pyplot as plt

# Report the versions of the core libraries and of the interpreter used for this run,
# so that the outputs below can be reproduced with a matching environment.
for label, version in (
    ("Pandas", pd.__version__),
    ("PyTorch", torch.__version__),
    ("NumPy", np.__version__),
    # ("RDKit", rdkit.__version__),
    ("Python", sys.version),
):
    print(f"{label} version used: {version}")

Pandas version used: 2.2.3
PyTorch version used: 2.2.2
NumPy version used: 1.26.4
Python version used: 3.12.7 (main, Oct 16 2024, 09:10:10) [Clang 18.1.8 ]


In [2]:
# PyTorch example: save a tensor to disk and load it back.
t = torch.tensor([1., 2.])
torch.save(t, 'tensor.pt')
# torch.load() is pickle-based; weights_only=True restricts unpickling to tensors and
# other plain containers, which is all that this file contains.
ts = torch.load('tensor.pt', weights_only=True)
ts

tensor([1., 2.])

In [3]:
# Load the ADR tensors produced in 2_ADR_regressor.ipynb
# (they are written to disk by 2_ADR_regressor_save_tensors.ipynb).
# torch.load() is pickle-based; the file holds a plain tensor, so weights_only=True is
# sufficient and avoids unpickling arbitrary objects.
adrs_ts = torch.load("adr_train_tensors.pt", weights_only=True)
adrs_ts

tensor([[-1.5256, -0.7502],
        [-0.6540, -1.6095],
        [-0.1002, -0.6092],
        ...,
        [ 0.8748,  0.9873],
        [-0.7102,  2.8641],
        [ 1.1651,  2.0154]], requires_grad=True)

In [4]:
# Plan
# To use tokenizer.decode(), a tokenizer model will likely need to be built (trained) first (?)

# Consider trying HuggingFace's transformers:
# 1. Set up a tokenizer model that will tokenize the ADRs/words
# 2. Apply the tokenizer.decode() function to each tensor row/sequence (using a list comprehension)
# 3. Use the sample code snippet below to decode the tensors:
# decoded = [tokenizer.decode(x) for x in adrs_ts]
# The code iterates through each row/sequence of the tensor and applies the decode() method,
# which transforms the numerical IDs back into human-readable text/words.
# NOTE: decode() expects integer token IDs, whereas adrs_ts as loaded above holds floats
# (e.g. -1.5256), so the values would first need to be mapped back to integer IDs.

# other methods that may be useful:
# convert_ids_to_tokens - converts numerical IDs back into corresponding token identifiers
# convert_tokens_to_string - merges sub-word tokens into complete words if needed


# may need to also use/integrate code with tokenizers package? (https://pypi.org/project/tokenizers/)

In [5]:
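## Sample normalizers code to "normalise" texts
# The normalizer code is not working yet: the text comes out unchanged.
# Likely cause (to confirm): StripAccents only removes combining accent marks, so a
# decomposition step such as NFD() needs to come before it in the normalizers.Sequence([...]).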

# from tokenizers.models import BPE, WordLevel, WordPiece
# from tokenizers import Tokenizer, normalizers
# from tokenizers.normalizers import StripAccents, Sequence, Replace

# BPE - byte pair encoding
# bpe_tokenizer = Tokenizer(BPE())
# print(bpe_tokenizer.normalizer)
# bpe_tokenizer.normalizer = normalizers.Sequence([StripAccents()])
## normalizer seems to be set already even though code seems not right within the normalizers.Sequence() (?)
# print(bpe_tokenizer.normalizer)

# sentences = ['abdominal_pain', 'Höw aRę ŸõŪ dÔįñg?']

# normalized_sentences = [bpe_tokenizer.normalizer.normalize_str(s) for s in sentences]
# normalized_sentences

In [6]:
# Example ADR text for a single CYP3A4 substrate (bosentan).
# The ADR strings are already more pre-processed than typical raw text, so no normalisation
# step is applied here and a tokenizer is built directly.
data = ["abnormal_LFT^^, headache^^, RTI^^, hemoglobin_decreased^^, sperm_count_decreased^^, edema^^, hepatic_cirrhosis(pm), liver_failure(pm), jaundice(pm), syncope^, sinusitis^, nasal_congestion^, sinus_congestion^, rhinitis^, oropharyngeal_pain^, epistaxis^, nasopharyngitis^, idiopathic_pulmonary_fibrosis^, anemia^, hematocrit_decreased^, thrombocytopenia(pm), neutropenia(pm), leukopenia(pm), flushing^, hypotension^, palpitation^, orthostatic_hypotension^, unstable_angina^, hot_flush^, gastroesophageal_reflux_disease^, diarrhea^, pruritus^, erythema^, angioedema(pm), DRESS(pm), rash(pm), dermatitis(pm), arthralgia^, joint_swelling^, blurred_vision^, chest_pain^, peripheral_edema^, influenza_like_illness^, vertigo^, fever^, chest_pain^, hypersensitivity_reaction^, anaphylaxis(pm)"]

from tokenizers.models import WordLevel
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, decoders, trainers

# Word-level model: every distinct "word" is mapped to its own ID.
# No unknown-word token or padding token is configured at this stage.
tokenizer = Tokenizer(models.WordLevel())

trainer = trainers.WordLevelTrainer(vocab_size=100000)

# Train from an iterator that yields, for each text in `data`, the list of its
# whitespace-separated pieces. str.split() leaves trailing markers such as "^," attached
# to each piece, so they become part of the tokens.
tokenizer.train_from_iterator((adr_text.split() for adr_text in data), trainer=trainer)

# Show the learned vocabulary as a {token: id} mapping
tokenizer.get_vocab()

{'thrombocytopenia(pm),': 44,
 'arthralgia^,': 7,
 'influenza_like_illness^,': 25,
 'nasal_congestion^,': 30,
 'oropharyngeal_pain^,': 33,
 'epistaxis^,': 12,
 'erythema^,': 13,
 'RTI^^,': 2,
 'abnormal_LFT^^,': 3,
 'fever^,': 14,
 'leukopenia(pm),': 28,
 'rhinitis^,': 39,
 'diarrhea^,': 10,
 'anemia^,': 5,
 'hypersensitivity_reaction^,': 22,
 'hematocrit_decreased^,': 18,
 'orthostatic_hypotension^,': 34,
 'sinus_congestion^,': 40,
 'pruritus^,': 37,
 'dermatitis(pm),': 9,
 'DRESS(pm),': 1,
 'blurred_vision^,': 8,
 'nasopharyngitis^,': 31,
 'hemoglobin_decreased^^,': 19,
 'neutropenia(pm),': 32,
 'rash(pm),': 38,
 'anaphylaxis(pm)': 4,
 'sinusitis^,': 41,
 'gastroesophageal_reflux_disease^,': 16,
 'angioedema(pm),': 6,
 'palpitation^,': 35,
 'hepatic_cirrhosis(pm),': 20,
 'liver_failure(pm),': 29,
 'jaundice(pm),': 26,
 'sperm_count_decreased^^,': 42,
 'chest_pain^,': 0,
 'syncope^,': 43,
 'headache^^,': 17,
 'joint_swelling^,': 27,
 'vertigo^,': 46,
 'hypotension^,': 23,
 'peripheral

In [7]:
# str.split() only splits on whitespace, so punctuation such as the trailing commas stays
# attached to each piece.
# The loop variable is named adr_text (not `t`) so that it does not overwrite the example
# tensor `t` defined earlier in the notebook.
for adr_text in data:
    print(adr_text.split())

['abnormal_LFT^^,', 'headache^^,', 'RTI^^,', 'hemoglobin_decreased^^,', 'sperm_count_decreased^^,', 'edema^^,', 'hepatic_cirrhosis(pm),', 'liver_failure(pm),', 'jaundice(pm),', 'syncope^,', 'sinusitis^,', 'nasal_congestion^,', 'sinus_congestion^,', 'rhinitis^,', 'oropharyngeal_pain^,', 'epistaxis^,', 'nasopharyngitis^,', 'idiopathic_pulmonary_fibrosis^,', 'anemia^,', 'hematocrit_decreased^,', 'thrombocytopenia(pm),', 'neutropenia(pm),', 'leukopenia(pm),', 'flushing^,', 'hypotension^,', 'palpitation^,', 'orthostatic_hypotension^,', 'unstable_angina^,', 'hot_flush^,', 'gastroesophageal_reflux_disease^,', 'diarrhea^,', 'pruritus^,', 'erythema^,', 'angioedema(pm),', 'DRESS(pm),', 'rash(pm),', 'dermatitis(pm),', 'arthralgia^,', 'joint_swelling^,', 'blurred_vision^,', 'chest_pain^,', 'peripheral_edema^,', 'influenza_like_illness^,', 'vertigo^,', 'fever^,', 'chest_pain^,', 'hypersensitivity_reaction^,', 'anaphylaxis(pm)']


In [8]:
# The Whitespace pre-tokenizer splits on whitespace and also separates runs of punctuation
# from the words (the punctuation is kept as its own pieces, not removed).
# Each piece is returned together with its (start, end) character offsets in the input string.
pre_tokenizer = pre_tokenizers.Whitespace()
split_data = list(map(pre_tokenizer.pre_tokenize_str, data))
split_data

[[('abnormal_LFT', (0, 12)),
  ('^^,', (12, 15)),
  ('headache', (16, 24)),
  ('^^,', (24, 27)),
  ('RTI', (28, 31)),
  ('^^,', (31, 34)),
  ('hemoglobin_decreased', (35, 55)),
  ('^^,', (55, 58)),
  ('sperm_count_decreased', (59, 80)),
  ('^^,', (80, 83)),
  ('edema', (84, 89)),
  ('^^,', (89, 92)),
  ('hepatic_cirrhosis', (93, 110)),
  ('(', (110, 111)),
  ('pm', (111, 113)),
  ('),', (113, 115)),
  ('liver_failure', (116, 129)),
  ('(', (129, 130)),
  ('pm', (130, 132)),
  ('),', (132, 134)),
  ('jaundice', (135, 143)),
  ('(', (143, 144)),
  ('pm', (144, 146)),
  ('),', (146, 148)),
  ('syncope', (149, 156)),
  ('^,', (156, 158)),
  ('sinusitis', (159, 168)),
  ('^,', (168, 170)),
  ('nasal_congestion', (171, 187)),
  ('^,', (187, 189)),
  ('sinus_congestion', (190, 206)),
  ('^,', (206, 208)),
  ('rhinitis', (209, 217)),
  ('^,', (217, 219)),
  ('oropharyngeal_pain', (220, 238)),
  ('^,', (238, 240)),
  ('epistaxis', (241, 250)),
  ('^,', (250, 252)),
  ('nasopharyngitis', (253, 2

In [9]:
# Look up the tokens behind the first ten IDs of the trained vocabulary
for token_id in range(10):
    token = tokenizer.id_to_token(token_id)
    print(f'ID: {token_id}, token: {token}')

ID: 0, token: chest_pain^,
ID: 1, token: DRESS(pm),
ID: 2, token: RTI^^,
ID: 3, token: abnormal_LFT^^,
ID: 4, token: anaphylaxis(pm)
ID: 5, token: anemia^,
ID: 6, token: angioedema(pm),
ID: 7, token: arthralgia^,
ID: 8, token: blurred_vision^,
ID: 9, token: dermatitis(pm),


In [10]:
# Size of the trained vocabulary, i.e. the number of unique tokens (words) known to the tokenizer
tokenizer.get_vocab_size()

47

In [11]:
# Hypothetical setup: treat the loaded ADR tensor (adrs_ts) as the training features X_train,
# as a first step towards setting up PyTorch's DataLoader.
# NOTE: this is a plain assignment, so X_train refers to the same tensor object as adrs_ts
# (no copy is made).
X_train = adrs_ts
X_train

tensor([[-1.5256, -0.7502],
        [-0.6540, -1.6095],
        [-0.1002, -0.6092],
        ...,
        [ 0.8748,  0.9873],
        [-0.7102,  2.8641],
        [ 1.1651,  2.0154]], requires_grad=True)

In [None]:
## Load the CYP3A4 substrates' ADRs (text form) from a pickle file
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only load files
# that come from a trusted source.
import pickle
with open("cyp3a4_adrs", "rb") as adrs_file:
    cyp3a4_subs_adrs = pickle.load(adrs_file)

In [14]:
# Display the loaded ADRs: a list of strings, each holding comma-separated ADR terms
# (presumably one string per CYP3A4 substrate - to confirm against the source notebook).
cyp3a4_subs_adrs

['constipation^^, leucopenia^^, dizziness^^, sedation^^, ataxia^^, elevated_GGT^^, allergic_skin_reactions^^, eosinophilia^, thrombocytopenia^, neutropenia^, headache^, tremor^, elevated_ALP^, pruritus^, paresthesia^, diplopia^, blurred_vision^, hyponatremia^, fluid_retention^, oedema^, weight_gain^, reduced_plasma_osmolarity_(ADH_like_effect)^, vertigo^',
 'dizziness^^, sedation^^, fatigue^, vertigo^, accidental_injury^, insomnia^, anxiety^, dry_mouth^, constipation^, metrorrhagia^, abdominal_pain^, anaphylaxis(pm), angioedema(pm), pruritus(pm), urticaria(pm)',
 'rash^^, diarrhea^^, abdominal_pain^^, constipation^^, dyspepsia^^, hemorrhage^^, neutropenia^^, thrombocytopenia^^, anemia^^, influenza^^, weight_gain^^, muscle_spasm/cramps^^, musculoskeletal_pain^^, joint_pain^^, myalgia^^, bone_pain^^, headache^^, dizziness^^, periorbital_edema^^, edema^^, fatigue^^, fever^^, insomnia^^, depression^^, nasopharyngitis^^, cough^^, upper_respiratory_tract infection^^, pharyngolaryngeal_pain^^

In [None]:
# Enable padding and set up the padding token.
# The tokenizer was trained without any special tokens, so "[pad]" is not in the vocabulary
# yet: token_to_id() would return None for it and enable_padding(pad_id=None) would fail.
pad_token = "[pad]"
if tokenizer.token_to_id(pad_token) is None:
    # Register the padding token so that it gets an ID of its own.
    tokenizer.add_special_tokens([pad_token])
# Pass pad_token explicitly; otherwise enable_padding() uses its default "[PAD]" string,
# which would not match the token registered above.
tokenizer.enable_padding(pad_id=tokenizer.token_to_id(pad_token), pad_token=pad_token)