*this notebook uses a venv created by using uv*
- https://docs.astral.sh/uv/guides/integration/jupyter/#using-jupyter-from-vs-code

In [1]:
import pandas as pd
print(f"Pandas version used is: {pd.__version__}")
import torch
print(f"PyTorch version used is: {torch.__version__}")
import torch.nn as nn
import numpy as np
print(f"NumPy version used is: {np.__version__}")
import datamol as dm
import rdkit
print(f"RDKit version used is: {rdkit.__version__}")
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
import useful_rdkit_utils as uru

Pandas version used is: 2.2.3
PyTorch version used is: 2.2.2
NumPy version used is: 1.26.4
RDKit version used is: 2024.03.6


In [2]:
# this is an extremely small set of data compiled manually via references as stated in the dataframe
# it may not lead to a very significant result but it is done as an example of what an early DNN model will look like
# ideally there should be more training data features added 

data = pd.read_csv("All_CYP3A4_substrates")
data.head()

Unnamed: 0,generic_drug_name,notes,cyp_strength_of_evidence,drug_class,adverse_drug_reactions,first_ref,second_ref,date_checked
0,carbamazepine,,strong,antiepileptics,"constipation^^, leucopenia^^, dizziness^^, som...",drugs.com,nzf,211024
1,eliglustat,,strong,metabolic_agents,"diarrhea^^, oropharyngeal_pain^^, arthralgia^^...",drugs.com,emc,151124
2,flibanserin,,strong,CNS_agents,"dizziness^^, somnolence^^, sedation^, fatigue^...",drugs.com,Drugs@FDA,161124
3,imatinib,,strong,tyrosine_kinase_inhibitor,"rash^^, diarrhea^^, abdominal_pain^^, constipa...",drugs.com,nzf,181124
4,ibrutinib,,strong,tyrosine_kinase_inhibitor,"hypertension^^, atrial_fibrillation^^, sinus_t...",drugs.com,nzf,191124


For drug with astericks marked in "notes" column, see data notes under "Exceptions for ADRs" section in 1_ADR_data.qmd.

In [3]:
# drop some columns
df = data.drop([
    "notes",
    "first_ref", 
    "second_ref", 
    "date_checked"
    ], axis=1)
df

Unnamed: 0,generic_drug_name,cyp_strength_of_evidence,drug_class,adverse_drug_reactions
0,carbamazepine,strong,antiepileptics,"constipation^^, leucopenia^^, dizziness^^, som..."
1,eliglustat,strong,metabolic_agents,"diarrhea^^, oropharyngeal_pain^^, arthralgia^^..."
2,flibanserin,strong,CNS_agents,"dizziness^^, somnolence^^, sedation^, fatigue^..."
3,imatinib,strong,tyrosine_kinase_inhibitor,"rash^^, diarrhea^^, abdominal_pain^^, constipa..."
4,ibrutinib,strong,tyrosine_kinase_inhibitor,"hypertension^^, atrial_fibrillation^^, sinus_t..."
5,neratinib,strong,tyrosine_kinase_inhibitor,"diarrhea^^, abdominal_pain^^, stomatitis^^, dy..."
6,esomeprazole,strong,proton_pump_inhibitors,"headache^^, flatulence^^, dizziness^, somnolen..."
7,omeprazole,strong,proton_pump_inhibitors,"fever^^, otitis_media^^, respiratory_system_re..."
8,ivacaftor,strong,CFTR_potentiator,"rash^^, oropharyngeal_pain^^, abdominal_pain^^..."
9,naloxegol,strong,peripheral_opioid_receptor_antagonists,"abdominal pain^^, possible_opioid_withdrawal_s..."


In [4]:
string = df["generic_drug_name"].tolist()
# Convert list of drugs into multiple strings of drug names
drugs = f"'{"','".join(string)}'"
# Convert from lower case to upper case
for letter in drugs:
    if letter.islower():
        drugs = drugs.replace(letter, letter.upper())
print(drugs)

'CARBAMAZEPINE','ELIGLUSTAT','FLIBANSERIN','IMATINIB','IBRUTINIB','NERATINIB','ESOMEPRAZOLE','OMEPRAZOLE','IVACAFTOR','NALOXEGOL','OXYCODONE','SIROLIMUS','TERFENADINE','DIAZEPAM','HYDROCORTISONE','LANSOPRAZOLE','PANTOPRAZOLE','LERCANIDIPINE','NALDEMEDINE','NELFINAVIR','TELAPREVIR','ONDANSETRON','QUININE','RIBOCICLIB','SUVOREXANT','TELITHROMYCIN','TEMSIROLIMUS'


In [5]:
# Get SMILES for each drug (via copying-and-pasting the previous cell output - attempted various ways to feed the string
# directly into cyp_drugs.py, current way seems to be the most straightforward one...)
from cyp_drugs import chembl_drugs
# Using ChEMBL version 34
df_3a4 = chembl_drugs(
    'CARBAMAZEPINE','ELIGLUSTAT','FLIBANSERIN','IMATINIB','IBRUTINIB','NERATINIB','ESOMEPRAZOLE','OMEPRAZOLE','IVACAFTOR','NALOXEGOL','OXYCODONE','SIROLIMUS','TERFENADINE','DIAZEPAM','HYDROCORTISONE','LANSOPRAZOLE','PANTOPRAZOLE','LERCANIDIPINE','NALDEMEDINE','NELFINAVIR','TELAPREVIR','ONDANSETRON','QUININE','RIBOCICLIB','SUVOREXANT','TELITHROMYCIN','TEMSIROLIMUS', 
    #file_name="All_cyp3a4_smiles"
    )
print(df_3a4.shape)
df_3a4.head()

## Note: latest ChEMBL version 35 (as from 1st Dec 2024) seems to be taking a long time to load (no output after ~7min), 
## both versions 33 & 34 are ok with outputs loading within a few secs

(27, 4)


Unnamed: 0,chembl_id,pref_name,max_phase,canonical_smiles
0,CHEMBL108,CARBAMAZEPINE,4,NC(=O)N1c2ccccc2C=Cc2ccccc21
1,CHEMBL12,DIAZEPAM,4,CN1C(=O)CN=C(c2ccccc2)c2cc(Cl)ccc21
2,CHEMBL2110588,ELIGLUSTAT,4,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...
3,CHEMBL1201320,ESOMEPRAZOLE,4,COc1ccc2[nH]c([S@@+]([O-])Cc3ncc(C)c(OC)c3C)nc2c1
4,CHEMBL231068,FLIBANSERIN,4,O=c1[nH]c2ccccc2n1CCN1CCN(c2cccc(C(F)(F)F)c2)CC1


In [6]:
# Rename column & change lower to uppercase
df = df.rename(columns={"generic_drug_name": "pref_name"})
df["pref_name"] = df["pref_name"].str.upper()
# Merge df & df_3a4 
df = df.merge(df_3a4, how="left", on="pref_name")
df.head(10)

Unnamed: 0,pref_name,cyp_strength_of_evidence,drug_class,adverse_drug_reactions,chembl_id,max_phase,canonical_smiles
0,CARBAMAZEPINE,strong,antiepileptics,"constipation^^, leucopenia^^, dizziness^^, som...",CHEMBL108,4,NC(=O)N1c2ccccc2C=Cc2ccccc21
1,ELIGLUSTAT,strong,metabolic_agents,"diarrhea^^, oropharyngeal_pain^^, arthralgia^^...",CHEMBL2110588,4,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...
2,FLIBANSERIN,strong,CNS_agents,"dizziness^^, somnolence^^, sedation^, fatigue^...",CHEMBL231068,4,O=c1[nH]c2ccccc2n1CCN1CCN(c2cccc(C(F)(F)F)c2)CC1
3,IMATINIB,strong,tyrosine_kinase_inhibitor,"rash^^, diarrhea^^, abdominal_pain^^, constipa...",CHEMBL941,4,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...
4,IBRUTINIB,strong,tyrosine_kinase_inhibitor,"hypertension^^, atrial_fibrillation^^, sinus_t...",CHEMBL1873475,4,C=CC(=O)N1CCC[C@@H](n2nc(-c3ccc(Oc4ccccc4)cc3)...
5,NERATINIB,strong,tyrosine_kinase_inhibitor,"diarrhea^^, abdominal_pain^^, stomatitis^^, dy...",CHEMBL180022,4,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...
6,ESOMEPRAZOLE,strong,proton_pump_inhibitors,"headache^^, flatulence^^, dizziness^, somnolen...",CHEMBL1201320,4,COc1ccc2[nH]c([S@@+]([O-])Cc3ncc(C)c(OC)c3C)nc2c1
7,OMEPRAZOLE,strong,proton_pump_inhibitors,"fever^^, otitis_media^^, respiratory_system_re...",CHEMBL1503,4,COc1ccc2[nH]c([S+]([O-])Cc3ncc(C)c(OC)c3C)nc2c1
8,IVACAFTOR,strong,CFTR_potentiator,"rash^^, oropharyngeal_pain^^, abdominal_pain^^...",CHEMBL2010601,4,CC(C)(C)c1cc(C(C)(C)C)c(NC(=O)c2c[nH]c3ccccc3c...
9,NALOXEGOL,strong,peripheral_opioid_receptor_antagonists,"abdominal pain^^, possible_opioid_withdrawal_s...",CHEMBL2219418,4,C=CCN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](OCCOCC...


I'm parsing the canonical SMILES through my old script to generate these small molecules as RDKit molecules and standardised SMILES, making sure these SMILES are valid and parsable.

In [7]:
# Using my previous code to preprocess small mols
# disable rdkit messages
dm.disable_rdkit_log()

#  The following function code were adapted from datamol.io
def preprocess(row):

    """
    Function to preprocess, fix, standardise, sanitise compounds 
    and then generate various molecular representations based on these molecules.
    Can be utilised as df.apply(preprocess, axis=1).

    :param smiles_column: SMILES column name (needs to be names as "canonical_smiles") 
    derived from ChEMBL database (or any other sources) via an input dataframe
    :param mol: RDKit molecules
    :return: preprocessed RDKit molecules, standardised SMILES, SELFIES, 
    InChI and InChI keys added as separate columns in the dataframe
    """

    # smiles_column = strings object
    smiles_column = "canonical_smiles"
    # Convert each compound into a RDKit molecule in the smiles column
    mol = dm.to_mol(row[smiles_column], ordered=True)
    # Fix common errors in the molecules
    mol = dm.fix_mol(mol)
    # Sanitise the molecules 
    mol = dm.sanitize_mol(mol, sanifix=True, charge_neutral=False)
    # Standardise the molecules
    mol = dm.standardize_mol(
        mol,
        # Switch on to disconnect metal ions
        disconnect_metals=True,
        normalize=True,
        reionize=True,
        # Switch on "uncharge" to neutralise charges
        uncharge=True,
        # Taking care of stereochemistries of compounds
        # Note: this uses the older approach of "AssignStereochemistry()" from RDKit
        # https://github.com/datamol-io/datamol/blob/main/datamol/mol.py#L488
        stereo=True,
    )

    # Adding following rows of different molecular representations 
    row["rdkit_mol"] = dm.to_mol(mol)
    row["standard_smiles"] = dm.standardize_smiles(dm.to_smiles(mol))
    #row["selfies"] = dm.to_selfies(mol)
    #row["inchi"] = dm.to_inchi(mol)
    #row["inchikey"] = dm.to_inchikey(mol)
    return row

df_p3a4 = df.apply(preprocess, axis = 1)
df_p3a4.head()

Unnamed: 0,pref_name,cyp_strength_of_evidence,drug_class,adverse_drug_reactions,chembl_id,max_phase,canonical_smiles,rdkit_mol,standard_smiles
0,CARBAMAZEPINE,strong,antiepileptics,"constipation^^, leucopenia^^, dizziness^^, som...",CHEMBL108,4,NC(=O)N1c2ccccc2C=Cc2ccccc21,<rdkit.Chem.rdchem.Mol object at 0x13ed7f920>,NC(=O)N1c2ccccc2C=Cc2ccccc21
1,ELIGLUSTAT,strong,metabolic_agents,"diarrhea^^, oropharyngeal_pain^^, arthralgia^^...",CHEMBL2110588,4,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...,<rdkit.Chem.rdchem.Mol object at 0x13ed7fa00>,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...
2,FLIBANSERIN,strong,CNS_agents,"dizziness^^, somnolence^^, sedation^, fatigue^...",CHEMBL231068,4,O=c1[nH]c2ccccc2n1CCN1CCN(c2cccc(C(F)(F)F)c2)CC1,<rdkit.Chem.rdchem.Mol object at 0x13ed7fa70>,O=c1[nH]c2ccccc2n1CCN1CCN(c2cccc(C(F)(F)F)c2)CC1
3,IMATINIB,strong,tyrosine_kinase_inhibitor,"rash^^, diarrhea^^, abdominal_pain^^, constipa...",CHEMBL941,4,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,<rdkit.Chem.rdchem.Mol object at 0x13ed7f8b0>,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...
4,IBRUTINIB,strong,tyrosine_kinase_inhibitor,"hypertension^^, atrial_fibrillation^^, sinus_t...",CHEMBL1873475,4,C=CC(=O)N1CCC[C@@H](n2nc(-c3ccc(Oc4ccccc4)cc3)...,<rdkit.Chem.rdchem.Mol object at 0x13ed7f760>,C=CC(=O)N1CCC[C@@H](n2nc(-c3ccc(Oc4ccccc4)cc3)...


In [8]:
## Splitting data 
# random splits usually lead to overly optimistic models... testing molecules are too similar to traininig molecules
# Some blog references re. data splitting wrt small molecules: 
# https://greglandrum.github.io/rdkit-blog/posts/2024-05-31-scaffold-splits-and-murcko-scaffolds1.html
# https://practicalcheminformatics.blogspot.com/2024/11/some-thoughts-on-splitting-chemical.html

## Try using Pat Walters' useful_rdkit_utils' GroupKFoldShuffle 
# (code originated from: https://github.com/scikit-learn/scikit-learn/issues/20520)

# Generate numpy arrays containing the fingerprints 
df_p3a4['fp'] = df_p3a4.rdkit_mol.apply(rdFingerprintGenerator.GetMorganGenerator().GetCountFingerprintAsNumPy)

# Get Butina cluster labels
df_p3a4["butina_cluster"] = uru.get_butina_clusters(df_p3a4.standard_smiles)

# Set up a GroupKFoldShuffle object
group_kfold_shuffle = uru.GroupKFoldShuffle(n_splits=5, shuffle=True)

# Using cross-validation/doing data split
## X = np.stack(df_s3a4.fp), y = df.adverse_drug_reactions, group labels = df_s3a4.butina_cluster
for train, test in group_kfold_shuffle.split(np.stack(df_p3a4.fp), df.adverse_drug_reactions, df_p3a4.butina_cluster):
    print(len(train),len(test))

22 5
22 5
23 4
20 7
21 6


In [9]:
## Figuring out locating train and test sets:

## create a dictionary as {index: butina label} first? --> not this way I think...
## butina cluster labels vs. index
#df_s3a4["butina_cluster"]

## or maybe can directly convert from numpy to tensor --> not this way! 
## will need to locate drugs via indices first to specify training and testing sets
# torch_train = torch.from_numpy(train)
# torch_train
# torch_test = torch.from_numpy(test)
# torch_test

In [10]:
# Locate train & test sets in the original df using pd.iloc 
# Training set indices
train

array([ 0,  1,  3,  4,  6,  7,  8, 11, 12, 13, 15, 16, 17, 19, 20, 21, 22,
       23, 24, 25, 26])

In [11]:
df_p3a4.head(2)

Unnamed: 0,pref_name,cyp_strength_of_evidence,drug_class,adverse_drug_reactions,chembl_id,max_phase,canonical_smiles,rdkit_mol,standard_smiles,fp,butina_cluster
0,CARBAMAZEPINE,strong,antiepileptics,"constipation^^, leucopenia^^, dizziness^^, som...",CHEMBL108,4,NC(=O)N1c2ccccc2C=Cc2ccccc21,<rdkit.Chem.rdchem.Mol object at 0x13ed7f920>,NC(=O)N1c2ccccc2C=Cc2ccccc21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20
1,ELIGLUSTAT,strong,metabolic_agents,"diarrhea^^, oropharyngeal_pain^^, arthralgia^^...",CHEMBL2110588,4,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...,<rdkit.Chem.rdchem.Mol object at 0x13ed7fa00>,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...,"[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",19


In [12]:
# Convert indices into list
train_set = train.tolist()
# Locate drugs and drug info via pd.DataFrame.iloc
df_train = df_p3a4.iloc[train_set]
print(df_train.shape)
df_train.head(2)

(21, 11)


Unnamed: 0,pref_name,cyp_strength_of_evidence,drug_class,adverse_drug_reactions,chembl_id,max_phase,canonical_smiles,rdkit_mol,standard_smiles,fp,butina_cluster
0,CARBAMAZEPINE,strong,antiepileptics,"constipation^^, leucopenia^^, dizziness^^, som...",CHEMBL108,4,NC(=O)N1c2ccccc2C=Cc2ccccc21,<rdkit.Chem.rdchem.Mol object at 0x13ed7f920>,NC(=O)N1c2ccccc2C=Cc2ccccc21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20
1,ELIGLUSTAT,strong,metabolic_agents,"diarrhea^^, oropharyngeal_pain^^, arthralgia^^...",CHEMBL2110588,4,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...,<rdkit.Chem.rdchem.Mol object at 0x13ed7fa00>,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...,"[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",19


In [13]:
# Testing set indices
test

array([ 2,  5,  9, 10, 14, 18])

In [14]:
test_set = test.tolist()
df_test = df_p3a4.iloc[test_set]
print(df_test.shape)
df_test.head(3)

(6, 11)


Unnamed: 0,pref_name,cyp_strength_of_evidence,drug_class,adverse_drug_reactions,chembl_id,max_phase,canonical_smiles,rdkit_mol,standard_smiles,fp,butina_cluster
2,FLIBANSERIN,strong,CNS_agents,"dizziness^^, somnolence^^, sedation^, fatigue^...",CHEMBL231068,4,O=c1[nH]c2ccccc2n1CCN1CCN(c2cccc(C(F)(F)F)c2)CC1,<rdkit.Chem.rdchem.Mol object at 0x13ed7fa70>,O=c1[nH]c2ccccc2n1CCN1CCN(c2cccc(C(F)(F)F)c2)CC1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",18
5,NERATINIB,strong,tyrosine_kinase_inhibitor,"diarrhea^^, abdominal_pain^^, stomatitis^^, dy...",CHEMBL180022,4,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,<rdkit.Chem.rdchem.Mol object at 0x13ed7fb50>,CCOc1cc2ncc(C#N)c(Nc3ccc(OCc4ccccn4)c(Cl)c3)c2...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",15
9,NALOXEGOL,strong,peripheral_opioid_receptor_antagonists,"abdominal pain^^, possible_opioid_withdrawal_s...",CHEMBL2219418,4,C=CCN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](OCCOCC...,<rdkit.Chem.rdchem.Mol object at 0x13ed7fed0>,C=CCN1CC[C@]23c4c5ccc(O)c4O[C@H]2[C@@H](OCCOCC...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 0,...",1


In [15]:
## Using Butina clustering/splits to split data - to do this, it requires SMILES in order to generate fingerprints 
## which will be used in X_train, X_test
## I may only use these SMILES to this extent for the current post, but for future posts these SMILES might be utilised more...

In [16]:
df_train.head(2)

Unnamed: 0,pref_name,cyp_strength_of_evidence,drug_class,adverse_drug_reactions,chembl_id,max_phase,canonical_smiles,rdkit_mol,standard_smiles,fp,butina_cluster
0,CARBAMAZEPINE,strong,antiepileptics,"constipation^^, leucopenia^^, dizziness^^, som...",CHEMBL108,4,NC(=O)N1c2ccccc2C=Cc2ccccc21,<rdkit.Chem.rdchem.Mol object at 0x13ed7f920>,NC(=O)N1c2ccccc2C=Cc2ccccc21,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",20
1,ELIGLUSTAT,strong,metabolic_agents,"diarrhea^^, oropharyngeal_pain^^, arthralgia^^...",CHEMBL2110588,4,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...,<rdkit.Chem.rdchem.Mol object at 0x13ed7fa00>,CCCCCCCC(=O)N[C@H](CN1CCCC1)[C@H](O)c1ccc2c(c1...,"[0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",19


In [17]:
## Separate df_train into X_train & y_train, then separate df_test in X_test & y_test

# Use scikit_learn's train_test_split() on df_train - to get X_train, y_train --> no need for this I think...

## NOTE: this step may be integrated with one-hot encoding and vector embeddings!

Converting X & y variables into one-hot encodings or vector embeddings and also set up X_train, y_train, X_test, y_test

In [18]:
## X_train
# 1. convert "cyp_strength_of_evidence" column into one-hot encoding
from torch.nn.functional import one_hot

# re: pandas setting-with-copy warning
# If using df["column_name"], this'll trigger a warning as shown below:
# A value is trying to be set on a copy of a slice from a DataFrame.
# Try using .loc[row_indexer,col_indexer] = value instead
# ref: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html#copy-on-write-cow
# Enable copy-on-write globally to remove the warning
pd.options.mode.copy_on_write = True

# replace CYP strength as numbers
# a useful thread to solve downcasting issue in pd.DataFrame.replace() - https://github.com/pandas-dev/pandas/issues/57734
with pd.option_context('future.no_silent_downcasting', True):
   df_train["cyp_strength_of_evidence"] = df_train["cyp_strength_of_evidence"].replace({"strong": 1, "mod": 2}).infer_objects()
   df_test["cyp_strength_of_evidence"] = df_test["cyp_strength_of_evidence"].replace({"strong": 1, "mod": 2}).infer_objects()

# Get total number of CYP strengths in df
total_cyp_str_train = len(set(df_train["cyp_strength_of_evidence"]))

# re: PyTorch user warning
# if using df_train["cyp_strength_of_evidence"].values - this leads to non-writable tensors with a warning as shown below:
# UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. 
# This means writing to this tensor will result in undefined behavior. 
# You may want to copy the array to protect its data or make it writable before converting it to a tensor. 
# This type of warning will be suppressed for the rest of this program. 
# (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/utils/tensor_numpy.cpp:212.)
# One solution is to add copy() e.g.
# col_encoded = one_hot(torch.from_numpy(df["column_name"].values.copy()) % total_numbers_in_column)
# Alternatively, convert column into numpy array first, then make the numpy array writeable

cyp_array_train = df_train["cyp_strength_of_evidence"].to_numpy()
cyp_array_train.flags.writeable = True
cyp_str_train_t = one_hot(torch.from_numpy(cyp_array_train) % total_cyp_str_train)
cyp_str_train_t

tensor([[0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0],
        [1, 0]])

In [19]:
# 2. Convert "adverse_drug_reactions" column into embeddings
## see separate scripts used previously e.g. words_tensors.py 
## or Tensors_for_adrs_interactive.py to show step-by-step conversions from words to tensors

# Save all ADRs from common ADRs column as a list (joining every row of ADRs in place only)
adr_str_train = df_train["adverse_drug_reactions"].tolist()
# Join separate rows of strings into one complete string
adr_string_train = ",".join(adr_str_train)
# Converting all ADRs into Torch tensors using adr_tensors.py
from words_tensors import words_tensors
adr_train_t = words_tensors(adr_string_train)
adr_train_t

tensor([[-1.5256, -0.7502],
        [-0.6540, -1.6095],
        [-0.1002, -0.6092],
        ...,
        [ 1.2052, -1.8156],
        [ 0.4502,  1.2972],
        [ 0.0344,  1.1384]], grad_fn=<EmbeddingBackward0>)

In [20]:
# Convert "fp" column into tensors
fp_train_array = np.stack(df_train["fp"])
# Convert numpy array data type from uint32 to int32
fp_train_array = fp_train_array.astype("int32")
fp_train_t = torch.from_numpy(fp_train_array)
fp_train_t = torch.reshape(fp_train_t, (21504,2)) # origianl fp_train_t.shape = (21, 2048); note: this may change due to shuffle = True at data split step
fp_train_t.shape # tensor.ndim to check tensor dimensions

torch.Size([21504, 2])

In [21]:
adr_train_t.shape

torch.Size([811, 2])

In [22]:
cyp_str_train_t.shape

torch.Size([21, 2])

In [23]:
# Concatenate adr tensors, fingerprint tensors and cyp strength tensors as X_train
X_train = torch.cat([adr_train_t, fp_train_t, cyp_str_train_t], 0).float()
X_train

tensor([[-1.5256, -0.7502],
        [-0.6540, -1.6095],
        [-0.1002, -0.6092],
        ...,
        [ 1.0000,  0.0000],
        [ 1.0000,  0.0000],
        [ 1.0000,  0.0000]], grad_fn=<CatBackward0>)

In [24]:
## X_test
# 1. Convert cyp strength into one-hot encodings
total_cyp_str_test = len(set(df_test["cyp_strength_of_evidence"]))
array_test = df_test["cyp_strength_of_evidence"].to_numpy()
array_test.flags.writeable = True
cyp_str_test_t = one_hot(torch.from_numpy(array_test) % total_cyp_str_test)

# 2. Convert "adverse_drug_reactions" column into embeddings
adr_str_test = df_test["adverse_drug_reactions"].tolist()
adr_string_test = ",".join(adr_str_test)
adr_test_t = words_tensors(adr_string_test)

# 3. Convert "fp" column into tensors
fp_test_array = np.stack(df_test["fp"])
fp_test_array = fp_test_array.astype("int32")
fp_test_t = torch.from_numpy(fp_test_array)
fp_test_t = torch.reshape(fp_test_t, (6144,2)) # original fp_test_t.shape = (4, 2048); note: this may change due to shuffle = True at data split step

# Concatenate adr tensors, drug class tensors and cyp strength tensors as X_test
X_test = torch.cat([cyp_str_test_t, adr_test_t, fp_test_t], 0).float()
X_test

tensor([[0., 1.],
        [0., 1.],
        [0., 1.],
        ...,
        [0., 0.],
        [0., 0.],
        [0., 0.]], grad_fn=<CatBackward0>)

In [25]:
## y_train
# Use drug_class column as target
# Convert "drug_class" column into embeddings 
# total number of drug classes in df = 20 - len(set(df["drug_class"])) - using embeddings instead of one-hot
dc_str_train = df_train["drug_class"].tolist()
dc_string_train = ",".join(dc_str_train)
y_train = words_tensors(dc_string_train)
y_train

tensor([[ 0.6791, -0.4543]], grad_fn=<EmbeddingBackward0>)

In [26]:
## y_test
# Convert "drug_class" column into embeddings 
dc_str_test = df_test["drug_class"].tolist()
dc_string_test = ",".join(dc_str_test)
y_test = words_tensors(dc_string_test)
y_test

tensor([[-0.2149, -1.8579]], grad_fn=<EmbeddingBackward0>)

In [27]:
#from torch.utils.data import TensorDataset, DataLoader

## Create a PyTorch dataset (reference code below)
# training_data = TensorDataset(X_train, y_train)
# torch.manual_seed(1)
# batch_size = 2

## Create a dataset loader - DataLoader (reference code below)
# train_dataloader = DataLoader(training_data, batch_size, shuffle = True)

In [28]:
## Set up a DNN regression model 

In [29]:
# May need to set up a class with a few different functions (possibly in separate .py scripts then run in notebook first)

* Structure-adverse drug reaction relationships: 
**ADRs <-> (dense vectors of real numbers) <-> 2D drug structures**

* Structure-activity relationships: 
**drug activities <-> 2d drug structures**

1. First post is to build a basic DNN regression model initially to predict therapeutic drug classes of drugs via using CYP strength, molecule fingerprints and ADRs to infer possible drugs vs. ADRs relationships

2. 2D drug structures part (much further down the line as separate posts; this may evolve into multiple posts in other relevant aspects...)
- graph neural networks (GNN - other variations also available): molecules as undirected graphs where the connections between nodes (atoms) and edges (bonds) don't matter (i.e. don't need to be in particular orders or sequences) 
OR 
- RNN that uses SMILES (NLP technique) -> tokenize SMILES strings -> converts into a dictionary mapping tokens to indices in the vocabulary -> converts the vocabulary (SMILES strings) into one-hot encodings