# Clinical Trial Embedding Tutorial

In this tutorial I will show you how to obtain clinical trial information and use embeddings for different types of clinical trial data.

Agenda:
- Collect all clinical trial records from clinicaltrials.gov
- Read and parse the obtained XML files 
- Embed **disease indications** using 'nlpie/tiny-biobert', a compact version of BioBERT
- Embed clinical trial **inclusion-/exclusion criteria** using 'nlpie/tiny-biobert', a compact version of BioBERT
- Embed **sponsor information** using 'all-MiniLM-L6-v2', a powerful pre-trained sentence encoder
- Convert **drug names** to their SMILES representation and then to their Morgan fingerprint

Let's start!

# Import libraries

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from functools import reduce

# Collect all the clinical trial records

I suggest running the whole process in the command line since it is time- and space consuming. 

### 1. Download data
mkdir -p raw_data \
cd raw_data \
wget https://clinicaltrials.gov/AllPublicXML.zip # This will take 10-20 minutes to download

### 2. Unzip the ZIP file.
### The unzipped file occupies approximately 11 GB. Please make sure you have enough space. 
unzip AllPublicXML.zip # This might take over an hour to run, depending on your system \
cd ../

### 3. Collect and sort all the XML files
find raw_data/ -name NCT*.xml | sort > data/all_xml \
head -3 data/all_xml

### Output:
raw_data/NCT0000xxxx/NCT00000102.xml \
raw_data/NCT0000xxxx/NCT00000104.xml \
raw_data/NCT0000xxxx/NCT00000105.xml 

NCTID is the identifier of a clinical trial. `NCT00000102`, `NCT00000104`, `NCT00000105` are all NCTIDs. 

### 4. Remove ZIP file to recover some disk space
rm raw_data/AllPublicXML.zip

# Parse XML clinical trial files

In [9]:
from xml.etree import ElementTree as ET
# function adapted from https://github.com/futianfan/clinical-trial-outcome-prediction
def xmlfile2results(xml_file):
    tree = ET.parse(xml_file)
    root = tree.getroot()
    nctid = root.find('id_info').find('nct_id').text	### nctid: 'NCT00000102'
    # print("nctid is", nctid)
    study_type = root.find('study_type').text
    # print("study type is", study_type)
    interventions = [i for i in root.findall('intervention')]
    drug_interventions = [i.find('intervention_name').text for i in interventions \
														if i.find('intervention_type').text=='Drug']
    # print("drug intervention:", drug_interventions)
    ### remove 'biologics', 
    ### non-interventions 
    if len(drug_interventions)==0:
        return (None,)

    try:
        status = root.find('overall_status').text 
        # print("status:", status)
    except:
        status = ''

    try:
        why_stop = root.find('why_stopped').text
        # print("why stop:", why_stop)
    except:
        why_stop = ''

    try:
        phase = root.find('phase').text
        # print("phase:", phase)
    except:
        phase = ''
    conditions = [i.text for i in root.findall('condition')] ### disease 
    # print("disease", conditions)

    try:
        criteria = root.find('eligibility').find('criteria').find('textblock').text
        # print('found criteria')
    except:
        criteria = ''

    try:
        enrollment = root.find('enrollment').text
        # print("enrollment:", enrollment)
    except:
        enrollment = ''

    try:
        lead_sponsor = root.find('sponsors').find('lead_sponsor').find('agency').text 
        # print("lead_sponsor:", lead_sponsor)
    except:
        lead_sponsor = ''

    data = {'nctid':nctid,
           'study_type':study_type,
           'drug_interventions':[drug_interventions],
           'overall_status':status,
           'why_stopped':why_stop,
           'phase':phase,
           'indications':[conditions],
           'criteria':criteria,
           'enrollment':enrollment,
           'lead_sponsor':lead_sponsor}
    return pd.DataFrame(data)
    
xmlfile = "data/NCT00040014.xml"
df = xmlfile2results(xmlfile)
df.head()

Unnamed: 0,nctid,study_type,drug_interventions,overall_status,why_stopped,phase,indications,criteria,enrollment,lead_sponsor
0,NCT00040014,Interventional,[exemestane],Terminated,,Phase 2,[Breast Neoplasms],\n Inclusion Criteria:\r\n\r\n ...,100,Pfizer


In [None]:
# We will only use a limited selection of trials, the same as the HINT paper (https://www.cell.com/patterns/pdf/S2666-3899(22)00018-6.pdf)
# This way, we can later on compare performaces of the clinical trial outcome prediction
df_selected = pd.read_pickle('data/selected_trials_df.pkl')
toy_nctids = df_selected[df_selected['dataset']=='toy']['nctid'].tolist()
toy_df = pd.DataFrame()

#Parse the XML file for each selected trial and save resulting dataframe
for nctid in tqdm(toy_nctids):
    try:
        xml_file = 'raw_data/'+nctid[:7]+'xxxx/'+nctid+'.xml'
        df = xmlfile2results(xml_file)
        toy_df = pd.concat([toy_df, df], axis=0)  
    except FileNotFoundError:
        print(f"The file {file} does not exist.")
        continue

toy_df = toy_df.merge(df_selected[df_selected['dataset']=='toy'], on='nctid', how='left')
pickle.dump(toy_df, open('data/toy_df.pkl', 'wb'))  
toy_df.head()

# Using sentence-transformers to embed information - Example

In [None]:
!pip install -U sentence-transformers

In [32]:
from sentence_transformers import SentenceTransformer

In [1]:
sentences = ["This is an example sentence", "Each sentence is converted"]
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(sentences)
print(embeddings.shape)
print(embeddings)

No sentence-transformers model found with name C:\Users\Lennart/.cache\torch\sentence_transformers\nlpie_tiny-biobert. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\Lennart/.cache\torch\sentence_transformers\nlpie_tiny-biobert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(2, 312)


# Indication Embedding 
### Create indication2embedding_dict using nlpie/tiny-biobert

In [56]:
def create_indication2embedding_dict():
    # Import toy dataset
    toy_df = pd.read_pickle('data/toy_df.pkl')

    # Create list with all indications and encode each one into a 312-dimensional vector
    all_indications = sorted(set(reduce(lambda x, y: x + y, toy_df['indications'].tolist())))     

    # Using 'nlpie/tiny-biobert', a smaller version of BioBERT
    model = SentenceTransformer('nlpie/tiny-biobert')
    embeddings = model.encode(all_indications, show_progress_bar=True)

    # Create dictionary mapping indications to embeddings
    indication2embedding_dict = {}
    for key, row in zip(all_indications, embeddings):
        indication2embedding_dict[key] = row
    pickle.dump(indication2embedding_dict, open('data/indication2embedding_dict.pkl', 'wb')) 
        
    embedding = []
    for indication_lst in tqdm(toy_df['indications'].tolist()):
        vec = []
        for indication in indication_lst:
            vec.append(indication2embedding_dict[indication])
        print(np.array(vec).shape) # DEBUG
        vec = np.mean(np.array(vec), axis=0)
        print(vec.shape) # DEBUG
        embedding.append(vec)
    print(np.array(embedding).shape)
    
    dict = zip(toy_df['nctid'], np.array(embedding))
    nctid2disease_embedding_dict = {}
    for key, row in zip(toy_df['nctid'], np.array(embedding)):
        nctid2disease_embedding_dict[key] = row
    pickle.dump(nctid2disease_embedding_dict, open('data/nctid2disease_embedding_dict.pkl', 'wb'))  
    
create_indication2embedding_dict()

No sentence-transformers model found with name C:\Users\Lennart/.cache\torch\sentence_transformers\nlpie_tiny-biobert. Creating a new one with MEAN pooling.
Some weights of BertModel were not initialized from the model checkpoint at C:\Users\Lennart/.cache\torch\sentence_transformers\nlpie_tiny-biobert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Batches:   0%|          | 0/43 [00:00<?, ?it/s]

 57%|███████████████████████████████████████████▌                                 | 831/1469 [00:00<00:00, 3910.09it/s]

(2, 312)
(312,)
(3, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(3, 312)
(312,)
(9, 312)
(312,)
(5, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(8, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(5, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(2, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(5, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(4, 312)

100%|████████████████████████████████████████████████████████████████████████████| 1469/1469 [00:00<00:00, 3675.90it/s]

(1, 312)
(312,)
(5, 312)
(312,)
(3, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(4, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(5, 312)
(312,)
(1, 312)
(312,)
(4, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(3, 312)
(312,)
(1, 312)
(312,)
(2, 312)
(312,)
(2, 312)

(1469, 312)





# Sponsor Embedding 
### Create sponsor2embedding_dict using all-MiniLM-L6-v2

In [14]:
def create_sponsor2embedding_dict():
    # Import toy dataset
    toy_df = pd.read_pickle('data/toy_df.pkl')

    # Create list with all indications and encode each one into a 384-dimensional vector
    all_sponsors = sorted(set(toy_df['lead_sponsor'].tolist()))     

    # Using 'all-MiniLM-L6-v2', a pre-trained model with excellent performance and speed
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(all_sponsors, show_progress_bar=True)
    print(embeddings.shape)

    # Create dictionary mapping indications to embeddings
    sponsor2embedding_dict = {}
    for key, row in zip(all_sponsors, embeddings):
        sponsor2embedding_dict[key] = row
    pickle.dump(sponsor2embedding_dict, open('data/sponsor2embedding_dict.pkl', 'wb'))
    
create_sponsor2embedding_dict()

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

(459, 384)


# Protocol Embedding

In [16]:
# Helper functions to clean up protocols from https://github.com/futianfan/clinical-trial-outcome-prediction/blob/main/HINT/protocol_encode.py
def clean_protocol(protocol):
    protocol = protocol.lower()
    protocol_split = protocol.split('\n')
    filter_out_empty_fn = lambda x: len(x.strip())>0
    strip_fn = lambda x:x.strip()
    protocol_split = list(filter(filter_out_empty_fn, protocol_split))
    protocol_split = list(map(strip_fn, protocol_split))
    return protocol_split 

def split_protocol(protocol):
    protocol_split = clean_protocol(protocol)
    inclusion_idx, exclusion_idx = len(protocol_split), len(protocol_split)
    for idx, sentence in enumerate(protocol_split):
        if "inclusion" in sentence:
            inclusion_idx = idx
            break
    for idx, sentence in enumerate(protocol_split):
        if "exclusion" in sentence:
            exclusion_idx = idx 
            break 		
    if inclusion_idx + 1 < exclusion_idx + 1 < len(protocol_split):
        inclusion_criteria = protocol_split[inclusion_idx:exclusion_idx]
        exclusion_criteria = protocol_split[exclusion_idx:]
        if not (len(inclusion_criteria) > 0 and len(exclusion_criteria) > 0):
            print(len(inclusion_criteria), len(exclusion_criteria), len(protocol_split))
            exit()
        return inclusion_criteria, exclusion_criteria ## list, list 
    else:
        return protocol_split, 

In [25]:
# Example of clean-up functions
# Import toy dataset
toy_df = pd.read_pickle('data/toy_df.pkl')
# split_protocol() cleans and splits web-scraped criteria into lists of inclusion and exclusion criteria
split_protocol(toy_df['criteria'][0])

(['inclusion criteria:',
  '-',
  'patients must have:',
  'unipolar major depression (per diagnostic and statistical manuel-iv criteria) with or',
  'without melancholia.'],
 ['exclusion criteria:',
  '-',
  'patients with the following symptoms or conditions are excluded:',
  'psychotic or atypical subtype of unipolar major depression.'])

### Create nctid2protocol_embedding_dict using nlpie/tiny-biobert

In [None]:
def create_nctid2protocol_embedding_dict():
     # Import toy dataset
    toy_df = pd.read_pickle('data/toy_df.pkl')
    
    # Using 'nlpie/tiny-biobert', a smaller version of BioBERT
    model = SentenceTransformer('nlpie/tiny-biobert')
    
    def criteria2vec(criteria):
        embeddings = model.encode(criteria)
#         print(embeddings.shape) # DEBUG
        embeddings_avg = np.mean(embeddings, axis=0)
#         print(embeddings_avg.shape) # DEBUG
        return embeddings_avg
    
    nctid_2_protocol_embedding = dict()
    print(f"Embedding {len(toy_df)*2} inclusion/exclusion criteria..")
    for nctid, protocol in tqdm(zip(toy_df['nctid'].tolist(), toy_df['criteria'].tolist())):    
#         if(nctid == 'NCT00003567'): break #DEBUG
        split = split_protocol(protocol)
        if len(split)==2:
            embedding = np.concatenate((criteria2vec(split[0]), criteria2vec(split[1])))
        else: 
            embedding = np.concatenate((criteria2vec(split[0]), np.zeros(312)))
        nctid_2_protocol_embedding[nctid] = embedding
#         for key in nctid_2_protocol_embedding: #DEBUG
#             print(f"{key}:{nctid_2_protocol_embedding[key].shape}") #DEBUG
    pickle.dump(nctid_2_protocol_embedding, open('data/nctid_2_protocol_embedding_dict.pkl', 'wb'))   
    return 

create_nctid2protocol_embedding_dict()

# Drug molecule embedding
### Converting drug names to their SMILES representation

In [4]:
import requests

def get_smiles(drug_name):
    # URL for the CIR API
    base_url = "https://cactus.nci.nih.gov/chemical/structure"
    url = f"{base_url}/{drug_name}/smiles"
    
    try:
        # Send a GET request to retrieve the SMILES representation
        response = requests.get(url)
    
        if response.status_code == 200:
            smiles = response.text.strip()  # Get the SMILES string
            print(f"Drug Name: {drug_name}")
            print(f"SMILES: {smiles}")
        else:
            print(f"Failed to retrieve SMILES for {drug_name}. Status code: {response.status_code}")
            smiles = ''
    
    except requests.exceptions.RequestException as e:
        print(f"An error occurred: {e}")

    return smiles

# Define the drug name you want to convert
drug_name = "aspirin"  # Replace with the drug name of your choice
get_smiles(drug_name)

Drug Name: aspirin
SMILES: CC(=O)Oc1ccccc1C(O)=O


'CC(=O)Oc1ccccc1C(O)=O'

### Create drug2smiles_dict

In [None]:
import pandas as pd
from functools import reduce

# Import toy dataset
toy_df = pd.read_pickle('data/toy_df.pkl')

# Create list with all drugs and encode each one into its SMILES representation
all_drugs = sorted(set(reduce(lambda x, y: x + y, toy_df['drug_interventions'].tolist())))     

# Create dictionary mapping indications to embeddings
drug2smiles_dict = {}
for drug in all_drugs:
    drug2smiles_dict[drug] = get_smiles(drug)
pickle.dump(drug2smiles_dict, open('data/drug2smiles_dict.pkl', 'wb')) 

### Converting SMILES to Morgan Fingerprint

In [None]:
!pip install DeepPurpose

In [None]:
from DeepPurpose.utils import encode_drug 
import pandas as pd

# Example list of SMILES strings representing drug molecules
smiles_list = pd.DataFrame(['O=C(C)Oc1ccccc1C(=O)O', 'CC(CC1=CC=CC=C1)C(=O)O', 'CN1CCN(CC1)C2=C(C=CC(=C2)OC)OC'], columns=['SMILES'])

# Encode the drug molecules
drug_encodings = encode_drug(smiles_list, drug_encoding='Morgan', column_name = 'SMILES', save_column_name = 'drug_encoding')

# Print the encoded representations
for x in drug_encodings['drug_encoding']:
    print(x.shape)

drug_encodings.head()

In [None]:
# Helper function to clean up protocols from https://github.com/futianfan/clinical-trial-outcome-prediction/blob/main/HINT/protocol_encode.py
def txt_to_lst(text):
    """
        "['CN[C@H]1CC[C@@H](C2=CC(Cl)=C(Cl)C=C2)C2=CC=CC=C12', 'CNCCC=C1C2=CC=CC=C2CCC2=CC=CC=C12']" 
    """
    text = text[1:-1]
    lst = [i.strip()[1:-1] for i in text.split(',')]
    return lst 

def create_smiles2morgan_dict():
    from DeepPurpose.utils import smiles2morgan 

    # Import toy dataset
    toy_df = pd.read_csv('data/toy_df.csv')
        
    smiles_lst = list(map(txt_to_lst, toy_df['smiless'].tolist()))
    unique_smiles = set(reduce(lambda x, y: x + y, smiles_lst))
    
    morgan = pd.Series(list(unique_smiles)).apply(smiles2morgan)
    smiles2morgan_dict = dict(zip(unique_smiles, morgan))
    pickle.dump(smiles2morgan_dict, open('data/smiles2morgan_dict.pkl', 'wb')) 

create_smiles2morgan_dict()

def load_smiles2morgan_dict():
    with open('data/smiles2morgan_dict.pkl', 'rb') as pickle_file:
        return pickle.load(pickle_file)

### Create nctid2molecule_embedding_dict

In [None]:
import numpy as np
from tqdm import tqdm

def create_nctid2molecule_embedding_dict():
    # Import toy dataset
    toy_df = pd.read_csv('data/toy_df.csv')
    smiles_lst = list(map(txt_to_lst, toy_df['smiless'].tolist()))
    smiles2morgan_dict = load_smiles2morgan_dict()
    
    embedding = []
    for drugs in tqdm(smiles_lst):
        vec = []
        for drug in drugs:
            vec.append(smiles2morgan_dict[drug])
        # print(np.array(vec).shape) # DEBUG
        vec = np.mean(np.array(vec), axis=0)
        # print(vec.shape) # DEBUG
        embedding.append(vec)
    print(np.array(embedding).shape)
    
    dict = zip(toy_df['nctid'], np.array(embedding))
    nctid2molecule_embedding_dict = {}
    for key, row in zip(toy_df['nctid'], np.array(embedding)):
        nctid2molecule_embedding_dict[key] = row
    pickle.dump(nctid2molecule_embedding_dict, open('data/nctid2molecule_embedding_dict.pkl', 'wb'))  

create_nctid2molecule_embedding_dict()