## DRKG embedding preprocessing
#### Author: Jianfeng Ke
#### Email: jianfeng_ke@student.uml.edu

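### DRKG embeddings preprocessing

This notebook downloads the pretrained DRKG archive, looks up the TransE entity embeddings for the drugs and conditions that appear in the clinical-trial tables, and saves the matched embeddings as CSV files.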

In [26]:
import os
import subprocess

# Both the archive and its extracted contents live in the current working
# directory: every path in this cell is relative.

# Download the archive only when it is not already present.
if not os.path.exists('drkg.tar.gz'):
    print("DRKG files not found, downloading...")
    # Fetch under a temporary name and rename on success, so an interrupted
    # transfer never leaves a truncated 'drkg.tar.gz' that later runs would
    # mistake for a complete download. check=True raises CalledProcessError
    # on a non-zero exit status instead of failing silently.
    subprocess.run(
        ["wget", "-O", "drkg.tar.gz.part",
         "https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz"],
        check=True,
    )
    os.replace("drkg.tar.gz.part", "drkg.tar.gz")
else:
    print("DRKG files already downloaded.")

# Extract the archive only when the 'embed' directory does not exist yet.
if not os.path.exists('embed'):
    print("Unzipping drkg.tar.gz...")
    # check=True makes a failed extraction raise, so "Finished!" is only
    # printed when tar actually succeeded.
    subprocess.run(["tar", "-xzvf", "drkg.tar.gz"], check=True)
    print("Finished!")
else:
    print("drkg.tar.gz is already unzipped")

DRKG files already downloaded.
drkg.tar.gz is already unzipped


In [27]:
# NOTE: keep the trailing '/' on every directory below; the rest of the
# notebook builds file paths by plain string concatenation.

# Working directory: the filtered embedding tables are written here
work_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Drug_combinations/06122025/drkg_embedding_preprocessing/'
# Clinical-trial inputs: folder holding the preprocessed clinical-trial tables
ct_dir = '/project/pi_rachel_melamed_uml_edu/Panos/drug_combo_jianfeng/CT_20250605/'
# DRKG inputs: folder in which drkg.tar.gz was downloaded and unzipped
drkg_dir = '/project/pi_rachel_melamed_uml_edu/Jianfeng/Drug_combinations/06122025/drkg_embedding_preprocessing/'

In [28]:
import numpy as np
import pandas as pd

# --- Clinical-trial tables (tab-separated text files under ct_dir) ---
# One drug paired with one condition per row
drug_condition_path = ct_dir + 'drug_condition_20250605.txt'
drug_cond_df = pd.read_csv(drug_condition_path, sep='\t')
# Drug-combination records
combination_path = ct_dir + 'dcombinations_w_conditions_20250605.txt'
comb_df = pd.read_csv(combination_path, sep='\t')

# Distinct drug and condition identifiers as plain Python lists
# (np.unique returns them sorted)
ct_drug = np.unique(drug_cond_df['drug']).tolist()
ct_cond = np.unique(drug_cond_df['condition']).tolist()
print(f'There are {len(ct_drug)} drugs and {len(ct_cond)} conditions in {len(drug_cond_df)} clinical trials.')

# --- Pretrained DRKG TransE entity embeddings and the entity-name table ---
entity_embedding_path = drkg_dir + 'embed/DRKG_TransE_l2_entity.npy'
embeddings = np.load(entity_embedding_path)
# entities.tsv is read without a header row; its two columns are named here
entity_table_path = drkg_dir + 'embed/entities.tsv'
entity_name = pd.read_csv(entity_table_path, sep='\t', names=['entity', 'id'])
print(f'There are {embeddings.shape[0]} entities in total in DRKG, and the embedding size is {embeddings.shape[1]}.')

There are 1942 drugs and 3363 conditions in 105608 clinical trials.
There are 97238 entities in total in DRKG, and the embedding size is 400.


In [29]:
def _index_entities(entities, prefix, wanted):
    """Map identifiers to list positions for the entities of one type.

    Parameters
    ----------
    entities : sequence
        Entity names in file order, e.g. ``"Compound::<identifier>"``.
    prefix : str
        Exact leading text an entity name must have to be considered,
        e.g. ``"Compound::"``. The identifier is whatever follows it.
    wanted : iterable
        Identifiers to keep.

    Returns
    -------
    dict
        ``{identifier: position}`` ordered by identifier. When the same
        identifier appears more than once, the last position wins.
    """
    # A set makes each membership test O(1); the inputs are lists, and
    # scanning them once per entity is needlessly quadratic.
    wanted = set(wanted)
    matches = {}
    for position, name in enumerate(entities):
        # Skip non-string cells (e.g. a value pandas parsed as NaN) and
        # require the type prefix at the start of the name, not anywhere in it.
        if isinstance(name, str) and name.startswith(prefix):
            identifier = name[len(prefix):]
            if identifier in wanted:
                matches[identifier] = position
    return {key: matches[key] for key in sorted(matches)}


entity_list = list(entity_name['entity'])

# NOTE(review): an entity's position in entities.tsv is used as its row in
# `embeddings`; this assumes the file's `id` column equals the row number.
# TODO confirm.

# Drugs: "Compound::<drug id>" entities whose id occurs in the clinical trials
drug_dict = _index_entities(entity_list, "Compound::", ct_drug)
# Conditions: only "Disease::MESH:<id>" entities are matched against ct_cond
cond_dict = _index_entities(entity_list, "Disease::MESH:", ct_cond)

# print info
print('DRKG has pretrained embeddings for:')
print(f'  - {len(drug_dict)}/{len(ct_drug)} drugs;')
print(f'  - {len(cond_dict)}/{len(ct_cond)} conditions;')

# Extract the matching embedding rows, labelled by identifier; the index is
# named "id" so it becomes the first column header of the saved CSV.
drug_embedding = pd.DataFrame(
    embeddings[list(drug_dict.values())], index=list(drug_dict)
).rename_axis('id')
cond_embedding = pd.DataFrame(
    embeddings[list(cond_dict.values())], index=list(cond_dict)
).rename_axis('id')

# save the matrices
drug_embedding.to_csv(work_dir+'drkg_filtered_drug_embedding.csv')
cond_embedding.to_csv(work_dir+'drkg_filtered_condition_embedding.csv')
print('Finished saving DRKG embeddings for drugs and conditions!')

DRKG has pretrained embeddings for:
  - 1832/1942 drugs;
  - 2684/3363 conditions;
Finished saving DRKG embeddings for drugs and conditions!
