<a href="https://colab.research.google.com/github/jperez-1010/bert2sage_data/blob/main/DRKG_mapping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

DRKG

Adapted from: https://github.com/gnn4dr/DRKG/blob/master/drkg_with_dgl/loading_drkg_in_dgl.ipynb

In [None]:
# https://www.dgl.ai/pages/start.html
!pip install  dgl -f https://data.dgl.ai/wheels/repo.html
!pip install  dglgo -f https://data.dgl.ai/wheels-test/repo.html
!pip install  dglke
!pip install tabulate
#!git clone    https://github.com/gnn4dr/DRKG.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels/repo.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.dgl.ai/wheels-test/repo.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Cloning into 'DRKG'...
remote: Enumerating objects: 296, done.[K
remote: Counting objects: 100% (8/8), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 296 (delta 5), reused 2 (delta 2), pack-reused 288[K
Receiving objects: 100% (296/296), 19.56 MiB | 19.49 MiB/s, done.
Resolving deltas: 100% (146/146), done.


In [None]:
### Mount Drive ###
# Colab-only step: mounts Google Drive at /content/drive.
# NOTE(review): no cell visible in this notebook reads from /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

### Import Statements ###

import dgl
import csv
import sys
import torch
import tabulate
import pandas as pd
import numpy as np
import os 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Download and unzip files

In [None]:
# DRKG
!wget https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz
!tar -xvf  'drkg.tar.gz' 

# Hetionet mapping
!wget https://raw.githubusercontent.com/hetio/hetionet/main/hetnet/tsv/hetionet-v1.0-nodes.tsv

# MeSH terms
!wget https://raw.githubusercontent.com/dhimmel/mesh/gh-pages/data/terms.tsv

--2023-04-29 22:42:30--  https://dgl-data.s3-us-west-2.amazonaws.com/dataset/DRKG/drkg.tar.gz
Resolving dgl-data.s3-us-west-2.amazonaws.com (dgl-data.s3-us-west-2.amazonaws.com)... 52.218.222.1, 52.218.184.105, 52.218.250.129, ...
Connecting to dgl-data.s3-us-west-2.amazonaws.com (dgl-data.s3-us-west-2.amazonaws.com)|52.218.222.1|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 216650245 (207M) [application/x-tar]
Saving to: ‘drkg.tar.gz.2’


2023-04-29 22:42:42 (18.0 MB/s) - ‘drkg.tar.gz.2’ saved [216650245/216650245]

._drkg.tsv
drkg.tsv
._embed
embed/
embed/DRKG_TransE_l2_relation.npy
embed/._relations.tsv
embed/relations.tsv
embed/._entities.tsv
embed/entities.tsv
embed/Readme.md
embed/mol_edgepred.npy
embed/mol_infomax.npy
embed/mol_masking.npy
embed/mol_contextpred.npy
embed/DRKG_TransE_l2_entity.npy
._entity2src.tsv
entity2src.tsv
._relation_glossary.tsv
relation_glossary.tsv
--2023-04-29 22:42:47--  https://raw.githubusercontent.com/hetio/hetionet/main/

In [None]:
######## Read Functions  ##################
def print_head(df:pd.core.frame.DataFrame,n:int=5) -> None:
  """ Print the first `n` rows of `df` as a psql-style text table (column labels as headers) """
  preview = df.head(n)
  table   = tabulate.tabulate(preview, headers='keys', tablefmt='psql')
  print(table)

  
def get_triplets(drkg_file:str = 'drkg.tsv',verbose:bool=False) -> list:
  """ Read drkg.tsv file and return triplets """

  df        = pd.read_csv(drkg_file, sep="\t", header=None, engine="pyarrow")
  triplets  = df.values.tolist()

  if verbose: 

    print("\n Triplets:\n")
    print(triplets[0:10])

    print(f"\n {drkg_file}  Dataframe:\n")
    print_head(df)
  return triplets,df




def  read_tsv(relation_file:str,verbose:bool=False):
  """ Read glossary """ 
  df = pd.read_csv(relation_file, sep="\t",engine="pyarrow")

  if verbose:
    print(f"\n {relation_file}  Dataframe:\n")
    print_head(df)
  return df





########## Filter & Map Functions ###################
def filter_drkg(data_frame:pd.core.frame.DataFrame, filter_column:int,filter_term:str,verbose:bool=False) -> pd.Series:
  """
    Return the distinct values of one column that match a regular expression.

    Arguments:
      data_frame<pd.core.frame.DataFrame>: Dataframe to search
      filter_column<int>: column use to filter 
      filter_term<str>:   string (use for Regex) capturing  either the interaction to filter or head/tail  e.g.: r'.*?Compound:Disease'
      verbose<bool>:      If True, print the counts before/after filtering and the matches
 
    Outputs: 
      relations_filtered<pd.Series>: The unique values of `filter_column` that match `filter_term`
                                     (a Series of values, not a filtered copy of `data_frame`)
  """
  relations           = pd.Series(data_frame[filter_column].unique())
  # na=False: a missing value counts as "no match". Without it the mask carries NaN
  # and boolean indexing raises instead of filtering.
  relations_filtered  = relations[relations.str.contains(filter_term, regex=True, na=False)]

  if verbose: 
    # The counts below are numbers of distinct values, not dataframe rows.
    print(f"Number of Rows Before Filtering: {len(relations)}")
    print(f"Number of Rows After Filtering: {len(relations_filtered )}\n")
    print("\nFiltered:")
    print(relations_filtered ) 

  return relations_filtered 




def map_drkg_relationships(df_1,relation_glossary,verbose:bool=False):
  """ Attach the glossary columns to every relation code in `df_1`.

    Arguments:
      df_1:               Series of relation codes (converted to a one-column frame labelled 0)
      relation_glossary:  Dataframe with a 'Relation-name' column to join on
      verbose<bool>:      If True, print the 'Interaction-type' column of the result

    Outputs:
      mapped: Left join of the codes with the glossary; codes without a glossary entry keep NaN in the glossary columns
  """
  codes  = df_1.to_frame()
  mapped = codes.merge(relation_glossary, how='left', left_on=0, right_on='Relation-name')

  if verbose:
    print("\nRelationships Mapped:")
    print(mapped['Interaction-type'])

  return mapped



def filter_interaction_subset(df:pd.core.frame.DataFrame,filter_colunm_name:str,regex_string:str,return_colunm_name:str=None) -> pd.core.frame.DataFrame:
    """
    Keep the rows of `df` whose `filter_colunm_name` value matches a regular expression.

    Arguments:
      df<pd.core.frame.DataFrame>:      DataFrame to filter
      filter_colunm_name<str>:          Dataframe column whose text is searched
      regex_string<str>:                Regular expression used to filter, e.g. 'treat|inhibit|alleviate'
      return_colunm_name<str:optional>: Name of the column to return; if None the whole filtered dataframe is returned

    Outputs:
      subset: The matching rows as a dataframe, or a single column (pd.Series) of
              those rows when `return_colunm_name` is given
    """

    # na=False: rows with a missing value in the filter column (e.g. relation codes that
    # found no glossary entry in the left merge of map_drkg_relationships) count as
    # "no match". Without it the mask carries NaN and boolean indexing raises.
    matches = df[filter_colunm_name].str.contains(regex_string, regex=True, na=False)
    subset  = df[matches]

    ### Return a specific column if user requests it ###
    if return_colunm_name is not None:
        subset = subset[return_colunm_name]

    return subset



def get_unique_values(df, colunm:int) :
  """ Return the sorted distinct values of `df[colunm]` that contain "::".

    Arguments:
      df:           Dataframe to inspect
      colunm<int>:  Label of the (string) column to scan

    Outputs:
      np.ndarray: Sorted unique entries containing "::"; missing values are ignored
  """
  values   = df[colunm]
  # regex=False: "::" is matched literally. na=False: missing entries count as "no match"
  # instead of putting NaN into the mask, which boolean indexing rejects.
  has_code = values.str.contains("::", regex=False, na=False)
  return np.unique(values[has_code].to_numpy())




# 1) Load Data

In [None]:
### 1) Read DRKG and the glossary used to translate relation codes into words
verbose           = True
triplets, drkg_df = get_triplets(verbose=verbose)                        # triplets are (head, relationship, tail)
relation_glossary = read_tsv('relation_glossary.tsv', verbose=verbose)   # relationship mapping


### 2) Filter & map interactions, e.g. DRUGBANK::treats::Compound:Disease
# 2.1: keep only the Compound-Disease interactions
# 2.2: map the codes -> text, e.g. Hetionet::CpD::Compound:Disease -> palliation
#      (the text is used for the next filtering step)
# 2.3: filter on natural-language terms such as "treat", but return the original interaction name

# 2.1 Compound-Disease interactions only
drkg_rx_dx_relations = filter_drkg(
    data_frame    = drkg_df,
    filter_column = 1,
    filter_term   = r'.*?Compound:Disease',
    verbose       = verbose,
)

# 2.2 Codes -> text
drkg_rx_dx_relations_mapped = map_drkg_relationships(
    drkg_rx_dx_relations,
    relation_glossary,
    verbose=verbose,
)

# 2.3 Keep only treat / inhibit / alleviate interaction types
drkg_rx_dx_relation_subset = filter_interaction_subset(
    df                 = drkg_rx_dx_relations_mapped,
    filter_colunm_name = 'Interaction-type',
    regex_string       = 'treat|inhibit|alleviate',
    return_colunm_name = 'Relation-name',
)

# 3) Use the filtered interactions to filter DRKG itself
# 3.1 Rows of DRKG whose relation is in the selected Compound-Disease subset
drkg_df_filtered = drkg_df[drkg_df[1].isin(drkg_rx_dx_relation_subset)]
print_head(df=drkg_df_filtered)
# 3.2 Filtered DRKG as a list of triplets
rx_dx_triplets   = drkg_df_filtered.values.tolist()



# 4) Load the dataframes used for translation
hetionet_df = read_tsv('hetionet-v1.0-nodes.tsv', verbose=verbose)   # Hetionet node table
mesh_df     = read_tsv('terms.tsv', verbose=verbose)                 # MeSH (Medical Subject Headings) terms (disease and drug codes)


 Triplets:

[['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2157'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5264'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2158'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::3309'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::28912'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::811'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::2159'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::821'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5627'], ['Gene::2157', 'bioarx::HumGenHumGen:Gene:Gene', 'Gene::5624']]

 drkg.tsv  Dataframe:

+----+------------+--------------------------------+-------------+
|    | 0          | 1                              | 2           |
|----+------------+--------------------------------+-------------|
|  0 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene | Gene::2157  |
|  1 | Gene::2157 | bioarx::HumGenHumGen:Gene:Gene | Ge

In [None]:
# Lookup tables: relation code -> interaction type, node id -> node name
relation_glossary_relation_dict = relation_glossary.set_index('Relation-name')['Interaction-type'].to_dict()

mesh_nodes = mesh_df.rename(columns={"mesh_name": "name", "mesh_id": "id"})
node_df    = pd.concat([hetionet_df[['name', 'id']], mesh_nodes], axis=0, ignore_index=True)
node_dict  = node_df.set_index('id')['name'].to_dict()

# in hetionet data - only missing C562840 for Breast Cancer, Familial and C562839 for Malignant Mesothelioma
# hetionet_drkg3[hetionet_drkg3['id'].isnull() & hetionet_drkg3['mesh_id'].isnull()].drop_duplicates(subset=['id', 'MeSH'])

df_med = drkg_df.copy()

# Head (0) and tail (2): drop everything up to and including "MESH:", then swap ids
# for names; ids without a dictionary entry are kept unchanged.
for node_col in (0, 2):
    stripped         = df_med[node_col].str.replace(r'.*?MESH:', "", regex=True)
    df_med[node_col] = stripped.map(node_dict).fillna(stripped)

# Relation (1): code -> interaction type, falling back to the code itself
df_med[1] = df_med[1].map(relation_glossary_relation_dict).fillna(df_med[1])

# @@@ Alejandro - Word mapping of some DRKG nodes/relations. See below for "rx_dx_triplets_med" which is the subset of drug-treats-disease relations
df_med


Unnamed: 0,0,1,2
0,F8,interaction,F8
1,F8,interaction,PHYH
2,F8,interaction,F9
3,F8,interaction,HSPA5
4,F8,interaction,Gene::28912
...,...,...,...
5874256,COMMD9,reaction,DDB2
5874257,PPIL1,reaction,HNRNPC
5874258,CBFB,catalysis,CDK1
5874259,CES1,binding,UGT2B10


# 2) Load Pre-trained Model

In [None]:
def load_drugd(filename:str,colunm_names:list =['drug','ids'],verbose:bool=False)->list:
  """ Read a headerless, tab-separated file and return its drug column as a list.

     Arguments:
     filename<str>:                    File to read
     colunm_names<list[str]: optional> Names given to the file's columns, in file order.
                                       The column named 'drug' is returned; if no column
                                       has that name, the first column is returned.
     verbose<bool: optional>           If True, print the number of entries and the first five

     Output:
      drug_list<list>: one entry per row of the file, e.g. 'Compound::DB00605'

     Raises:
      ValueError: if `colunm_names` is empty
  """
  if not colunm_names:
    raise ValueError("colunm_names must contain at least one column name")

  # The default list is only read here, never mutated, so sharing it between calls is safe.
  drug_column = 'drug' if 'drug' in colunm_names else colunm_names[0]

  with open(filename, newline='', encoding='utf-8') as csvfile:
    # Honour the caller's column names (they used to be ignored in favour of a hard-coded pair).
    reader    = csv.DictReader(csvfile, delimiter='\t', fieldnames=list(colunm_names))
    drug_list = [row_val[drug_column] for row_val in reader]

  if verbose:
    print(f"Load file {filename}  with  {len(drug_list)}\n")
    print(f"Example, first 5 entries:\n {drug_list[0:5]}\n")

  return drug_list





### 1) Read the file of drugs
# The path is relative to a checkout of the gnn4dr/DRKG repository in the working directory.
drug_file_name = os.path.join("DRKG", "drug_repurpose", "infer_drug.tsv")
drug_list      = load_drugd(drug_file_name, verbose=True)

Load file DRKG/drug_repurpose/infer_drug.tsv  with  8104

Example, first 5 entries:
 ['Compound::DB00605', 'Compound::DB00983', 'Compound::DB01240', 'Compound::DB11755', 'Compound::DB12184']



In [None]:
entity_idmap_file   = 'embed/entities.tsv'
relation_idmap_file = 'embed/relations.tsv'


def _read_id_maps(path:str) -> tuple:
  """ Read a headerless two-column (name, id) .tsv file.

    Outputs:
      (name_to_id, id_to_name)<tuple[dict, dict]>: ids are converted to int
  """
  name_to_id = {}
  id_to_name = {}
  with open(path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t', fieldnames=['name','id'])
    for row_val in reader:
      numeric_id = int(row_val['id'])
      name_to_id[row_val['name']] = numeric_id
      id_to_name[numeric_id]      = row_val['name']
  return name_to_id, id_to_name


# Get drugname/disease name to entity ID mappings
entity_map, entity_id_map = _read_id_maps(entity_idmap_file)       # name -> id, id -> name
relation_map              = _read_id_maps(relation_idmap_file)[0]  # name -> id

# NOTE(review): this cell used to read `COV_disease_list` and `treatment`, which no cell of
# the notebook defines, so it stopped with a NameError. They are derived here from the
# treat/inhibit/alleviate Compound-Disease subset built in section 1 — confirm that this is
# the intended relation/disease set.
treatment    = drkg_rx_dx_relation_subset.tolist()
# Column 2 is the tail of each triplet — presumably the Disease side of Compound:Disease; verify.
disease_list = sorted(drkg_df_filtered[2].unique())

# handle the ID mapping (a name missing from the maps raises KeyError rather than being skipped)
drug_ids      = [entity_map[drug] for drug in drug_list]
disease_ids   = [entity_map[disease] for disease in disease_list]
treatment_rid = [relation_map[treat] for treat in treatment]

NameError: ignored

## Checks 

In [None]:
# Entries of the head (0) and tail (2) columns of df_med that still contain "::"
df0_test    = get_unique_values(df_med, colunm=0)
df2_test    = get_unique_values(df_med, colunm=2)
df_0_2_test = np.union1d(df0_test, df2_test)   # sorted union of both sets

for remaining in (df0_test, df2_test, df_0_2_test):
    print(len(remaining))
df_0_2_test


28143
27837
39454


array(['Atc::A', 'Atc::A01', 'Atc::A01A', ..., 'Tax::9940', 'Tax::99802',
       'Tax::9986'], dtype=object)

In [None]:
# Check if Word DRKG has all drug-treats-disease relationships in word form 
# `df` and `relation_subset` are not defined by any cell of this notebook; the frames
# built above are `drkg_df` and `drkg_rx_dx_relation_subset`.
print(drkg_df.shape[0])
print(df_med.shape[0])

# The mask is computed on the original relation codes (column 1 of df_med was replaced by
# words), so both frames must still share the same row index.
assert df_med.index.equals(drkg_df.index), "df_med is no longer row-aligned with drkg_df"

rx_dx_df_med       = df_med[drkg_df[1].isin(drkg_rx_dx_relation_subset)]
rx_dx_triplets_med = rx_dx_df_med.values.tolist()
rx_dx_triplets_med[:10]   # preview only; the full list is too long to display

5874261
5874261


In [None]:

# Same "::" check, restricted to the drug-disease subset
rx0_test = get_unique_values(rx_dx_df_med, colunm=0)
rx2_test = get_unique_values(rx_dx_df_med, colunm=2)
rx_remaining_terms = np.union1d(rx0_test, rx2_test)   # sorted union of head and tail entries

In [None]:
# Head/tail entries of the drug-disease subset that still contain "::" after the name mapping
rx_remaining_terms

array(['Compound::CHEBI:128458', 'Compound::CHEBI:12936',
       'Compound::CHEBI:136604', ..., 'Disease::OMIM:613290',
       'Disease::OMIM:613508', 'Disease::OMIM:613729'], dtype=object)