# OWLNETS-UMLS-GRAPH

## Adds OWLNETS output files content to existing UMLS-Graph-Extracts

### Setup

In [1]:
import sys
import pandas as pd 
import numpy as np
import base64

# Do not truncate long cell contents when DataFrames are displayed below.
pd.set_option('display.max_colwidth', None)

### Ingest OWLNETS output files

In [2]:
# Read the tab-separated OWLNETS node metadata file and preview its last rows.
node_metadata = pd.read_csv("OWLNETS_node_metadata.txt", sep='\t')
node_metadata.tail()

Unnamed: 0,node_id,node_namespace,node_label,node_synonyms,node_dbxrefs
20295,http://purl.obolibrary.org/obo/UBERON_0013140,UBERON,systemic vein,systemic venous tree organ part,umls:c0447117|fma:66644|ncit:c33719|sctid:244389004
20296,http://purl.obolibrary.org/obo/UBERON_4100013,UBERON,postcoracoid,,
20297,http://purl.obolibrary.org/obo/CL_0000980,CL,plasmablast,"cd20-negative b cell|cd27-positive, cd38-positive, cd20-negative b cell",fma:84371
20298,http://purl.obolibrary.org/obo/PR_000001879,PR,leukosialin,,
20299,http://purl.obolibrary.org/obo/UBERON_3000344,UBERON,orbitonasal foramen,,


In [3]:
# Read the tab-separated OWLNETS relations file and preview its last rows.
relations = pd.read_csv("OWLNETS_relations.txt", sep='\t')
relations.tail()

Unnamed: 0,relation_id,relation_namespace,relation_label
124,http://purl.obolibrary.org/obo/RO_0002373,RO,has muscle insertion
125,http://purl.obolibrary.org/obo/BSPO_0015101,BSPO,in dorsal side of
126,http://purl.obolibrary.org/obo/uberon/core#protects,UBERON,protects
127,http://purl.obolibrary.org/obo/RO_0002230,RO,ends with
128,http://purl.obolibrary.org/obo/RO_0001025,RO,located in


In [4]:
# Read the tab-separated OWLNETS edge list (subject, predicate, object) and preview its last rows.
edgelist = pd.read_csv("OWLNETS_edgelist.txt", sep='\t')
edgelist.tail()

Unnamed: 0,subject,predicate,object
62856,http://purl.obolibrary.org/obo/UBERON_0007182,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/UBERON_0003984
62857,http://purl.obolibrary.org/obo/UBERON_0016919,http://purl.obolibrary.org/obo/BFO_0000050,http://purl.obolibrary.org/obo/UBERON_0001835
62858,http://purl.obolibrary.org/obo/UBERON_0000963,http://www.w3.org/2000/01/rdf-schema#subClassOf,http://purl.obolibrary.org/obo/UBERON_0002536
62859,http://purl.obolibrary.org/obo/UBERON_0013656,http://purl.obolibrary.org/obo/RO_0001025,http://purl.obolibrary.org/obo/UBERON_0000167
62860,http://purl.obolibrary.org/obo/UBERON_0008860,http://www.w3.org/2000/01/rdf-schema#subClassOf,http://purl.obolibrary.org/obo/UBERON_0000325


### Put relation_label in edgelist, convert subClassOf to isa, convert spaces in labels to _, convert subject/object IRIs to CodeIDs, and add inverse_ edges

In [5]:
# Look up each edge's predicate in the relations table and keep only the
# subject / relation_label / object triple. The inner join drops edges whose
# predicate has no entry in `relations`.
edgelist = pd.merge(
    edgelist, relations, how='inner', left_on='predicate', right_on='relation_id'
).loc[:, ['subject', 'relation_label', 'object']]
del relations

# 'subClassOf' is renamed to 'isa'; spaces in labels become underscores.
edgelist['relation_label'] = (
    edgelist['relation_label']
    .replace('subClassOf', 'isa')
    .str.replace(' ', '_')
)

# Reduce each IRI to its final path segment, with '_' turned into ' '
# (e.g. '.../obo/UBERON_0007182' -> 'UBERON 0007182').
edgelist['subject'] = edgelist['subject'].str.replace('_', ' ').str.rsplit('/', n=1).str[-1]
edgelist['object'] = edgelist['object'].str.replace('_', ' ').str.rsplit('/', n=1).str[-1]

def codeReplacements(x):
    """Rewrite source-vocabulary abbreviations in a code string.

    Applies these substring replacements, in order:
    'NCIT' -> 'NCI', 'MESH' -> 'MSH', 'GO ' -> 'GO GO:', 'SNOMED' -> 'SNOMEDCT_US'.

    Parameters
    ----------
    x : object
        A code string such as 'NCIT C33719'. Other non-missing values are
        converted with ``str`` first.

    Returns
    -------
    str, or the input itself when it is missing
        ``None`` and float NaN are returned unchanged so that a missing value
        stays missing instead of becoming the literal text 'None' / 'nan'.
    """
    # NaN is the only float that is not equal to itself.
    if x is None or (isinstance(x, float) and x != x):
        return x
    # NOTE(review): these are plain substring replacements, so they also match
    # inside longer tokens and are not idempotent (e.g. an input that already
    # contains 'SNOMEDCT_US' is expanded again) - confirm inputs never do.
    return (
        str(x)
        .replace('NCIT', 'NCI')
        .replace('MESH', 'MSH')
        .replace('GO ', 'GO GO:')
        .replace('SNOMED', 'SNOMEDCT_US')
    )

# Rewrite the vocabulary abbreviations on both ends of every edge.
edgelist['subject'] = edgelist['subject'].map(codeReplacements)
edgelist['object'] = edgelist['object'].map(codeReplacements)

edgelist.tail()

Unnamed: 0,subject,relation_label,object
62856,UBERON 0035636,immediately_posterior_to,UBERON 0013771
62857,UBERON 0008259,aboral_to,UBERON 0008258
62858,UBERON 0000104,ends_with,UBERON 0000071
62859,UBERON 0001616,directly_develops_from,UBERON 0003118
62860,UBERON 0035635,immediately_anterior_to,UBERON 0013771


In [6]:
# Build the reverse of every edge: swap subject and object and prefix the
# relation label with 'inverse_'.
inverse_edgelist = edgelist.rename(columns={'subject': 'object', 'object': 'subject'})
inverse_edgelist['relation_label'] = 'inverse_' + inverse_edgelist['relation_label']
inverse_edgelist = inverse_edgelist[['subject', 'relation_label', 'object']]

# Append the inverse edges below the forward edges and renumber the rows.
edgelist = pd.concat([edgelist, inverse_edgelist], axis=0, ignore_index=True)
del inverse_edgelist

edgelist.tail()

Unnamed: 0,subject,relation_label,object
125717,UBERON 0013771,inverse_immediately_posterior_to,UBERON 0035636
125718,UBERON 0008258,inverse_aboral_to,UBERON 0008259
125719,UBERON 0000071,inverse_ends_with,UBERON 0000104
125720,UBERON 0003118,inverse_directly_develops_from,UBERON 0001616
125721,UBERON 0013771,inverse_immediately_anterior_to,UBERON 0035635


### Clean up node_metadata

In [7]:
# Replacements
# Values equal to the literal string 'None' are treated as missing.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
node_metadata.loc[(node_metadata.node_synonyms == 'None'),'node_synonyms'] = np.nan
node_metadata['node_dbxrefs'] = node_metadata['node_dbxrefs'].str.upper().str.replace(':', ' ')
# na_action='ignore' leaves missing dbxrefs missing; without it codeReplacements
# would receive NaN and could turn it into a string that is later split into a
# bogus one-item dbxref list.
node_metadata['node_dbxrefs'] = node_metadata['node_dbxrefs'].map(codeReplacements, na_action='ignore')
# 'None' was upper-cased above, so the missing marker is now 'NONE'.
node_metadata.loc[(node_metadata.node_dbxrefs == 'NONE'),'node_dbxrefs'] = np.nan

# CodeID: last path segment of the IRI with '_' replaced by ' ', then abbreviation fixes.
node_metadata['node_id'] = node_metadata['node_id'].str.replace('_', ' ').str.split('/').str[-1]
node_metadata['node_id'] = node_metadata['node_id'].apply(codeReplacements)

# Turn the '|'-delimited strings into Python lists (missing values stay missing).
node_metadata['node_synonyms'] = node_metadata['node_synonyms'].str.split('|')
node_metadata['node_dbxrefs'] = node_metadata['node_dbxrefs'].str.split('|')

# Add SAB and CODE columns: first and last space-separated token of node_id.
node_metadata['SAB'] = node_metadata['node_id'].str.split(' ').str[0]
node_metadata['CODE'] = node_metadata['node_id'].str.split(' ').str[-1]
del node_metadata['node_namespace']

node_metadata.tail()

Unnamed: 0,node_id,node_label,node_synonyms,node_dbxrefs,SAB,CODE
20295,UBERON 0013140,systemic vein,[systemic venous tree organ part],"[UMLS C0447117, FMA 66644, NCI C33719, SCTID 244389004]",UBERON,13140
20296,UBERON 4100013,postcoracoid,,,UBERON,4100013
20297,CL 0000980,plasmablast,"[cd20-negative b cell, cd27-positive, cd38-positive, cd20-negative b cell]",[FMA 84371],CL,980
20298,PR 000001879,leukosialin,,,PR,1879
20299,UBERON 3000344,orbitonasal foramen,,,UBERON,3000344


### Get the UMLS CUIs for each node_id as nodeCUIs

In [8]:
# One row per (node_id, dbxref) pair; the text after the last space of a
# dbxref is taken as its code.
explode_dbxrefs = node_metadata.explode('node_dbxrefs')[['node_id','node_dbxrefs']]
explode_dbxrefs['nodeXrefCodes'] = explode_dbxrefs['node_dbxrefs'].str.rsplit(' ', n=1).str[-1]

# Collect, per node, the codes of the dbxrefs that contain 'UMLS C'.
# Rows with a missing dbxref are excluded from the selection.
explode_dbxrefs_UMLS = (
    explode_dbxrefs.loc[explode_dbxrefs['node_dbxrefs'].str.contains('UMLS C', na=False)]
    .groupby('node_id')['nodeXrefCodes']
    .apply(list)
    .reset_index(name='nodeCUIs')
)
node_metadata = pd.merge(node_metadata, explode_dbxrefs_UMLS, how='left', on='node_id')
del explode_dbxrefs_UMLS

# explode_dbxrefs itself is kept (minus the helper column) because a later
# cell merges it with CUIcodes.
explode_dbxrefs = explode_dbxrefs.drop(columns='nodeXrefCodes')

### Get the UMLS CUIs for each node_id from CUI-CODEs file as CUIcodes

In [9]:
# Load the CUI-CODEs extract; the cells below use its ':START_ID' and ':END_ID' columns.
CUIcodes = pd.read_csv("CUI-CODEs.csv")

In [10]:
# For every ':END_ID', gather its ':START_ID' values into a sorted list.
CODE_CUIs = (
    CUIcodes
    .sort_values([':END_ID', ':START_ID'])
    .groupby(':END_ID')[':START_ID']
    .apply(list)
    .reset_index(name='CUIcodes')
)

# Attach those lists to the nodes whose node_id equals the ':END_ID';
# the join key from the right-hand side is dropped afterwards.
node_metadata = pd.merge(
    node_metadata, CODE_CUIs, how='left', left_on='node_id', right_on=':END_ID'
).drop(columns=':END_ID')
del CODE_CUIs

### Add column for Xref's CUIs - merge explode_dbxrefs with CUIcodes, then group by node_id and merge with node_metadata

In [11]:
def setfunction(x):
    """Return the distinct items of ``x`` as a set (iteration order is unspecified).

    No longer used by this cell; kept so any other code that calls it keeps working.
    """
    return set(x)

# Match every dbxref of a node against ':END_ID' and collect the matching
# ':START_ID' values per node, sorted.
node_xref_cui = explode_dbxrefs.merge(CUIcodes, how='inner', left_on='node_dbxrefs', right_on=':END_ID')
node_xref_cui = node_xref_cui.sort_values(['node_id',':START_ID']).groupby('node_id')[':START_ID'].apply(list).reset_index(name='XrefCUIs')

# Remove repeated CUIs while keeping the sorted order. Going through set()
# would discard that order, and because the first CUI in the combined list is
# later chosen as the preferred one, the result could differ between runs.
node_xref_cui['XrefCUIs'] = node_xref_cui['XrefCUIs'].apply(lambda cuis: list(dict.fromkeys(cuis)))
node_metadata = node_metadata.merge(node_xref_cui, how='left', on='node_id')

### Add column for base64 CUIs (the base64-encoded node_id, used when no UMLS CUI is found)

In [12]:
def base64it(x):
    """Return a one-element list holding the URL-safe base64 text of ``str(x)``.

    The value is converted with ``str``, encoded as UTF-8, base64-encoded with
    the URL-safe alphabet, and the result is decoded to an ASCII string.
    """
    raw_bytes = str(x).encode('UTF-8')
    encoded = base64.urlsafe_b64encode(raw_bytes)
    return [encoded.decode('ascii')]
# Encode every node_id with base64it; each cell holds a one-element list,
# the same list shape as the other CUI columns.
node_metadata['base64cui'] = node_metadata['node_id'].apply(base64it)
node_metadata

Unnamed: 0,node_id,node_label,node_synonyms,node_dbxrefs,SAB,CODE,nodeCUIs,CUIcodes,XrefCUIs,base64cui
0,GO GO:0018958,phenol-containing compound metabolic process,,,GO,GO:0018958,,[C1156494],,[R08gR086MDAxODk1OA==]
1,GO GO:0060096,"serotonin secretion, neurotransmission",,,GO,GO:0060096,,"[C2262837, C3268712]",,[R08gR086MDA2MDA5Ng==]
2,UBERON 0001110,thyrohyoid muscle,"[thyreohyoideus muscle, musculus thyrohyoideus, thyreohyoideus, thyrohyoideus, thyrohyoid]","[FMA 13344, MA 0002393, EMAPA 19273, NCI C53178, UMLS C0224167, SCTID 244832006, WIKIPEDIA THYROHYOID_MUSCLE]",UBERON,0001110,[C0224167],,[C0224167],[VUJFUk9OIDAwMDExMTA=]
3,UBERON 0007837,thoracic spinal cord ventral commissure,[thoracic spinal cord anterior commissure],[BIRNLEX 1556],UBERON,0007837,,,,[VUJFUk9OIDAwMDc4Mzc=]
4,UBERON 0002441,cervicothoracic ganglion,"[stellate ganglion, ganglion stellatum, cervicothoracic sympathetic ganglion, ganglion cervicothoracicum]","[BAMS GSTL, FMA 6469, GAID 711, WIKIPEDIA CERVICOTHORACIC_GANGLION, SCTID 181101006, MA 0001157, BTO 0001815, EMAPA 18226, MSH D013233]",UBERON,0002441,,,[C0038246],[VUJFUk9OIDAwMDI0NDE=]
...,...,...,...,...,...,...,...,...,...,...
20295,UBERON 0013140,systemic vein,[systemic venous tree organ part],"[UMLS C0447117, FMA 66644, NCI C33719, SCTID 244389004]",UBERON,0013140,[C0447117],,[C0447117],[VUJFUk9OIDAwMTMxNDA=]
20296,UBERON 4100013,postcoracoid,,,UBERON,4100013,,,,[VUJFUk9OIDQxMDAwMTM=]
20297,CL 0000980,plasmablast,"[cd20-negative b cell, cd27-positive, cd38-positive, cd20-negative b cell]",[FMA 84371],CL,0000980,,,[C0229657],[Q0wgMDAwMDk4MA==]
20298,PR 000001879,leukosialin,,,PR,000001879,,,,[UFIgMDAwMDAxODc5]


### Add cuis list and preferred cui to complete the node "atoms" (code, label, syns, xrefs, cuis, CUI)

In [13]:
def _merge_cui_sources(sources):
    """Combine one node's CUI source lists into a single ordered list without repeats.

    Parameters
    ----------
    sources : iterable
        The node's candidate CUI lists in priority order. Entries that are not
        lists (the NaN a left merge leaves when nothing matched) are skipped.

    Returns
    -------
    list
        Every CUI from the given lists, in first-seen order, each appearing once.
    """
    merged = []
    for source in sources:
        if not isinstance(source, list):
            continue
        for cui in source:
            # A set is not used here because the order of the items matters:
            # the first one becomes the preferred CUI.
            if cui not in merged:
                merged.append(cui)
    return merged

# Priority order of the CUI sources; base64cui comes last, so it is only
# preferred when none of the other columns has a value.
cui_source_columns = ['nodeCUIs', 'CUIcodes', 'XrefCUIs', 'base64cui']

# The columns are built explicitly instead of assigning to the rows yielded by
# iterrows(): those rows can be copies, in which case the writes never reach
# the DataFrame.
node_metadata['cuis'] = pd.Series(
    [
        _merge_cui_sources(row_sources)
        for row_sources in zip(*(node_metadata[column] for column in cui_source_columns))
    ],
    index=node_metadata.index,
    dtype=object,
)

# Preferred CUI = first entry of cuis. The list is never empty because
# base64cui always contributes one element.
node_metadata['CUI'] = node_metadata['cuis'].str[0]

node_metadata

Unnamed: 0,node_id,node_label,node_synonyms,node_dbxrefs,SAB,CODE,nodeCUIs,CUIcodes,XrefCUIs,base64cui,cuis,CUI
0,GO GO:0018958,phenol-containing compound metabolic process,,,GO,GO:0018958,,[C1156494],,[R08gR086MDAxODk1OA==],"[C1156494, R08gR086MDAxODk1OA==]",C1156494
1,GO GO:0060096,"serotonin secretion, neurotransmission",,,GO,GO:0060096,,"[C2262837, C3268712]",,[R08gR086MDA2MDA5Ng==],"[C2262837, C3268712, R08gR086MDA2MDA5Ng==]",C2262837
2,UBERON 0001110,thyrohyoid muscle,"[thyreohyoideus muscle, musculus thyrohyoideus, thyreohyoideus, thyrohyoideus, thyrohyoid]","[FMA 13344, MA 0002393, EMAPA 19273, NCI C53178, UMLS C0224167, SCTID 244832006, WIKIPEDIA THYROHYOID_MUSCLE]",UBERON,0001110,[C0224167],,[C0224167],[VUJFUk9OIDAwMDExMTA=],"[C0224167, VUJFUk9OIDAwMDExMTA=]",C0224167
3,UBERON 0007837,thoracic spinal cord ventral commissure,[thoracic spinal cord anterior commissure],[BIRNLEX 1556],UBERON,0007837,,,,[VUJFUk9OIDAwMDc4Mzc=],[VUJFUk9OIDAwMDc4Mzc=],VUJFUk9OIDAwMDc4Mzc=
4,UBERON 0002441,cervicothoracic ganglion,"[stellate ganglion, ganglion stellatum, cervicothoracic sympathetic ganglion, ganglion cervicothoracicum]","[BAMS GSTL, FMA 6469, GAID 711, WIKIPEDIA CERVICOTHORACIC_GANGLION, SCTID 181101006, MA 0001157, BTO 0001815, EMAPA 18226, MSH D013233]",UBERON,0002441,,,[C0038246],[VUJFUk9OIDAwMDI0NDE=],"[C0038246, VUJFUk9OIDAwMDI0NDE=]",C0038246
...,...,...,...,...,...,...,...,...,...,...,...,...
20295,UBERON 0013140,systemic vein,[systemic venous tree organ part],"[UMLS C0447117, FMA 66644, NCI C33719, SCTID 244389004]",UBERON,0013140,[C0447117],,[C0447117],[VUJFUk9OIDAwMTMxNDA=],"[C0447117, VUJFUk9OIDAwMTMxNDA=]",C0447117
20296,UBERON 4100013,postcoracoid,,,UBERON,4100013,,,,[VUJFUk9OIDQxMDAwMTM=],[VUJFUk9OIDQxMDAwMTM=],VUJFUk9OIDQxMDAwMTM=
20297,CL 0000980,plasmablast,"[cd20-negative b cell, cd27-positive, cd38-positive, cd20-negative b cell]",[FMA 84371],CL,0000980,,,[C0229657],[Q0wgMDAwMDk4MA==],"[C0229657, Q0wgMDAwMDk4MA==]",C0229657
20298,PR 000001879,leukosialin,,,PR,000001879,,,,[UFIgMDAwMDAxODc5],[UFIgMDAwMDAxODc5],UFIgMDAwMDAxODc5


### Join CUI from node_metadata to each of edgelist subject and object

### Outer join when appropriate to original csvs and then add data for each csv