# Standard Ontologies
* SNOMED
* RADLEX
* ICD-10
* MESH

For the above ontologies, produce pandas DFs or json files to be read into a knowledge base for entity linking

These dictionaries should have a 
* cui (concept unique id)
* a string for that concept
* whether it is the preferred name
* tui (type id)
* ontology to which is belongs
* type name
* link to other entity, is_a relationships or part_of relationships if present

As these ontologies are very long, some pruning may be necessary. 

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import xml.etree.ElementTree as et
import re
from tqdm import tqdm
import json

In [2]:
import spacy

In [3]:
import neuroNLP

In [25]:
from neuroNLP.ontology import Ontology

# SNOMED dictionary
extract information from the SNOMED dictionary such that
- structured format
- access ids for concepts
- access relationships in a tree-like manner
- eventually use within a knowledge base

this section is taken from the medcat snomed analysis  Exploring a SNOMED-CT uk extension Release

## importing & joining SNOMED dictionaries
extract and concantenate the concepts, descriptions, and relations form both UK and international editions. only pull out active entries.
- read all fields as strings rather than integers

In [6]:
UK_SNOMED_PATH = "/home/hwatkins/Desktop/external_datasets/SNOMED/SnomedCT_UKClinicalRF2/Snapshot/Terminology/"
UK_descr = UK_SNOMED_PATH + "sct2_Description_Snapshot-en_GB1000000_20191001.txt"
UK_rel = UK_SNOMED_PATH + "sct2_Relationship_Snapshot_GB1000000_20191001.txt"
UK_conc = UK_SNOMED_PATH + "sct2_Concept_Snapshot_GB1000000_20191001.txt"

In [7]:
INT_SNOMED_PATH = "/home/hwatkins/Desktop/external_datasets/SNOMED/SnomedCT_InternationalRF2/Snapshot/Terminology/"
INT_descr = INT_SNOMED_PATH + "sct2_Description_Snapshot-en_INT_20180731.txt"
INT_rel = INT_SNOMED_PATH + "sct2_Relationship_Snapshot_INT_20180731.txt"
INT_conc = INT_SNOMED_PATH + "sct2_Concept_Snapshot_INT_20180731.txt"

In [8]:
uk_descr = pd.read_csv(UK_descr, sep='\t', dtype=object)
uk_rel = pd.read_csv(UK_rel, sep='\t', dtype=object)
uk_conc = pd.read_csv(UK_conc, sep='\t', dtype=object)

In [9]:
int_descr = pd.read_csv(INT_descr, sep='\t', dtype=object)
int_rel = pd.read_csv(INT_rel, sep='\t', dtype=object)
int_conc = pd.read_csv(INT_conc, sep='\t', dtype=object)

In [10]:
descs = pd.concat([int_descr, uk_descr])
active_descs = descs[descs.active == '1']

In [11]:
terms = pd.concat([int_conc, uk_conc])
active_terms = terms[terms.active == '1']

In [12]:
rels = pd.concat([int_rel, uk_rel])
active_relat = rels[rels.active == '1']

merge the terms and their fully specified name together (900000000000003001 = fully specified name)

In [13]:
terms_with_desc = pd.merge(active_terms, active_descs[active_descs['typeId'] == '900000000000003001'], left_on=['id'], right_on=['conceptId'], how='inner')

add type name from the fully specified name

In [14]:
terms_with_desc['type_name'] = terms_with_desc['term'].str.extract(r"\((\w+\s?.?\s?\w+.?\w+.?\w+.?)\)$")

In [15]:
merge_dfs = pd.merge(active_terms, active_descs, left_on=['id'], right_on=['conceptId'], how='inner')
active_with_primary_desc = merge_dfs[merge_dfs['typeId'] == '900000000000003001']
active_with_primary_desc = active_with_primary_desc.drop_duplicates(['id_x'], keep='first')
active_with_synonym_desc = merge_dfs[merge_dfs['typeId'] == '900000000000013009']
active_with_all_desc = pd.concat([active_with_primary_desc, active_with_synonym_desc])

In [16]:
snomed_cdb_df = pd.merge(active_with_all_desc, terms_with_desc, left_on=['id_x'], right_on=['conceptId'], how='inner')

clean up dataframe

In [17]:
snomed_cdb_df = snomed_cdb_df.loc[:, ['id_x_x','term_x','typeId_x','type_name']]
snomed_cdb_df.columns = ['cui', 'concept_name', 'is_preferred_name', 'type_name']
snomed_cdb_df['ontology'] = 'SNOMED-CT'
snomed_cdb_df['is_preferred_name'] = snomed_cdb_df['is_preferred_name'].replace(['900000000000003001', '900000000000013009'], [1, 0])

In [18]:
snomed_cdb_df.head()

Unnamed: 0,cui,concept_name,is_preferred_name,type_name,ontology
0,101009,Quilonia ethiopica (organism),1,organism,SNOMED-CT
1,101009,Quilonia ethiopica,0,organism,SNOMED-CT
2,102002,Hemoglobin Okaloosa (substance),1,substance,SNOMED-CT
3,102002,Hemoglobin Okaloosa,0,substance,SNOMED-CT
4,102002,"Hb 48(CD7), Leu-arg",0,substance,SNOMED-CT


assign type ids

In [19]:
terms_dict = {
    "00000":"SNOMED RT+CTV3",
    "01000":"body structure",
    "01100":"morphologic abnormality",
    "01200":"cell structure",
    "01210":"cell",
    "02000":"finding",
    "02100":"disorder",
    "03000":"environment / location",
    "03100":"environment",
    "03200":"geographic location",
    "04000":"event",
    "05000":"observable entity",
    "06000":"organism",
    "07000":"product",
    "07100":"medicinal product",
    "07110":"medicinal product form",
    "07111":"clinical drug",
    "08000":"physical force",
    "09000":"physical object",
    "10000":"procedure",
    "10100":"regime/therapy",
    "11000":"qualifier value",
    "11100":"administration method",
    "11200":"disposition",
    "11300":"intended site",
    "11010":"number",
    "11400":"release characteristic",
    "11500":"transformation",
    "11020":"basic dose form",
    "11030":"dose form",
    "11600":"role",
    "11700":"state of matter",
    "11040":"unit of presentation",
    "12000":"record artifact",
    "13000":"situation",
    "14000":"metadata",
    "14100":"core metadata concept",
    "14200":"foundation metadata concept",
    "14300":"linkage concept",
    "14310":"attribute",
    "14320":"link assertion",
    "14400":"namespace concept",
    "14500":"OWL metadata concept",
    "15000":"social concept",
    "15100":"life style",
    "15010":"racial group",
    "15020":"ethnic group",
    "15200":"occupation",
    "15300":"person",
    "15400":"religion/philosophy",
    "16000":"special concept",
    "16100":"inactive concept",
    "16200":"navigational concept",
    "17000":"specimen",
    "18000":"staging scale",
    "18100":"assessment scale",
    "18200":"tumor staging",
    "19000":"substance",
}

In [20]:
# Add tui codes
dict2 = {v : k for k, v in terms_dict.items()}
snomed_cdb_df["tui"] = snomed_cdb_df["type_name"].map(dict2)

In [21]:
snomed_cdb_df.head()

Unnamed: 0,cui,concept_name,is_preferred_name,type_name,ontology,tui
0,101009,Quilonia ethiopica (organism),1,organism,SNOMED-CT,6000
1,101009,Quilonia ethiopica,0,organism,SNOMED-CT,6000
2,102002,Hemoglobin Okaloosa (substance),1,substance,SNOMED-CT,19000
3,102002,Hemoglobin Okaloosa,0,substance,SNOMED-CT,19000
4,102002,"Hb 48(CD7), Leu-arg",0,substance,SNOMED-CT,19000


is_a relationships in the SNOMED dictionary have the typeId 116680003

In [22]:
is_a_rels = active_relat[active_relat['typeId']=='116680003']

In [23]:
active_relat['typeId'].value_counts()

116680003    549227
363698007     89321
260686004     73279
116676008     64817
123005000     42393
              ...  
246112005        14
704326004        12
718497002         9
726633004         6
719715003         6
Name: typeId, Length: 89, dtype: int64

## Adding relations

In [24]:
for idx in active_relat['typeId'].unique():
    print(snomed_cdb_df[snomed_cdb_df['cui']== idx]['concept_name'].values)

['Is a (attribute)' 'Is a']
['Finding site (attribute)' 'Finding site']
['Part of (attribute)' 'Part of']
['Has intent (attribute)' 'Has intent' 'Intent']
['Method (attribute)' 'Method']
['Interprets (attribute)' 'Interprets']
['Causative agent (attribute)' 'Causative agent']
['Procedure site (attribute)' 'Procedure site']
['Associated morphology (attribute)' 'Associated morphology' 'Morphology']
['Laterality (attribute)' 'Laterality']
['Occurrence (attribute)' 'Occurrence']
['Direct device (attribute)' 'Direct device']
['Direct morphology (attribute)' 'Direct morphology']
['Access (attribute)' 'Access']
['Revision status (attribute)' 'Revision status']
['Priority (attribute)' 'Priority']
['Direct substance (attribute)' 'Direct substance']
['Has focus (attribute)' 'Has focus']
['Associated finding (attribute)' 'Associated finding']
['Component (attribute)' 'Component']
['Has interpretation (attribute)' 'Has interpretation']
['Has specimen (attribute)' 'Has specimen']
['Indirect morphol

for now, focus only on is-a relationships

In [25]:
# to get the is-a relationship (the parents of a concept)
is_a_rels[is_a_rels['sourceId']=='17']['destinationId'].values
# to get the is-a children of a concept
is_a_rels[is_a_rels['destinationId']=='17']['sourceId'].values

array([], dtype=object)

save these dataframes

In [26]:
snomed_cdb_df.to_csv('/home/hwatkins/Desktop/neuroNLP_assets/data/full_snomed_ct.csv')
is_a_rels.to_csv('/home/hwatkins/Desktop/neuroNLP_assets/data/full_snomed_ct_is_a_relations.csv')

## Filtering snomed for specific types

In [27]:
snomed_cdb_df['type_name'].unique()

array(['organism', 'substance', 'procedure', 'body structure', 'disorder',
       'occupation', 'finding', 'qualifier value',
       'morphologic abnormality', 'cell structure', 'physical object',
       'regime/therapy', 'product', 'medicinal product', 'cell', 'person',
       'ethnic group', 'environment', 'observable entity', 'event',
       'religion/philosophy', 'attribute', 'physical force', 'situation',
       'medicinal product form', 'navigational concept', 'clinical drug',
       'social concept', 'tumor staging', 'specimen', 'basic dose form',
       'life style', 'dose form', 'linkage concept', 'staging scale',
       'record artifact', 'assessment scale', 'SNOMED RT+CTV3',
       'geographic location', 'environment / location',
       'inactive concept', 'special concept', 'namespace concept',
       'racial group', 'link assertion', 'foundation metadata concept',
       'core metadata concept', 'disposition', 'unit of presentation',
       'OWL metadata concept', 'number'

types to keep:
- 'substance'
- 'body structure'
- 'finding'
- 'event'
- 'disorder'
- 'observable entity'
- 'morphologic abnormality'
- 'SNOMED RT+CTV3'

In [28]:
accepted_types = ['substance',
                  'body structure',
                  'finding','event',
                  'disorder',
                  'observable entity',
                  'morphologic abnormality',
                  'SNOMED RT+CTV3']

In [29]:
filtered_snomed_df = snomed_cdb_df[snomed_cdb_df['type_name'].isin(accepted_types)]

In [31]:
filtered_is_a_rels = is_a_rels[(is_a_rels['sourceId'].isin(filtered_snomed_df['cui'])) & (is_a_rels['destinationId'].isin(filtered_snomed_df['cui'])) ]

In [8]:
FILTERED_SNOMED_PATH = '/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/filtered_snomed_ct.csv'
FILTERED_SNOMED_RELS_PATH = '/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/filtered_snomed_ct_is_a_relations.csv'

In [32]:
filtered_snomed_df.to_csv(FILTERED_SNOMED_PATH)
filtered_is_a_rels.to_csv(FILTERED_SNOMED_RELS_PATH)

In [32]:
filtered_snomed_df.head()

Unnamed: 0,cui,concept_name,is_preferred_name,type_name,ontology,tui
2,102002,Hemoglobin Okaloosa (substance),1,substance,SNOMED-CT,19000
3,102002,Hemoglobin Okaloosa,0,substance,SNOMED-CT,19000
4,102002,"Hb 48(CD7), Leu-arg",0,substance,SNOMED-CT,19000
5,102002,Haemoglobin Okaloosa,0,substance,SNOMED-CT,19000
11,106004,Structure of posterior carpal region (body str...,1,body structure,SNOMED-CT,1000


In [33]:
filtered_is_a_rels.head()

Unnamed: 0,id,effectiveTime,active,moduleId,sourceId,destinationId,relationshipGroup,typeId,characteristicTypeId,modifierId
1,101021,20020131,1,900000000000207008,10000006,29857009,0,116680003,900000000000011006,900000000000451002
2,102025,20020131,1,900000000000207008,10000006,9972008,0,116680003,900000000000011006,900000000000451002
13,114022,20020131,1,900000000000207008,134035007,84371003,0,116680003,900000000000011006,900000000000451002
26,127021,20020131,1,900000000000207008,134136005,57250008,0,116680003,900000000000011006,900000000000451002
40,141028,20020131,1,900000000000207008,135161004,136248004,0,116680003,900000000000011006,900000000000451002


## Filtering SNOMED for neurology
- use the medcat-labelled reports to get codes that are specific to neurology, only include these and their ancestors
- also do not include substance, event or observable entity types

In [9]:
filtered_snomed_df = pd.read_csv(FILTERED_SNOMED_PATH, dtype=object, index_col="Unnamed: 0")
filtered_is_a_rels = pd.read_csv(FILTERED_SNOMED_RELS_PATH, dtype=object, index_col="Unnamed: 0")

In [10]:
MEDCAT_DATA_PATH = "/home/hwatkins/Desktop/neuroNLP_assets/data/medcat_snomed_annotated_reports_10k.json"

In [11]:
with open(MEDCAT_DATA_PATH, "r") as file:
    medcat_labelled_data = json.load(file)

In [12]:
medcat_labelled_data[0]

{'text': "T1-SE Sagittal None T2-TSE Transverse None FLAIR Coronal None DIFFUSION Transverse None Indication: Atypical migraine/TAC, new onset. Findings: Normal intracranial appearances. In particular, normal appearances of the pons, cisternal portions of the trigeminal nerves and Meckel's cave bilaterally. Dr Harpreet Hyare Locum Consultant Neuroradiologist 10 June 2009",
 'labels': [[6, 14, '30730003'],
  [27, 37, '62824007'],
  [49, 56, '81654009'],
  [62, 71, '46638006'],
  [72, 82, '62824007'],
  [100, 117, '56097005'],
  [123, 126, '7147002'],
  [127, 132, '370139004'],
  [134, 142, '163121000000106'],
  [144, 150, '17621005'],
  [151, 163, '303231004'],
  [164, 175, '255412001'],
  [192, 210, '386549008'],
  [218, 222, '49557009'],
  [250, 267, '27612005'],
  [281, 285, '285067000'],
  [317, 322, '224936003'],
  [323, 333, '768839008']]}

Get all the unique snomed codes in the labelled data, then find filter the "filtered" snomed further, including only these codes and their ancestors

In [13]:
all_codes = []
for report in tqdm(medcat_labelled_data):
    for label in report["labels"]:
        all_codes.append(label[2])
unique_codes = list(set(all_codes))

100%|██████████| 10000/10000 [00:00<00:00, 272723.99it/s]


In [14]:
len(unique_codes)

5267

In [15]:
kb = KnowledgeBase(FILTERED_SNOMED_PATH, FILTERED_SNOMED_RELS_PATH)

In [16]:
kb.is_in(unique_codes[1000])

True

In [18]:
def get_all_ancestors(cui):
    ancestors = []
    queue = [cui]
    while queue:
        current_cui = queue.pop(0)
        ancestors.append(current_cui)
        for parent in kb[current_cui].parents:
            queue.append(parent.cui)
    return ancestors

In [19]:
all_codes_with_ancestors = []
for code in tqdm(unique_codes):
    if kb.is_in(code):
        all_codes_with_ancestors.extend(get_all_ancestors(code))
unique_codes_with_ancestors = list(set(all_codes_with_ancestors))
    

100%|██████████| 5267/5267 [1:11:59<00:00,  1.22it/s]  


In [20]:
len(unique_codes_with_ancestors)

7310

In [22]:
neuro_snomed_df = filtered_snomed_df[filtered_snomed_df["cui"].isin(unique_codes_with_ancestors)]

In [23]:
len(neuro_snomed_df)

23306

In [24]:
neuro_is_a_rels = filtered_is_a_rels[(filtered_is_a_rels['sourceId'].isin(neuro_snomed_df['cui'])) & (filtered_is_a_rels['destinationId'].isin(neuro_snomed_df['cui'])) ]

In [25]:
len(neuro_is_a_rels)

11996

In [26]:
NEURO_SNOMED_PATH = '/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/neuro_snomed_ct.csv'
NEURO_SNOMED_RELS_PATH = '/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/neuro_snomed_ct_is_a_relations.csv'

In [27]:
neuro_snomed_df.to_csv(NEURO_SNOMED_PATH)
neuro_is_a_rels.to_csv(NEURO_SNOMED_RELS_PATH)

In [28]:
neurokb = KnowledgeBase(NEURO_SNOMED_PATH, NEURO_SNOMED_RELS_PATH)

In [36]:
neurokb.get_root().children[1].children[1].depth

1

# RADLEX dictionary

In [132]:
RADLEX_PATH = "/home/hwatkins/Desktop/external_datasets/RADLEX/RADLEX.csv"

In [133]:
radlex_df = pd.read_csv(RADLEX_PATH)

  interactivity=interactivity, compiler=compiler, result=result)


In [134]:
radlex_df.sample(5)

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,CUI,Semantic Types,Parents,http://data.bioontology.org/metadata/prefixIRI,http://radlex.org/RID/AAL,...,http://radlex.org/RID/Surrounded_by,http://radlex.org/RID/Surrounds,http://radlex.org/RID/Synonym,http://radlex.org/RID/Synonym_German,http://radlex.org/RID/Talairach,http://radlex.org/RID/Term_type,http://radlex.org/RID/Tributary_Of,http://radlex.org/RID/UMLS_ID,http://radlex.org/RID/UMLS_Term,http://radlex.org/RID/Unsanctioned_Term
4024,http://radlex.org/RID/RID95,esophagus,oesophagus|esoph|oesophagus,Organ with organ cavity which is continuous su...,False,,,http://radlex.org/RID/RID13444,RID95,,...,,,oesophagus|esoph,oesophagus,,,,C0014876,,
35893,http://radlex.org/RID/RID12488,RID12488,,,False,,,http://radlex.org/RID/RID15849,RID12488,,...,,,,,,,,,,
27858,http://radlex.org/RID/RID37012,multiform layer of left Brodmann area 12,,,False,,,http://radlex.org/RID/RID37010,RID37012,,...,,,,,,,,,,
17218,http://radlex.org/RID/RID22944,posterior ramus of sacral nerve,,,False,,,http://radlex.org/RID/RID7440,RID22944,,...,,,,,,,,,,
33474,http://radlex.org/RID/RID39061,external anal sphincter,äußerer Analschließmuskel,,False,,,http://radlex.org/RID/RID39060,RID39061,,...,,,,äußerer Analschließmuskel,,,,,,


## Convert radlex to own format

In [135]:
relevant_radlex_df = radlex_df[['Class ID','Preferred Label','Synonyms']]

In [136]:
relevant_radlex_rels = radlex_df[['Class ID','Parents']]

In [137]:
relevant_radlex_df['cui'] = relevant_radlex_df['Class ID'].str.replace('http://radlex.org/RID/','')
relevant_radlex_df = relevant_radlex_df.drop(columns=['Class ID'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [138]:
preferred_names = relevant_radlex_df[['cui','Preferred Label']]
aliases = relevant_radlex_df[['cui','Synonyms']].dropna()

In [139]:
aliases['Synonyms'] = aliases['Synonyms'].str.split('|')

In [140]:
split_aliases = aliases.explode('Synonyms').reset_index(drop=True)

In [141]:
split_aliases['is_preferred_name'] = 0
preferred_names['is_preferred_name'] = 1

In [142]:
preferred_names = preferred_names.rename(columns={'Preferred Label':'concept_name'})
split_aliases = split_aliases.rename(columns={'Synonyms':'concept_name'})

In [143]:
preferred_names.head()

Unnamed: 0,cui,concept_name,is_preferred_name
0,RID49617,dialysis fistula,1
1,RID9383,hiatus for greater petrosal nerve,1
2,RID19144,rootlet of left fourth lumbar nerve,1
3,RID27415,thoracic segment of right ventral gray column ...,1
4,RID44540,synovial membrane of bursa,1


In [144]:
new_radlex_df = pd.concat([preferred_names, split_aliases], ignore_index=True)
new_radlex_df['type_name'] = None
new_radlex_df['ontology'] = 'RADLEX'
new_radlex_df['tui'] = None


In [145]:
new_radlex_df.sort_values(by=['cui'])

Unnamed: 0,cui,concept_name,is_preferred_name,type_name,ontology,tui
22494,Non-RadLex_term,Non-RadLex term,1,,RADLEX,
22198,RID0,Radlex ontology entity,1,,RADLEX,
27134,RID1,RadLex entity,1,,RADLEX,
7821,RID10,image quality,1,,RADLEX,
12040,RID1000,lingular inferior segment artery,1,,RADLEX,
...,...,...,...,...,...,...
56423,RID9998,Eustachian amygdala,0,,RADLEX,
19359,RID9998,tubal tonsil,1,,RADLEX,
56422,RID9998,auditory tube lymph gland,0,,RADLEX,
10788,RID9999,pharyngeal recess,1,,RADLEX,


## Adding relations

In [146]:
soures = relevant_radlex_rels['Class ID'].str.replace('http://radlex.org/RID/','')
sinks = relevant_radlex_rels['Parents'].str.replace('http://radlex.org/RID/','')

In [147]:
radlex_is_a_rels = pd.DataFrame()
radlex_is_a_rels["sourceId"] = soures
radlex_is_a_rels["destinationId"] = sinks

In [148]:
rel_ids = pd.Series(radlex_is_a_rels.index + len(radlex_is_a_rels))

In [149]:
radlex_is_a_rels["id"] = rel_ids

## Adding types
USe the top-level layer in the hierarchy as the type name

In [150]:
new_radlex_df[new_radlex_df["cui"]=="RID5|RID3"]

Unnamed: 0,cui,concept_name,is_preferred_name,type_name,ontology,tui


In [158]:
radlex_is_a_rels[radlex_is_a_rels["destinationId"].str.len()>8]

Unnamed: 0,sourceId,destinationId,id


Remove the edge above RID1

In [152]:
radlex_is_a_rels = radlex_is_a_rels.drop([22198])

Replace an error parent

In [157]:
radlex_is_a_rels.loc[3272] = "RID50364", "RID5", "49908"

In [159]:
RADLEX_PATH = '/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/full_radlex.csv'
RADLEX_REL_PATH = '/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/full_radlex_is_a_rels.csv'

In [160]:
new_radlex_df.to_csv(RADLEX_PATH)
radlex_is_a_rels.to_csv(RADLEX_REL_PATH)

Find the top-level layer concept for each entity in the ontology

In [161]:
kb = KnowledgeBase(RADLEX_PATH, RADLEX_REL_PATH)

In [162]:
kb["RID50364"].parents

[imaging observation, cui: RID5]

In [163]:
def find_type(cui):
    concept = kb[cui]
    depth = concept.depth
    for i in range(depth-1):
        concept = concept.parents[0]
    return concept.name

In [166]:
type_names = pd.Series([find_type(cui) for cui in tqdm(new_radlex_df['cui'])])

100%|██████████| 70609/70609 [16:15<00:00, 72.37it/s]


In [167]:
new_radlex_df["type_name"] = type_names

In [196]:
tuis = dict(enumerate(new_radlex_df["type_name"].unique()))

In [198]:
tuis = {name: "T0"+str(i) for i, name in tuis.items()}

In [200]:
new_radlex_df["tui"] = pd.Series([tuis[name] for name in new_radlex_df["type_name"]])

In [207]:
new_radlex_df.to_csv(RADLEX_PATH)
radlex_is_a_rels.to_csv(RADLEX_REL_PATH)

# ICD-10 2 SNOMED 
and vice versa. Extract the SNOMED- ICD10 mapping from the release and put into usable csv format to read into pandas

In [210]:
ICD10MAP_PATH = "/home/hwatkins/Desktop/external_datasets/SNOMED/SnomedCT_UKClinicalRF2/Snapshot/Refset/Map/der2_iisssciRefset_ExtendedMapSnapshot_GB1000000_20191001.txt"

In [211]:
uk_snomed_icd10 = pd.read_csv(ICD10MAP_PATH, sep='\t', dtype=object)

In [217]:
uk_snomed_icd10.head()

Unnamed: 0,id,effectiveTime,active,moduleId,refsetId,referencedComponentId,mapGroup,mapPriority,mapRule,mapAdvice,mapTarget,correlationId,mapBlock
0,000004f9-03b1-5c00-80a2-935a3d38b07a,20181001,0,999000031000000106,999002741000000101,8920111000001106,1,1,,Not in scope of the classification,#NIS,447561005,1
1,000005a5-575c-5dae-b3ea-d3fd4d5e6558,20181001,0,999000031000000106,999002741000000101,15801211000001100,1,1,,Not in scope of the classification,#NIS,447561005,1
2,000008af-5a22-5492-916b-5a98568bbecb,20171001,0,999000031000000106,999002741000000101,180598008,1,1,,Additional code mandatory for Y464,Y464,447561005,1
3,00000e10-5bd0-5de1-8980-2b4429df14d5,20170401,0,999000031000000106,999002271000000101,215429004,1,7,,Additional code mandatory for V909 . Supplemen...,V909,447561005,1
4,00001322-3e86-5db0-8604-794b006b2e90,20181001,0,999000031000000106,999002271000000101,15707311000001104,1,1,,Not in scope of the classification,#NIS,447561005,1


In [238]:
relevant_columns = uk_snomed_icd10[["id", "referencedComponentId", "mapTarget"]]

In [239]:
snomed_icd10_map = relevant_columns.rename(columns = {"referenceComponentId": "snomed_id", "mapTarget":"icd10_id"})

In [240]:
active_snomed_icd10_map = snomed_icd10_map[snomed_icd10_map["icd10_id"]!="#NIS"]
active_snomed_icd10_map = active_snomed_icd10_map[active_snomed_icd10_map["icd10_id"]!="#HLT"]
active_snomed_icd10_map = active_snomed_icd10_map[active_snomed_icd10_map["icd10_id"]!="#NC"]

In [241]:
active_snomed_icd10_map["icd10_id"].value_counts()

Y981      5722
Y534      3419
Y971      3370
Y973      3369
Y972      2524
          ... 
K231 A       1
D619D        1
W059         1
W147         1
I608D        1
Name: icd10_id, Length: 35336, dtype: int64

In [244]:
active_snomed_icd10_map.to_csv("/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/snomed_icd10_map.csv")

# Mesh

In [5]:
MESH_PATH = "/home/hwatkins/Desktop/neuroOnto/source_data/MESH/MESH.csv"

In [8]:
mesh_df = pd.read_csv(MESH_PATH, dtype=object)

In [9]:
len(mesh_df)

347692

In [28]:
mesh_df.columns

Index(['Class ID', 'Preferred Label', 'Synonyms', 'Definitions', 'Obsolete',
       'CUI', 'Semantic Types', 'Parents', 'AN', 'AQL', 'CX', 'DC', 'DQ', 'DX',
       'EC', 'FX', 'Has mapping qualifier', 'HM', 'HN', 'II', 'Inverse of AQ',
       'Inverse of isa', 'Inverse of QB', 'Inverse of RB', 'Inverse of RO',
       'isa', 'LT', 'Machine permutation', 'Mapped from', 'Mapped to',
       'Mapping qualifier of', 'MDA', 'MeSH Frequency', 'MMR', 'MN', 'OL',
       'PA', 'PI', 'RR', 'SC', 'Scope Statement',
       'Semantic type UMLS property', 'SRC', 'TERMUI', 'TH'],
      dtype='object')

In [75]:
mesh_df.sample(10)

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,CUI,Semantic Types,Parents,AN,AQL,...,OL,PA,PI,RR,SC,Scope Statement,Semantic type UMLS property,SRC,TERMUI,TH
80056,http://purl.bioontology.org/ontology/MESH/D002193,Cantharidin,"Cantharides|Cantharidine|4,7-Epoxyisobenzofura...","A toxic compound, isolated from the Spanish fl...",False,C0006884|C0006886,http://purl.bioontology.org/ontology/STY/T204|...,http://purl.bioontology.org/ontology/MESH/D001572,,AA AD AE AN CH CL CS EC HI IP ME PD PK PO RE S...,...,use CANTHARIDIN to search CANTHARIDES 1966-78,D007509|D004791,,56-25-7 (Cantharidin),,,http://purl.bioontology.org/ontology/STY/T204|...,,T006269|T006271|T006270,FDA SRS (2014)|UNK (19XX)|NLM (1966)
305406,http://purl.bioontology.org/ontology/MESH/C053477,"adhesin, Rhizobium","rhicadhesin|Rap A protein, Rhizobium|Rhizobium...",,False,C0050834,http://purl.bioontology.org/ontology/STY/T123|...,,,,...,,,*BACTERIAL PROTEINS (87-95),,1.0,involved in attachment of rhizobial cells to p...,http://purl.bioontology.org/ontology/STY/T123|...,J Bacteriol 1987;169(9):4294,T181508|T462923|T181511|T181509|T181510,NLM (1987)|NLM (2001)
213968,http://purl.bioontology.org/ontology/MESH/C000...,Microbotryum heliospermae,,,False,C3557791,http://purl.bioontology.org/ontology/STY/T004,,,,...,,,,,4.0,,http://purl.bioontology.org/ontology/STY/T004,,T001050358,NLM (2021)
341291,http://purl.bioontology.org/ontology/MESH/C573122,"MIRN431 microRNA, human","hsa-mir-431|microRNA-431, human|miR-431, human",,False,C3493022,http://purl.bioontology.org/ontology/STY/T123|...,,,,...,,,,,1.0,RefSeq NR_029965,http://purl.bioontology.org/ontology/STY/T123|...,Int J Oncol. 2012 May;40(5):1470-6.,T825286|T825285|T825284|T825283,NLM (2012)
81577,http://purl.bioontology.org/ontology/MESH/C017129,"cholestane-3,7,24,25-tetrol","5 beta-cholestane-3 alpha,7 alpha,24,25-tetrol",,False,C0607366,http://purl.bioontology.org/ontology/STY/T109,,,,...,,,,,1.0,,http://purl.bioontology.org/ontology/STY/T109,JBC 253(13):4688;1978,T097626|T097625,NLM (1978)
191476,http://purl.bioontology.org/ontology/MESH/C064212,poly(phenylalanyl-glutamyl-alanyl-glycine),poly(Phe-Glu-Ala-Gly)|poly(Phe-G-A-Gly),,False,C0641208,http://purl.bioontology.org/ontology/STY/T116,,,,...,,,*Peptides (1990-2020),,1.0,,http://purl.bioontology.org/ontology/STY/T116,Exp Clin Immunogenet 1986;3(1):54,T206962|T206961|T206960,NLM (1990)
165143,http://purl.bioontology.org/ontology/MESH/C550903,"GxTX-1E, Plesiophrictus guangxiensis","guangxitoxin-1E, Plesiophrictus guangxiensis|G...",,False,C2935428,http://purl.bioontology.org/ontology/STY/T116|...,,,,...,,,,,1.0,36-amino acid peptide that interacts with volt...,http://purl.bioontology.org/ontology/STY/T116|...,Biochemistry. 2010 Jun 29;49(25):5134-42,T000876890|T000876891|T774676,NLM (2015)|NLM (2010)
308995,http://purl.bioontology.org/ontology/MESH/C000...,"long non-coding RNA PANDAR, human","PANDAR lncRNA, human",,False,C4308447,http://purl.bioontology.org/ontology/STY/T123|...,,,,...,,,,,1.0,,http://purl.bioontology.org/ontology/STY/T123|...,Cell Death Dis. 2015 Feb 26;6:e1665.,T000911873|T000911874,NLM (2017)
325892,http://purl.bioontology.org/ontology/MESH/C000...,Psilocybe mexicana,,,False,C0319736,http://purl.bioontology.org/ontology/STY/T004,,,,...,,,,,4.0,,http://purl.bioontology.org/ontology/STY/T004,,T001070793,NLM (2021)
259915,http://purl.bioontology.org/ontology/MESH/C102760,"YfiK protein, Bacillus subtilis",,,False,C1307880,http://purl.bioontology.org/ontology/STY/T123|...,,,,...,,,,,1.0,similar to DegU protein; amino acid sequence i...,http://purl.bioontology.org/ontology/STY/T123|...,Gene 1996 Nov 28;181(1-2):147-51,T552038,NLM (2003)


In [47]:
relevant_mesh_cols = mesh_df[["Class ID", "Preferred Label", "Synonyms", "Semantic Types", "Parents"]]

In [48]:
syn_list_df = relevant_mesh_cols
syn_list_df["Synonyms"] = syn_list_df["Synonyms"].str.split("|")
exploded_syns_df = relevant_mesh_cols.explode("Synonyms")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  syn_list_df["Synonyms"] = syn_list_df["Synonyms"].str.split("|")


In [49]:
exploded_syns_df[exploded_syns_df["Class ID"].str.contains("D018365")]

Unnamed: 0,Class ID,Preferred Label,Synonyms,Semantic Types,Parents
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual",Residual Tumor,http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual","Tumour, Residual",http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual",Residual Cancer,http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual",Residual Tumour,http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual",Minimal Residual Disease,http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual","Cancer, Residual",http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual",Residual Neoplasms,http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual",Residual Tumors,http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual",Residual Cancers,http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385
325416,http://purl.bioontology.org/ontology/MESH/D018365,"Neoplasm, Residual",Residual Tumours,http://purl.bioontology.org/ontology/STY/T191,http://purl.bioontology.org/ontology/MESH/D009385


In [50]:
relevant_mesh_cols["is_preferred_name"] = "1"
exploded_syns_df["is_preferred_name"] = "0"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relevant_mesh_cols["is_preferred_name"] = "1"


In [51]:
relevant_mesh_cols = relevant_mesh_cols.drop(columns=["Synonyms"])
exploded_syns_df = exploded_syns_df.drop(columns=["Preferred Label"])
relevant_mesh_cols = relevant_mesh_cols.rename(columns={"Class ID": "cui", "Preferred Label":"concept_name", "Semantic Types":"tui"})
exploded_syns_df = exploded_syns_df.rename(columns={"Class ID": "cui", "Synonyms":"concept_name", "Semantic Types":"tui"})

In [52]:
exploded_syns_df

Unnamed: 0,cui,concept_name,tui,Parents,is_preferred_name
0,http://purl.bioontology.org/ontology/MESH/C000...,,http://purl.bioontology.org/ontology/STY/T004,,0
1,http://purl.bioontology.org/ontology/MESH/C000...,99mTc-HDP,http://purl.bioontology.org/ontology/STY/T130|...,,0
1,http://purl.bioontology.org/ontology/MESH/C000...,99mTc-hydroxyethylene-diphosphonate,http://purl.bioontology.org/ontology/STY/T130|...,,0
2,http://purl.bioontology.org/ontology/MESH/C585345,"Tardbpl-FL protein, zebrafish",http://purl.bioontology.org/ontology/STY/T123|...,,0
2,http://purl.bioontology.org/ontology/MESH/C585345,"Tardbpl protein, zebrafish",http://purl.bioontology.org/ontology/STY/T123|...,,0
...,...,...,...,...,...
347690,http://purl.bioontology.org/ontology/MESH/C089786,POG4POA(POG)5,http://purl.bioontology.org/ontology/STY/T116,,0
347691,http://purl.bioontology.org/ontology/MESH/C066675,"2,2',3,5,5',6-HCBP",http://purl.bioontology.org/ontology/STY/T109,,0
347691,http://purl.bioontology.org/ontology/MESH/C066675,PCB 151,http://purl.bioontology.org/ontology/STY/T109,,0
347691,http://purl.bioontology.org/ontology/MESH/C066675,PCB-151,http://purl.bioontology.org/ontology/STY/T109,,0


In [62]:
combined_df = pd.concat([exploded_syns_df, relevant_mesh_cols], ignore_index=True)

In [73]:
combined_df.sample(5)

Unnamed: 0,cui,concept_name,tui,is_preferred_name
944955,C032509,4-azidobenzylcarazolol,T130|T109,1
272162,C071124,mast cell degranulating peptide (polistes jadw...,T116,0
73388,C581631,"microrna-3940, human",T123|T114,0
432407,C070224,gly-ala-ile,T116,0
733467,C104647,"protein kinase, interferon-inducible double st...",T123|T116,0


In [65]:
combined_df.columns

Index(['cui', 'concept_name', 'tui', 'Parents', 'is_preferred_name'], dtype='object')

In [64]:
combined_df["cui"] = combined_df["cui"].str.replace("http://purl.bioontology.org/ontology/MESH/", "", regex=False)
combined_df["tui"] = combined_df["tui"].str.replace("http://purl.bioontology.org/ontology/STY/", "", regex=False)

In [69]:
combined_df["concept_name"] = combined_df["concept_name"].str.lower()

In [72]:
combined_df =combined_df.drop(columns=["Parents"])

In [80]:
type_rows = combined_df[combined_df["cui"].str.contains("http")]

In [83]:
type_rows["cui"] = type_rows["cui"].str.replace("http://purl.bioontology.org/ontology/STY/", "")

  type_rows["cui"] = type_rows["cui"].str.replace("http://purl.bioontology.org/ontology/STY/", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  type_rows["cui"] = type_rows["cui"].str.replace("http://purl.bioontology.org/ontology/STY/", "")


In [118]:
type_rows2 = type_rows[["cui", "concept_name"]]
type_rows2 = type_rows2.dropna()
type_mapping = type_rows2.to_records(index=False)

In [120]:
type_mapping = {a:b for a, b in type_mapping}

In [81]:
combined_df_no_types = combined_df[~combined_df["cui"].str.contains("http")]

In [98]:
combined_df_no_types["tui"] = combined_df_no_types["tui"].str.replace(r"\|.+", "", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_no_types["tui"] = combined_df_no_types["tui"].str.replace(r"\|.+", "", regex=True)


In [121]:
type_name_array = [type_mapping[tui] if tui in type_mapping.keys() else "NA" for tui in combined_df_no_types["tui"]]

In [123]:
combined_df_no_types["type_name"] = type_name_array

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df_no_types["type_name"] = type_name_array


In [138]:
mesh_df_concepts = combined_df_no_types[~combined_df_no_types["concept_name"].isna()]

In [139]:
len(mesh_df_concepts)

976929

In [140]:
mesh_df_concepts = mesh_df_concepts.reset_index(drop=True)

In [147]:
mesh_df_concepts["ontology"] = "MESH"

In [150]:
mesh_df_concepts.sample(10)

Unnamed: 0,cui,concept_name,tui,is_preferred_name,type_name,ontology
113123,C535386,"arthrogryposis, distal, type 6",T047,0,disease or syndrome,MESH
546220,D012988,"microbiology, soil",T067,0,phenomenon or process,MESH
303258,D005894,"ginseng, korean",T002,0,plant,MESH
122818,D005053,"eugenics, negative",T078,0,idea or concept,MESH
425736,C028174,12903,T109,0,organic chemical,MESH
805839,D013178,sports medicine,T091,1,biomedical occupation or discipline,MESH
353086,C529218,"fmd protein, mouse",T123,0,biologically active substance,MESH
148569,D000596,"acid, amino",T123,0,biologically active substance,MESH
463341,C101137,"rfabp protein, drosophila",T123,0,biologically active substance,MESH
415119,D009727,male nurse,T097,0,professional or occupational group,MESH


In [151]:
mesh_df_concepts_noc = mesh_df_concepts[~mesh_df_concepts["cui"].str.contains("C", regex=False)]

In [152]:
len(mesh_df_concepts_noc)

255178

In [153]:
mesh_df_concepts_noc.sample(10)

Unnamed: 0,cui,concept_name,tui,is_preferred_name,type_name,ontology
73668,D031841,monkshoods,T002,0,plant,MESH
458224,D014238,"trichloroacetate, sodium",T109,0,organic chemical,MESH
202039,D058066,"foot wart, bovine",T047,0,disease or syndrome,MESH
447341,D001022,aortic incompetence,T047,0,disease or syndrome,MESH
250049,D004268,single stranded dna binding proteins,T123,0,biologically active substance,MESH
54029,D019806,levcromakalim,T109,0,organic chemical,MESH
53689,D061387,chlamydophila pneumonia,T047,0,disease or syndrome,MESH
238877,D039603,eukaryotic peptide initiation factor-4g,T123,0,biologically active substance,MESH
255430,D000070660,"fungus, tuckahoe",T004,0,fungus,MESH
119557,D017076,cad-cam,T066,0,machine activity,MESH


In [154]:
mesh_df_concepts_noc = mesh_df_concepts_noc.reset_index(drop=True)

In [156]:
mesh_df_concepts_noc.to_csv("/home/hwatkins/Desktop/neuroOnto/ontologies/mesh/ontology_concepts.csv")

In [184]:
relation_cols = relevant_mesh_cols[["cui", "Parents"]]

In [185]:
relation_cols["cui"] = relation_cols["cui"].str.replace("http://purl.bioontology.org/ontology/MESH/", "")

  relation_cols["cui"] = relation_cols["cui"].str.replace("http://purl.bioontology.org/ontology/MESH/", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relation_cols["cui"] = relation_cols["cui"].str.replace("http://purl.bioontology.org/ontology/MESH/", "")


In [186]:
relation_cols["cui"] = relation_cols["cui"].str.replace("http://purl.bioontology.org/ontology/STY/", "")

  relation_cols["cui"] = relation_cols["cui"].str.replace("http://purl.bioontology.org/ontology/STY/", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relation_cols["cui"] = relation_cols["cui"].str.replace("http://purl.bioontology.org/ontology/STY/", "")


In [194]:
relation_cols_noc = relation_cols[~relation_cols["cui"].str.contains("C")]

In [195]:
relation_cols_nona = relation_cols_noc[~relation_cols_noc["Parents"].isna()]

In [199]:
relation_cols_nona["destinationId"] = relation_cols_nona["Parents"].str.replace("http://purl.bioontology.org/ontology/MESH/" ,"")

  relation_cols_nona["destinationId"] = relation_cols_nona["Parents"].str.replace("http://purl.bioontology.org/ontology/MESH/" ,"")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relation_cols_nona["destinationId"] = relation_cols_nona["Parents"].str.replace("http://purl.bioontology.org/ontology/MESH/" ,"")


In [201]:
relation_cols_nona["destinationId"] = relation_cols_nona["destinationId"].str.replace(r"\|.+", "", regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relation_cols_nona["destinationId"] = relation_cols_nona["destinationId"].str.replace(r"\|.+", "", regex=True)


In [202]:
relation_cols_nona

Unnamed: 0,cui,Parents,destinationId
26,D014437,http://purl.bioontology.org/ontology/MESH/D012282,D012282
44,D019074,http://purl.bioontology.org/ontology/MESH/D001158,D001158
82,D006371,http://purl.bioontology.org/ontology/MESH/D005741,D005741
83,D054881,http://purl.bioontology.org/ontology/MESH/D012...,D012732
86,D020880,http://purl.bioontology.org/ontology/MESH/D009414,D009414
...,...,...,...
347643,D000081006,http://purl.bioontology.org/ontology/MESH/D003951,D003951
347653,D004318,http://purl.bioontology.org/ontology/MESH/D013754,D013754
347659,D017713,http://purl.bioontology.org/ontology/MESH/D016913,D016913
347683,D007990,http://purl.bioontology.org/ontology/MESH/D007253,D007253


In [215]:
relation_cols_nona["destinationId"] = relation_cols_nona["destinationId"].str.replace("http://www.w3.org/2002/07/owl#Thing", "ROOT")

  relation_cols_nona["destinationId"] = relation_cols_nona["destinationId"].str.replace("http://www.w3.org/2002/07/owl#Thing", "ROOT")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  relation_cols_nona["destinationId"] = relation_cols_nona["destinationId"].str.replace("http://www.w3.org/2002/07/owl#Thing", "ROOT")


In [226]:
relation_cols_nona[relation_cols_nona["destinationId"].str.contains("|")].sample(10)

Unnamed: 0,cui,Parents,destinationId
35844,D013804,http://purl.bioontology.org/ontology/MESH/D013...,D013876
144719,D020202,http://purl.bioontology.org/ontology/MESH/D002...,D002543
66935,D007441,http://purl.bioontology.org/ontology/MESH/D007440,D007440
150964,D024801,http://purl.bioontology.org/ontology/MESH/D019636,D019636
11084,D002364,http://purl.bioontology.org/ontology/MESH/D010...,D010750
271190,D012709,http://purl.bioontology.org/ontology/MESH/D000...,D000418
206342,D050765,http://purl.bioontology.org/ontology/MESH/D050682,D050682
127048,D006298,http://www.w3.org/2002/07/owl#Thing|http://pur...,ROOT
21918,D012642,http://purl.bioontology.org/ontology/MESH/D010627,D010627
32581,D000073436,http://purl.bioontology.org/ontology/MESH/D020763,D020763


In [None]:
relation_cols_nona = relation_cols_nona.rename(columns={"cui":"sourceId"})

In [235]:
to_add_concept = pd.read_csv("/home/hwatkins/Desktop/neuroOnto/ontologies/mesh/ontology_concepts.csv", index_col=0)

In [236]:
to_add_concept.sample(10)

Unnamed: 0,cui,concept_name,tui,is_preferred_name,type_name,ontology
13778,D018467,nontherapeutic positive pressure respiration,T047,0,disease or syndrome,MESH
35247,D000987,antisocial behavior,T048,0,mental or behavioral dysfunction,MESH
109940,D050256,antimitotics,T121,0,pharmacologic substance,MESH
189840,D054018,"lasers, arf",T074,0,medical device,MESH
41114,D012244,purple nonsulfur bacteria,T007,0,bacterium,MESH
238903,D017607,aluminum compounds,T197,1,inorganic chemical,MESH
20755,D001004,"disease, anus",T047,0,disease or syndrome,MESH
244040,D002387,cataract extraction,T061,1,therapeutic or preventive procedure,MESH
251596,D007567,"jaundice, neonatal",T047,1,disease or syndrome,MESH
196271,D009624,"noises, transportation",T069,0,environmental effect of humans,MESH


In [237]:
to_add_concept.loc[len(to_add_concept.index)] = ["ROOT", "root concept", "TROOT", "1", "ROOT", "MESH"]

In [245]:
error_codes = ['Q000401', 'Q000639', 'Q000295', 'Q000150', 'Q000188', 'Q000517',
       'Q000032', 'Q000138', 'Q000737', 'Q000493', 'Q000201', 'U000022',
       'Q000145', 'Q000941', 'Q000458', 'Q000556', 'Q000706', 'U000004',
       'Q000503', 'Q000506', 'Q000819', 'Q000187', 'Q000096', 'Q000453',
       'Q000002', 'U000011', 'Q000472', 'U000014', 'U000002', 'Q000208',
       'Q000037', 'Q000134', 'U000021', 'Q000592', 'Q000528', 'Q000166',
       'Q000294', 'Q000008', 'Q000254', 'Q000821', 'Q000382', 'Q000031',
       'Q000601', 'Q000235', 'U000003', 'U000006', 'Q000648', 'Q000523',
       'Q000276', 'Q000191', 'Q000633', 'Q000652', 'Q000009', 'U000009',
       'D005260', 'Q000379', 'Q000662', 'Q000302', 'U000019', 'Q000151',
       'Q000502', 'Q000637', 'U000013', 'Q000628', 'Q000293', 'Q000494',
       'Q000098', 'Q000196', 'U000010', 'Q000534', 'U000001', 'U000020',
       'U000023', 'Q000635', 'Q000193', 'Q000033', 'Q000266', 'U000005',
       'Q000532', 'Q000600', 'Q000139', 'Q000378', 'Q000175',
       'Q000000981', 'Q000172', 'Q000331', 'Q000097', 'Q000473',
       'U000015', 'Q000627', 'U000012', 'Q000451', 'Q000209', 'D008297',
       'Q000178', 'U000008', 'Q000469']

In [246]:
to_add_concept = to_add_concept[~to_add_concept["cui"].isin(error_codes)]

In [247]:
to_add_concept.to_csv("/home/hwatkins/Desktop/neuroOnto/ontologies/mesh/ontology_concepts.csv")

In [227]:
relation_cols_nona.to_csv("/home/hwatkins/Desktop/neuroOnto/ontologies/mesh/ontology_relations.csv")

In [248]:
processed_mesh_path = "/home/hwatkins/Desktop/neuroOnto/ontologies/mesh"
cons = processed_mesh_path + "/ontology_concepts.csv"
rels = processed_mesh_path + "/ontology_relations.csv"

In [249]:
onto = Ontology(cons, rels)

In [251]:
onto.get_root().children

[integumentary system physiological phenomena, cui: D055827,
 immune system phenomena, cui: D055633,
 nervous system diseases, cui: D009422,
 animal diseases, cui: D000820,
 cells, cui: D002477,
 mesh qualifiers, cui: U000018,
 population characteristics, cui: D011154,
 behavioral disciplines and activities, cui: D004191,
 respiratory system, cui: D012137,
 stomatognathic system, cui: D013284,
 circulatory and respiratory physiological phenomena, cui: D002943,
 immune system diseases, cui: D007154,
 infections, cui: D007239,
 neoplasms, cui: D009369,
 genetic phenomena, cui: D055614,
 chemically-induced disorders, cui: D064419,
 polycyclic compounds, cui: D011083,
 pharmaceutical preparations, cui: D004364,
 health care facilities, manpower, and services, cui: D005159,
 eukaryota, cui: D056890,
 tissues, cui: D014024,
 male urogenital diseases, cui: D052801,
 health occupations, cui: D006281,
 geographic locations, cui: D005842,
 ocular physiological phenomena, cui: D009799,
 organic c