# Create custom ontologies specific to corpus
Produce set of terms extracted from reports fro P. to class into hiearchy of terms for a custom neurological dictionary

use the same ontology format as for SNOMED and RADLEX ontologies so that is can be fed into the KnowledgeBase object

- needs both relations and concepts file
- for now use the dictionaries hand-crafted in original matlab work 

# Imports

In [59]:
from tqdm import tqdm
import random
import json
import re
import numpy as np

In [23]:
import pandas as pd

In [24]:
import spacy
from spacy import displacy
from neuroNLP.pipes import create_pipeline
from neuroNLP.linking import KnowledgeBase, CandidateGenerator

# Load Data

In [4]:
FILTERED_SAVE = "/home/hwatkins/Desktop/neuroNLP_assets/data/cleaned_report_data/filtered_reports.csv"
report_df = pd.read_csv(
    FILTERED_SAVE, parse_dates=["study_date", "request_date", "dob", "dod"]
)

In [5]:
report_df["study_date"].min()

Timestamp('1999-02-08 11:17:00')

In [6]:
sample_reports = report_df["report_text"].sample(40000)

# Pipeline

In [7]:
model_dir = "/home/hwatkins/Desktop/neuroNLP/models"

In [8]:
nlp = create_pipeline(
    model_dir, pipes=["tagger", "parser", "word_vectorizer", "sectioner", "custom_ner"]
)

In [9]:
common_anatomy_modifiers = [
    "left",
    "right",
    "anterior",
    "posterior",
    "superior",
    "inferior",
    "medial",
    "lateral",
    "proximal",
    "distal",
    "central",
    "peripheral",
    "upper",
    "lower",
    "dorsal",
    "ventral",
    "horizontal",
    "vertical",
]

# Create regex aliases for concepts

In [25]:
CUSTOM_EDGE_FILE = "/home/hwatkins/Desktop/neuroNLP_assets/data/neuro_dictionaries/custom_coarse_onto/custom_coarse_onto_edges.csv"
CUSTOM_REGEX_FILE = "/home/hwatkins/Desktop/neuroNLP_assets/data/neuro_dictionaries/custom_coarse_onto/total_regex_list.csv"
CUSTOM_ALIAS_FILE = "/home/hwatkins/Desktop/neuroNLP_assets/data/neuro_dictionaries/custom_coarse_onto/custom_coarse_onto_aliases.csv"
REGEX_ALIASES = "/home/hwatkins/Desktop/neuroNLP_assets/data/neuro_dictionaries/custom_coarse_onto/regex_extracted_aliases.csv"

In [26]:
custom_edges = pd.read_csv(CUSTOM_EDGE_FILE)
custom_regex = pd.read_csv(CUSTOM_REGEX_FILE)
custom_aliases =  pd.read_csv(CUSTOM_ALIAS_FILE)
regex_aliases =  pd.read_csv(REGEX_ALIASES)

## Find aliases
for each concept, look through reports to find any possible regex matches

In [62]:
regex_match_dict = {}
for text in tqdm(sample_reports):
    for i, (name, pattern) in enumerate(zip(custom_regex["name"], custom_regex["regex_pattern"])):
        try:
            matches = re.search(pattern, text.lower())
            if matches is not None:
                matches = matches.group()
                if name not in regex_match_dict.keys():
                    regex_match_dict[name] = [matches]
                else:
                    regex_match_dict[name].append(matches)
        except:
            print("error at {}, on name {}".format(i,name))

top_n_dict = {}
for key, val in regex_match_dict.items():
    counts = [i for i in set(val) if val.count(i)>5]
    if counts:
        top_n_dict[key]=counts

  3%|▎         | 1013/40000 [01:51<1:11:19,  9.11it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 16%|█▋        | 6512/40000 [11:55<1:00:53,  9.16it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 24%|██▍       | 9553/40000 [17:30<53:28,  9.49it/s]  

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 43%|████▎     | 17255/40000 [31:33<40:18,  9.41it/s] 

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 54%|█████▎    | 21433/40000 [39:13<31:50,  9.72it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 60%|█████▉    | 23833/40000 [43:37<28:53,  9.33it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 62%|██████▏   | 24935/40000 [45:38<26:14,  9.57it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 69%|██████▉   | 27539/40000 [50:25<22:35,  9.20it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 69%|██████▉   | 27781/40000 [50:51<20:59,  9.70it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 79%|███████▊  | 31499/40000 [57:39<14:45,  9.60it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 83%|████████▎ | 33283/40000 [1:00:55<11:10, 10.01it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

 86%|████████▋ | 34533/40000 [1:03:12<09:24,  9.68it/s]

error at 0, on name inferior frontal gyrus
error at 1, on name superior frontal gyrus
error at 2, on name middle frontal gyrus
error at 3, on name medial frontal gyrus
error at 4, on name frontal gyrus
error at 5, on name inferior transverse frontopolar gyrus
error at 6, on name superior transverse frontopolar gyrus
error at 7, on name middle transverse frontopolar gyrus
error at 8, on name anterior orbital frontal gyrus
error at 9, on name lateral orbital frontal gyrus
error at 10, on name medial orbital frontal gyrus
error at 11, on name posterior orbital frontal gyrus
error at 12, on name orbital frontal gyrus
error at 13, on name anterior transverse temporal gyrus
error at 14, on name posterior transverse temporal gyrus
error at 15, on name transverse temporal gyrus
error at 16, on name inferior temporal gyrus
error at 17, on name middle temporal gyrus
error at 18, on name superior temporal gyrus
error at 19, on name occipital temporal gyrus
error at 20, on name temporal gyrus
erro

100%|██████████| 40000/40000 [1:13:12<00:00,  9.11it/s]


In [64]:
regex_aliases = [(key, j) for key, val in top_n_dict.items() for j in val]

In [68]:
pd_regex_aliases = pd.DataFrame(regex_aliases, columns=["name","alias"])

In [15]:
pd_regex_aliases = pd_regex_aliases.drop(columns=["Unnamed: 0"])

In [16]:
pd_regex_aliases.to_csv(REGEX_ALIASES, index=False)

## Align aliases to concepts
some of the aliases match to concepts deeper in the anatomy or pathology tree then currently supported, resolve by moving them up by changing their source

In [27]:
all_aliases = pd.concat([custom_aliases,regex_aliases], ignore_index=True)

In [28]:
not_in_edges = all_aliases[~all_aliases["name"].isin(custom_edges["source"])]
in_edges = all_aliases[all_aliases["name"].isin(custom_edges["source"])]
sub_edges = not_in_edges[not_in_edges["name"].isin(in_edges["alias"])]

In [29]:
parent_names = []
for row in sub_edges.iterrows():
    parent = in_edges[in_edges["alias"]==row[1]["name"]]["name"]
    parent_names.append((parent.values[0], row[1]["alias"]))

In [30]:
upshifted_sub_edges = pd.DataFrame(parent_names, columns=["name", "alias"])

In [31]:
upshifted_sub_edges.head()

Unnamed: 0,name,alias
0,limbic lobe,subiculum
1,limbic lobe,fimbria of hippocampus
2,limbic lobe,hippocampal commissure
3,limbic lobe,alveus
4,limbic lobe,supramammillary decussation


In [32]:
not_sub_edges = not_in_edges[~not_in_edges["name"].isin(in_edges["alias"])]

In [40]:
sub_sub_edges = not_sub_edges[not_sub_edges["name"].isin(upshifted_sub_edges["alias"])]
not_sub_sub_edges = not_sub_edges[~not_sub_edges["name"].isin(upshifted_sub_edges["alias"])]

In [38]:
parent_names = []
for row in sub_sub_edges.iterrows():
    parent = upshifted_sub_edges[upshifted_sub_edges["alias"]==row[1]["name"]]["name"]
    parent_names.append((parent.values[0], row[1]["alias"]))

In [39]:
upshifted_sub_sub_edges = pd.DataFrame(parent_names, columns=["name", "alias"])

In [43]:
not_sub_sub_edges["name"].value_counts()[:30]

obliteration                          5
encasement                            4
hypoxi                                3
cerebral sulci                        3
vermis                                3
syrin                                 3
ring-enhancing                        3
meckel's cave                         3
ectasia                               3
small vessel changes                  3
red nucleus                           2
subcortical white matter              2
posterior limb of internal capsule    2
bone                                  2
dentate nucleus                       2
frontal cortex                        2
cortical sulci                        2
confluence of sinuses                 2
subarachnoid haemorrhage              2
scarring                              2
insula cortex                         2
tentorium cerebelli                   2
occipital cortex                      2
glossotonsillar sulcus                2
cingulate gyrus                       2


In [44]:
all_corrected_aliases = pd.concat([in_edges,upshifted_sub_edges,upshifted_sub_sub_edges], ignore_index=True)

In [48]:
all_corrected_aliases_no_dupes = all_corrected_aliases[~all_corrected_aliases.duplicated()]

In [54]:
all_corrected_aliases_no_dupes = all_corrected_aliases_no_dupes[~all_corrected_aliases_no_dupes["alias"].isin(custom_edges["source"])]

In [57]:
all_corrected_aliases_no_dupes = all_corrected_aliases_no_dupes[~all_corrected_aliases_no_dupes["alias"].duplicated()]

# Create custom ontology

## create concept dictionary

In [72]:
concept_df = pd.DataFrame(data=custom_edges["source"].tolist(), columns=["concept_name"])
concept_df["type_name"] = np.nan
concept_df["tui"] = np.nan

In [74]:
root_concept_df = pd.DataFrame(data=["root concept"], columns=["concept_name"])
root_concept_df["type_name"] = "ROOT"
root_concept_df["tui"] = "T0"

In [75]:
custom_onto_df = pd.concat([concept_df, root_concept_df], ignore_index=True) 

In [76]:
custom_onto_df["cui"] = pd.Series(["C"+str(i) for i in range(len(custom_onto_df))])

In [77]:
custom_onto_df["is_preferred_name"] = 1
custom_onto_df["ontology"] = "CUSTOM-QS-NEURO"

## Create relationship dictionary
encode hierarchical relations between entities`

In [79]:
sources, sinks = custom_edges["sink"].tolist(), custom_edges["source"].tolist()

In [80]:
sourceids = [custom_onto_df[custom_onto_df["concept_name"]==name]["cui"].values[0] for name in  sources]
sinkids = [custom_onto_df[custom_onto_df["concept_name"]==name]["cui"].values[0] for name in  sinks]

In [81]:
custom_rel_df = pd.DataFrame()

In [82]:
custom_rel_df["sourceId"] = sinkids
custom_rel_df["destinationId"] = sourceids

In [83]:
custom_rel_df["id"] = pd.Series(["R"+str(i) for i in range(len(custom_rel_df))])

## add aliases to concept dictionary

In [88]:
alias_cui =[]
alias_name = []
for row in all_corrected_aliases_no_dupes.iterrows():
    name = row[1]["alias"]
    source = row[1]["name"]
    cui = custom_onto_df[custom_onto_df["concept_name"]==source]["cui"].values[0]
    alias_cui.append(cui)
    alias_name.append(name)

In [100]:
alias_df = pd.DataFrame(data=zip(alias_name, alias_cui), columns=["concept_name", "cui"])
alias_df["is_preferred_name"] = 0
alias_df["ontology"] = "CUSTOM-QS-NEURO"
alias_df["type_name"] = np.nan
alias_df["tui"] = np.nan

In [102]:
combined_onto_df = pd.concat([alias_df, custom_onto_df], ignore_index=True)

In [110]:
combined_onto_df.sample(5)

Unnamed: 0,concept_name,cui,is_preferred_name,ontology,type_name,tui
1036,enlarging,C18,0,CUSTOM-QS-NEURO,,
638,high signal intensity,C4,0,CUSTOM-QS-NEURO,,
1399,vertebral artery,C158,1,CUSTOM-QS-NEURO,,
1247,interval change,C6,1,CUSTOM-QS-NEURO,,
837,middle occipital gyrus,C195,0,CUSTOM-QS-NEURO,,


## Find types for concepts

In [142]:
CUSTOM_ONTO_CONC = "/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/coarse_custom_ontology.csv"
CUSTOM_ONTO_REL = "/home/hwatkins/Desktop/neuroNLP_assets/data/ontology_data/coarse_custom_ontology_is_a_rels.csv"

In [143]:
combined_onto_df.to_csv(CUSTOM_ONTO_CONC)
custom_rel_df.to_csv(CUSTOM_ONTO_REL)

In [144]:
new_kb = KnowledgeBase(CUSTOM_ONTO_CONC, CUSTOM_ONTO_REL)

In [126]:
types = []
for cui in combined_onto_df["cui"]:
    concept = new_kb[cui]
    depth = concept.depth
    if depth>0:
        for i in range(depth):
            concept = concept.parents[0]
    types.append(concept.name)

In [132]:
type_ids = [typedic[i] for i in types]

In [131]:
typedic = {"root concept":"T0", "pathology":"T1", "anatomy":"T2"}

In [133]:
combined_onto_df["type_name"] = pd.Series(types)
combined_onto_df["tui"] = pd.Series(type_ids)

# Old stuff

In [127]:
UPDATED_CONCEPTS = "/home/hwatkins/Desktop/neuroNLP_assets/data/neuro_dictionaries/new_regex_patterns.json"
UPDATED_ANATOMY_GROUPS = "/home/hwatkins/Desktop/neuroNLP_assets/data/neuro_dictionaries/anatomy_groups.json"
UPDATED_PATHOLOGY_GROUPS = "/home/hwatkins/Desktop/neuroNLP_assets/data/neuro_dictionaries/pathology_groups.json"

In [128]:
with open(UPDATED_CONCEPTS, "r") as file:
    new_concept_pats = json.load(file)
with open(UPDATED_ANATOMY_GROUPS, "r") as file:
    new_anatomy_groups = json.load(file)
with open(UPDATED_PATHOLOGY_GROUPS, "r") as file:
    new_pathology_groups = json.load(file)

In [129]:
new_pathology_groups = ["pathology_concept", new_pathology_groups]
new_anatomy_groups = ["anatomy_concept", new_anatomy_groups]

In [130]:
def get_list_nodes(G, nodes):
    for term in G:
        if type(term)==str:
            nodes.append(term)
        else:
            nodes = get_list_nodes(term, nodes)
    
    return nodes

In [131]:
all_pathology_keys = get_list_nodes(new_pathology_groups, [])
all_anatomy_keys = get_list_nodes(new_anatomy_groups, [])

## get edges from dict tree
encode hierarchical relations between entities`

In [158]:
def get_edges(G, source, edges):
    
    if not G:
        return edges
    
    for key, val in G.items():
        edges.append((source, key))
        edges = get_edges(val, key, edges)
        
    return edges

In [164]:
all_pathology_edges = get_edges(pathology_dict, "root_concept", [])
all_anatomy_edges = get_edges(anatomy_dict, "root_concept", [])

## Alternative, new edges

In [139]:
def get_list_edges(G, source, edges):
    edges.append((source, G[0]))
    for child in G[1]:
        edges.append((G[0], child[0]))
        for subchild in child[1]:
            edges.append((child[0], subchild))
    return edges

In [140]:
all_pathology_edges = get_list_edges(new_pathology_groups, "root_concept", [])
all_anatomy_edges = get_list_edges(new_anatomy_groups, "root_concept", [])

In [132]:
path_onto_df = pd.DataFrame(data=all_pathology_keys, columns=["concept_name"])
path_onto_df["type_name"] = np.nan
path_onto_df["tui"] = np.nan

In [133]:
anat_onto_df = pd.DataFrame(data=all_anatomy_keys, columns=["concept_name"])
anat_onto_df["type_name"] = "anatomy"
anat_onto_df["tui"] = "T2"