# Create custom ontology

create a custom neurological ontology specific to queen square from
- TA2
- MESH
- RADLEX

In [134]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import re
from neuroNLP.ontology import Ontology, CandidateGenerator

In [2]:
radlex_df = pd.read_csv("sources/RADLEX.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
mesh_df = pd.read_csv("sources/MESH.csv")

  interactivity=interactivity, compiler=compiler, result=result)


# TA2 anatomy
## Convert TA2 format
convert TA2 anatomical ontology to an edge list format

In [2]:
ta2_df_raw = pd.read_excel("SenseOrgans_levelled.xlsx")
ta2_df = ta2_df_raw.dropna(how="all",axis=0)
ta2_df = ta2_df.dropna(how="all",axis=1)

In [3]:
ta2_df.columns = ta2_df.iloc[0]
ta2_df = ta2_df.drop(ta2_df.index[0])

In [4]:
level_name = "Level "

In [5]:
ta2_df[level_name][(ta2_df[level_name]==9) & ta2_df["UK English "].str.isupper()] = 3

In [6]:
ta2_df[level_name][(ta2_df[level_name]==9)] = 4

In [7]:
ta2_df[level_name][ta2_df["UK English "].str.startswith("   ")] = 7

In [8]:
ta2_df[level_name][ta2_df["UK English "].str.startswith("      ")] = 8

In [9]:
ta2_df[level_name][(ta2_df[level_name]==10)] = 6

In [10]:
ta2_df[level_name].value_counts()

6    157
7     93
4     30
8     21
3      7
2      5
1      4
0      1
Name: Level , dtype: int64

In [11]:
ta2_df["UK English "] = ta2_df["UK English "].str.lower()
ta2_df["UK English "] = ta2_df["UK English "].str.strip()
ta2_df["UK English "] = ta2_df["UK English "].str.strip("()")
ta2_df["UK English "] = ta2_df["UK English "].str.strip("♂♀")
ta2_df["UK English "] = ta2_df["UK English "].str.strip()

In [12]:
ta2_df.columns

Index(['c', 'UK English ', 'US English ', 'English synonym ', 'Other ',
       'Level '],
      dtype='object', name=5)

In [13]:
ta2_df = ta2_df.reset_index()
ta2_df = ta2_df.rename(columns={4: "id", "c": "ta_cui", "UK English ": "uk_english", "US English ": "us_english", 'English synonym ':"english_synonym",'Other ':"other", level_name:"level"})

In [14]:
ta2_df.to_csv("senseorgs_df_processed.csv", index=False)

## Convert to similar ontology format as radlex and mesh
Find the parent of each concept

In [162]:
cns_df = pd.read_csv("cns_df_processed.csv")
senorg_df = pd.read_csv("senseorgs_df_processed.csv")

In [163]:
def find_parent(x):
    level = x["level"]
    index = x.name
    for idx, row in cns_df[index::-1].iterrows():
        if row["level"]<level:
            return row["ta_cui"]
    return "root"
    

In [164]:
senorg_df["parent_cui"] = [find_parent(row) for idx, row in senorg_df.iterrows()]
cns_df["parent_cui"] = [find_parent(row) for idx, row in cns_df.iterrows()]

In [165]:
cns_df.head()

Unnamed: 0,index,ta_cui,uk_english,us_english,english_synonym,other,level,parent_cui
0,5,5363,nervous system,Nervous system,,Systema nervorum; Neurologia,0,root
1,6,5364,central nervous system,CENTRAL NERVOUS SYSTEM,,Systema nervorum centrale,1,5363
2,7,5365,grey matter,Gray matter,,,6,5364
3,8,5366,white matter,White matter,,,6,5364
4,9,5367,reticular formation,Reticular formation,,,6,5364


In [166]:
cns_df = cns_df.rename(columns={"ta_cui":"source_ontology_id", "uk_english":"concept_name", "parent_cui":"source_ontology_parent", "english_synonym":"synonyms"})

In [167]:
senorg_df = senorg_df.rename(columns={"ta_cui":"source_ontology_id", "uk_english":"concept_name", "parent_cui":"source_ontology_parent", "english_synonym":"synonyms"})

In [168]:
anatomy_df = pd.concat([cns_df,senorg_df])

In [169]:
anatomy_df=anatomy_df.drop(["index","us_english","other","level"],axis=1)

In [170]:
anatomy_df["source_ontology"] = "TA2"

In [171]:
anatomy_df["synonyms"] = anatomy_df["synonyms"].str.replace("; ", "|")

In [175]:
anatomy_df.head()

Unnamed: 0,source_ontology_id,concept_name,synonyms,source_ontology_parent,source_ontology
0,T5363,nervous system,,QSX2,TA2
1,T5364,central nervous system,,T5363,TA2
2,T5365,grey matter,,T5364,TA2
3,T5366,white matter,,T5364,TA2
4,T5367,reticular formation,,T5364,TA2


In [173]:
anatomy_df["source_ontology_id"] = "T" + anatomy_df["source_ontology_id"].astype(str)
anatomy_df["source_ontology_parent"] = "T" + anatomy_df["source_ontology_parent"].astype(str)

In [174]:
anatomy_df.loc[anatomy_df["source_ontology_parent"]=="Troot", "source_ontology_parent"] = "QSX2"

# Pathologies

In [176]:
def contain_string(string, uid):
    split = string.split("|")
    if uid in split:
        return True
    else:
        return False

In [177]:
def find_decendents(uid, df, codes):
    child_rows = df[df["Parents"].apply(lambda x: contain_string(x, uid))]
    child_codes = child_rows["Class ID"].tolist()
    codes.extend(child_codes)
    for child_id in child_codes:
        find_decendents(child_id, df, codes)
        
    

In [178]:
mesh_df_with_parents = mesh_df[~mesh_df["Parents"].isna()]

In [179]:
mesh_url = "http://purl.bioontology.org/ontology/MESH/"

In [180]:
path_dict = {"cardiovascular disease":{"id":["D002318"]}, 
             "toxic":{"id":["D064419"]},
             "traumatic":{"id":["D006259"]},
             "developmental":{"id":["D009358"]},
             "endocrine diseases":{"id":["D004700"]}, 
             "eye diseases":{"id":["D005128"]}, 
             "blood disorders":{"id":["D006425"]},
             "immune disorders":{"id":["D007154"]},
             "connective tissue disorders":{"id":["D003240"]},
             "cns infections":{"id":["D002394"]},
             "cns neoplasms":{"id":["D009423"]},
             "cns diseases":{"id":["D009422"]},
             "cns interventions":{"id":["D019635"]},
             "metabolic":{"id":["D009750"]},
             "therapeutic":{"id":["D013812"]}}

In [181]:
for branch_name in path_dict.keys():
    branch_decendents = []
    for code in path_dict[branch_name]["id"]:
        print("finding code: {}".format(code))
        code_decendents = []
        url = mesh_url + code
        find_decendents(url, mesh_df_with_parents, code_decendents)
        branch_decendents.extend(code_decendents)
    path_dict[branch_name]["decendent_codes"] = branch_decendents

finding code: D002318
finding code: D064419
finding code: D006259
finding code: D009358
finding code: D004700
finding code: D005128
finding code: D006425
finding code: D007154
finding code: D003240
finding code: D002394
finding code: D009423
finding code: D009422
finding code: D019635
finding code: D009750
finding code: D013812


In [182]:
col_names = {"Class ID":"source_ontology_id","Synonyms":"synonyms", "Preferred Label":"concept_name", "Parents":"source_ontology_parent"} 

In [183]:
all_decendent_path_codes = [code for key, val in path_dict.items() for code in val["decendent_codes"]]

In [184]:
decendent_path_items = mesh_df_with_parents[mesh_df_with_parents["Class ID"].isin(all_decendent_path_codes)]

In [185]:
to_drop = list(filter(lambda x: x not in col_names.keys(), decendent_path_items.columns)) 

In [186]:
decendent_path_items = decendent_path_items.rename(columns=col_names)
decendent_path_items = decendent_path_items.drop(to_drop,axis=1)
decendent_path_items["source_ontology"] = "MESH"
decendent_path_items["source_ontology_id"] = decendent_path_items["source_ontology_id"].str.replace(mesh_url, "", regex=False)
decendent_path_items["source_ontology_parent"] = decendent_path_items["source_ontology_parent"].str.replace(mesh_url, "",regex=False)
decendent_path_items["concept_name"] = decendent_path_items["concept_name"].str.lower()
decendent_path_items["synonyms"] = decendent_path_items["synonyms"].str.lower()

In [187]:
decendent_path_items.head()

Unnamed: 0,source_ontology_id,concept_name,synonyms,source_ontology_parent,source_ontology
172,D065707,schizencephaly,"cyst, schizencephalic|cysts, schizencephalic|s...",D065704,MESH
316,D017379,"hypertrophy, left ventricular","hypertrophies, left ventricular|left ventricul...",D006332,MESH
336,D012162,retinal degeneration,"degeneration, retinal|retinal degenerations|de...",D015785|D012164,MESH
460,D007715,klippel-trenaunay-weber syndrome,"angioosteohypertrophy syndromes|syndromes, ang...",D000798,MESH
491,D055676,viscosupplementation,viscosupplementations,D019637|D007270,MESH


# Descriptors

In [188]:
radlex_df_with_parents = radlex_df[~radlex_df["Parents"].isna()]

In [189]:
def find_decendents_radlex(uid, df, codes):
    child_rows = df[df["Parents"]==uid]
    child_codes = child_rows["Class ID"].tolist()
    codes.extend(child_codes)
    for child_id in child_codes:
        find_decendents_radlex(child_id, df, codes)

In [190]:
radlex_url = "http://radlex.org/RID/"

In [191]:
descr_dict = {"workflow":{"id":["RID45812"]},
              "foreign object":{"id":["RID34861"]}, 
              "imaging procedure":{"id":["RID13060"]}, 
              "interventional imaging":{"id":["RID11005"]},
              "treatment":{"id":["RID8"]},
              "signs":{"id":["RID29023"]}, 
              "confidence":{"id":["RID29"]}, 
              "image quality":{"id":["RID10","RID39077"]}, 
              "image procedure properties":{"id":["RID10638"]}, 
              "quantity":{"id":["RID5761"]},
              "size":{"id":["RID5772"]},
              "distribution":{"id":["RID5958"]}, 
              "motion":{"id":["RID5921","RID34327"]},
              "extent":{"id":["RID5683"]},
              "temporal":{"id":["RID5716"]},
              "healing":{"id":["RID6351"]},
              "aetiological":{"id":["RID5657"]},
              "disruptiveness":{"id":["RID5675"]},
              "contrast":{"id":["RID34300","RID6058"]},
              "composition":{"id":["RID5738"]},
              "morphology":{"id":["RID5863","RID38770"]},
              "signal":{"id":["RID6049"]},
              "lesional":{"id":["RID43356","RID5972"]},
              "aggressiveness":{"id":["RID5675"]},
              "orientation location":{"id":["RID5817","RID5851"]}}

In [192]:
for branch_name in descr_dict.keys():
    branch_decendents = []
    for code in descr_dict[branch_name]["id"]:
        print("finding code: {}".format(code))
        code_decendents = []
        url = radlex_url + code
        find_decendents_radlex(url, radlex_df_with_parents, code_decendents)
        branch_decendents.extend(code_decendents)
    descr_dict[branch_name]["decendent_codes"] = branch_decendents

finding code: RID45812
finding code: RID34861
finding code: RID13060
finding code: RID11005
finding code: RID8
finding code: RID29023
finding code: RID29
finding code: RID10
finding code: RID39077
finding code: RID10638
finding code: RID5761
finding code: RID5772
finding code: RID5958
finding code: RID5921
finding code: RID34327
finding code: RID5683
finding code: RID5716
finding code: RID6351
finding code: RID5657
finding code: RID5675
finding code: RID34300
finding code: RID6058
finding code: RID5738
finding code: RID5863
finding code: RID38770
finding code: RID6049
finding code: RID43356
finding code: RID5972
finding code: RID5675
finding code: RID5817
finding code: RID5851


In [193]:
all_decendent_descr_codes = [code for key, val in descr_dict.items() for code in val["decendent_codes"]]

In [194]:
decendent_descr_items = radlex_df_with_parents[radlex_df_with_parents["Class ID"].isin(all_decendent_descr_codes)]

## Convert to format

In [195]:
to_drop = list(filter(lambda x: x not in col_names.keys(), decendent_descr_items.columns)) 

In [196]:
decendent_descr_items = decendent_descr_items.rename(columns=col_names)
decendent_descr_items = decendent_descr_items.drop(to_drop,axis=1)
decendent_descr_items["source_ontology"] = "RADLEX"
decendent_descr_items["source_ontology_id"] = decendent_descr_items["source_ontology_id"].str.replace(radlex_url, "", regex=False)
decendent_descr_items["source_ontology_parent"] = decendent_descr_items["source_ontology_parent"].str.replace(radlex_url, "",regex=False)
decendent_descr_items["concept_name"] = decendent_descr_items["concept_name"].str.lower()
decendent_descr_items["synonyms"] = decendent_descr_items["synonyms"].str.lower()

In [197]:
decendent_descr_items[~decendent_descr_items["synonyms"].isna()].sample(10)

Unnamed: 0,source_ontology_id,concept_name,synonyms,source_ontology_parent,source_ontology
21267,RID45924,final report approved,reported,RID45812,RADLEX
26196,RID3874,mass,area of enhancement|focus|density|lesion|nodul...,RID38780,RADLEX
42383,RID35176,dripping candle wax sign,dripping candle sign|dripping candlewax sign|f...,RID29023,RADLEX
28324,RID39225,nonevaluable,not evaluable,RID39077,RADLEX
45447,RID45959,prepworknotcomplete,prep work not complete,RID45953,RADLEX
46676,RID45992,risdown,ris down,RID45985,RADLEX
7591,RID45951,tracerorder,tracer order,RID45812,RADLEX
35609,RID34404,mickey mouse sign of liver,mickey mouse sign,RID29023,RADLEX
13584,RID45927,reqlabsreviewed,lab tests reviewed|required labs reviewed|requ...,RID45812,RADLEX
46602,RID10767,radial k-space trajectory,projection reconstruction|projection acquisition,RID10766,RADLEX


# Combine all three components

In [198]:
total_decendent_onto = pd.concat([decendent_descr_items, decendent_path_items, anatomy_df])

In [199]:
total_decendent_onto.sample(5)

Unnamed: 0,source_ontology_id,concept_name,synonyms,source_ontology_parent,source_ontology
664,T6027,central reticular nucleus,,T6026,TA2
9584,RID38662,innumerable,,RID5765,RADLEX
24979,RID35452,polka-dot sign,polka-dot pattern,RID29023,RADLEX
132,T5495,middle temporal gyrus,,T5488,TA2
13345,D055091,bronchomalacia,bronchi chondromalacia|chondromalacia of bronc...,D055089,MESH


## Add custom top layers

In [200]:

custom_layer = {"source_ontology_id": ["QSX0", "QSX1", "QSX2","QSX3"], 
                "concept_name":["root_entity", "pathology", "anatomy","descriptor"],
                "synonyms":["","","",""],
                "source_ontology_parent":["","QSX0","QSX0","QSX0"],
                "source_ontology":["QSXNEURO", "QSXNEURO","QSXNEURO","QSXNEURO"]}

custom_df = pd.DataFrame(data=custom_layer)

In [201]:
top_mesh_layer = {"source_ontology_id": ["QSX"+str(i+4) for i in range(15)], 
                "concept_name":["cardiovascular disease",
                                "toxic",
                                "traumatic",
                                "developmental",
                                "endocrine diseases",
                                "eye diseases",
                                "blood disorders",
                                "immune disorders",
                                "connective tissue disorders",
                                "cns infections",
                                "cns neoplasms",
                                "cns diseases",
                                "cns interventions",
                                "metabolic",
                                "therapeutic"
                               ],
                "synonyms":["" for i in range(15)],
                "source_ontology_parent":["QSX1" for i in range(15)],
                "source_ontology":["QSXNEURO" for i in range(15)]}

top_radlex_layer = {"source_ontology_id": ["QSX"+str(i+19) for i in range(36)], 
                "concept_name":["workflow",
                                "foreign object",
                                "imaging procedure",
                                "interventional imaging",
                                "treatment",
                                "signs",
                                "confidence",
                                "image quality",
                                "image procedure properties",
                                "quantity",
                                "size",
                                "distribution",
                                "motion",
                                "extent",
                                "temporal",
                                "healing",
                                "aetiological",
                                "disruptiveness",
                                "contrast",
                                "composition",
                                "morphology",
                                "signal",
                                "lesional",
                                "aggressiveness",
                                "orientation location",
                                "administrative descriptor",
                                "extrinsic descriptor",
                                "interventional descriptor",
                                "meta-descriptor",
                                "interpretive",
                                "instrumental",
                                "biological descriptor",
                                "natural descriptor",
                                "absolute metric",
                                "relative metric",
                                "intrinsic characteristic",
                               ],
                "synonyms":["" for i in range(36)],
                "source_ontology_parent":["QSX41",
                                          "QSX45",
                                          "QSX3",
                                          "QSX46",
                                          "QSX46",
                                          "QSX47",
                                          "QSX47",
                                          "QSX49",
                                          "QSX49",
                                          "QSX52",
                                          "QSX52",
                                          "QSX52",
                                          "QSX52",
                                          "QSX48",
                                          "QSX48",
                                          "QSX48",
                                          "QSX48",
                                          "QSX48",
                                          "QSX54",
                                          "QSX54",
                                          "QSX54",
                                          "QSX54",
                                          "QSX54",
                                          "QSX53",
                                          "QSX53",
                                          "QSX3",
                                          "QSX3",
                                          "QSX50",
                                          "QSX51",
                                          "QSX51",
                                          "QSX3",
                                          "QSX3",
                                          "QSX50",
                                          "QSX51",
                                          "QSX51",
                                          "QSX51"
                                         ],
                "source_ontology":["QSXNEURO" for i in range(36)]}

In [202]:
top_mesh_df = pd.DataFrame(data=top_mesh_layer)
top_radlex_df = pd.DataFrame(data=top_radlex_layer)

In [203]:
total_onto = pd.concat([total_decendent_onto,top_radlex_df,top_mesh_df,custom_df])

In [204]:
total_onto["cui"] = ["QS" + str(i) for i in range(len(total_onto))]
total_onto["ontology"] = "QSNEURO"
total_onto["is_preferred_name"] = 1

## drop duplicate names

In [205]:
total_onto = total_onto.drop(total_onto[(total_onto["concept_name"].duplicated(keep=False)) & (total_onto["source_ontology"]=="RADLEX")].index)

In [206]:
total_onto["concept_name"].duplicated().value_counts()

False    7959
Name: concept_name, dtype: int64

## Split up synonyms

In [207]:
total_onto["synonyms"] = total_onto["synonyms"].str.lower()

In [208]:
total_onto.loc[total_onto["synonyms"].isna(), "synonyms"] = "" #total_onto.loc[total_onto["synonyms"].isna(),"concept_name"]

In [209]:
total_onto.loc[total_onto["synonyms"].str.isspace(), "synonyms"] = "" #total_onto.loc[total_onto["synonyms"].str.isspace(),"concept_name"]

In [210]:
total_onto.loc[total_onto["synonyms"]=="", "synonyms"] = total_onto.loc[total_onto["synonyms"]=="", "concept_name"]

In [211]:
total_onto.loc[total_onto["synonyms"]!=total_onto["concept_name"], "synonyms"] = total_onto.loc[total_onto["synonyms"]!=total_onto["concept_name"], "concept_name"]+"|"+ total_onto.loc[total_onto["synonyms"]!=total_onto["concept_name"], "synonyms"]

In [212]:
total_onto["synonyms"] = total_onto["synonyms"].str.split("|")

In [213]:
total_onto = total_onto.explode("synonyms")

In [214]:
total_onto["is_preferred_name"] = total_onto["concept_name"] == total_onto["synonyms"] 

In [215]:
total_onto = total_onto.rename(columns={"concept_name":"preferred_name", "synonyms":"concept_name"})

In [216]:
total_onto.sample(10)

Unnamed: 0,source_ontology_id,preferred_name,concept_name,source_ontology_parent,source_ontology,cui,ontology,is_preferred_name
287143,D052496,"lipodystrophy, familial partial","lipodystrophy, familial, of limbs and lower trunk",D000083083|D008060,MESH,QS6127,QSNEURO,False
133167,D020216,carotid-cavernous sinus fistula,traumatic carotid-cavernous sinus fistula,D001164|D002340|D020212,MESH,QS4243,QSNEURO,False
39038,RID45831,attemptedcritnotify,critical event notify begun,RID45812,RADLEX,QS2210,QSNEURO,False
232863,D003117,color vision defects,color vision defect,D000077765|D014786,MESH,QS5480,QSNEURO,False
28300,D013071,"speech, alaryngeal","productions, alaryngeal voice",D012049,MESH,QS2981,QSNEURO,False
112109,D049310,distal myopathies,udd-markesbery muscular dystrophy,D009136,MESH,QS3982,QSNEURO,False
139365,D009187,myelitis,subacute necrotizing myelitis,D013118|D002494,MESH,QS4336,QSNEURO,False
14875,D020222,abducens nerve injury,abducens nerve traumas,D020434|D020209,MESH,QS2802,QSNEURO,False
53919,D056147,equine-assisted therapy,"riding therapies, horseback",D056447,MESH,QS3285,QSNEURO,False
130059,D015840,oculomotor nerve diseases,partial third nerve palsy,D015835,MESH,QS4201,QSNEURO,False


## create relations list

In [217]:
edge_list_old_codes = []
for idx, concept in tqdm(total_onto.iterrows()):
    source_cui = concept["cui"]
    source_onto_parents = str(concept["source_ontology_parent"])
    split_parents = source_onto_parents.split("|")
    for parent_code in split_parents:
        edge_list_old_codes.append((source_cui, parent_code))

52567it [00:05, 9694.92it/s] 


In [218]:
descr_mapping = {}
for key, val in descr_dict.items():
    idx = top_radlex_layer["concept_name"].index(key)
    new_key = top_radlex_layer["source_ontology_id"][idx]
    descr_mapping[key] = {"old_keys":val["id"], "new_key":new_key}

In [219]:
path_mapping = {}
for key, val in path_dict.items():
    idx = top_mesh_layer["concept_name"].index(key)
    new_key = top_mesh_layer["source_ontology_id"][idx]
    path_mapping[key] = {"old_keys":val["id"], "new_key":new_key}

In [220]:
pd_mapping = {}
pd_mapping.update(path_mapping)
pd_mapping.update(descr_mapping)
total_mapping = {old_key:val["new_key"] for val in pd_mapping.values() for old_key in val["old_keys"]}

## Correct the top level codes

In [221]:
edge_list_qx_codes = []
for source, sink in tqdm(edge_list_old_codes):
    new_sink = sink
    if sink in total_mapping.keys():
        new_sink = total_mapping[sink]
    edge_list_qx_codes.append((source, new_sink))

100%|██████████| 86809/86809 [00:00<00:00, 796531.56it/s]


In [222]:
sources, sinks = zip(*edge_list_qx_codes)
relations = pd.DataFrame(data={"sourceId":sources, "destinationId":sinks})

In [99]:
edgedict = total_onto[["source_ontology_id", "cui"]].to_dict()
replacedict = {str(val):edgedict["cui"][key] for key, val in edgedict["source_ontology_id"].items()}

In [230]:
all_sources = total_onto["source_ontology_id"].tolist()
final_edge_list = []
for source, sink in tqdm(edge_list_qx_codes):
    if sink in all_sources:
        new_sink = total_onto[total_onto["source_ontology_id"]==sink]["cui"].values[0]
        final_edge_list.append((source, new_sink))

100%|██████████| 86809/86809 [05:18<00:00, 272.84it/s]


In [231]:
final_edge_list_unique = list(set(final_edge_list))

In [232]:
sources, sinks = zip(*final_edge_list_unique)
relations_df = pd.DataFrame(data={"sourceId":sources, "destinationId":sinks})

In [244]:
relations_df.sample(10)

Unnamed: 0,sourceId,destinationId
9210,QS5984,QS4881
3136,QS3099,QS4105
1753,QS7056,QS7055
2537,QS5533,QS5359
5431,QS1084,QS185
1187,QS4476,QS4881
1004,QS4744,QS5903
2069,QS7139,QS7137
8271,QS2815,QS5427
2547,QS5115,QS4600


In [245]:
total_onto.to_csv("qs_ontology_concepts.csv", index=False)

In [246]:
relations_df.to_csv("qs_ontology_relations.csv", index=False)

In [247]:
len(total_onto)

52567

# Remove parentheic parts from names 

In [39]:
concept_onto = "/home/hwatkins/Desktop/neuroNLP_ontologies/ontologies/qs_ontology_concepts.csv"

In [40]:
concept_df = pd.read_csv(concept_onto)

In [41]:
concept_df.head()

Unnamed: 0,source_ontology_id,preferred_name,concept_name,source_ontology_parent,source_ontology,cui,ontology,is_preferred_name
0,RID10897,curved array transducer,curved array transducer,RID10867,RADLEX,QS0,QSNEURO,1
1,RID10763,elliptical centric k-space trajectory,elliptical centric k-space trajectory,RID10758,RADLEX,QS1,QSNEURO,1
2,RID35530,shaggy heart sign,shaggy heart sign,RID29023,RADLEX,QS2,QSNEURO,1
3,RID39435,arterial phase hypoenhancement,arterial phase hypoenhancement,RID43354,RADLEX,QS3,QSNEURO,1
4,RID39435,arterial phase hypoenhancement,arterial phase hypo-enhancement,RID43354,RADLEX,QS3,QSNEURO,0


In [42]:
concept_df[concept_df["concept_name"].str.contains("(", regex=False)].head()

Unnamed: 0,source_ontology_id,preferred_name,concept_name,source_ontology_parent,source_ontology,cui,ontology,is_preferred_name
40,RID35255,half moon sign (shoulder),half moon sign (shoulder),RID29023,RADLEX,QS25,QSNEURO,1
458,RID49889,comet-tail artifact (small),comet-tail artifact (small),RID49887,RADLEX,QS319,QSNEURO,1
591,RID50580,early washout (ceus),early washout (ceus),RID50579,RADLEX,QS402,QSNEURO,1
1206,RID50581,late washout (ceus),late washout (ceus),RID50579,RADLEX,QS841,QSNEURO,1
1243,RID50579,washout (ceus),washout (ceus),RID43354,RADLEX,QS867,QSNEURO,1


In [45]:
concept_df.iloc[40]

source_ontology_id              RID35255
preferred_name            half moon sign
concept_name              half moon sign
source_ontology_parent          RID29023
source_ontology                   RADLEX
cui                                 QS25
ontology                         QSNEURO
is_preferred_name                      1
Name: 40, dtype: object

In [44]:
concept_df["concept_name"] = concept_df["concept_name"].str.replace(r"\(.*\)","",regex=True)
concept_df["preferred_name"] = concept_df["preferred_name"].str.replace(r"\(.*\)","",regex=True)
concept_df["preferred_name"] = concept_df["preferred_name"].str.strip()
concept_df["concept_name"] = concept_df["concept_name"].str.strip()

In [95]:
concept_df = concept_df.drop(concept_df[concept_df["concept_name"].str.len()<1].index)

In [46]:
concept_df.to_csv(concept_onto, index=False)

# Filter ontology
filter ontology for term that exist within the corpus of texts

In [85]:
REPORT_CSV = "/home/hwatkins/Desktop/neuroNLP_assets/data/cleaned_report_data/filtered_reports.csv"

In [129]:
report_corpus = pd.read_csv(REPORT_CSV,dtype=object)
report_corpus["report_text"] = report_corpus["report_text"].apply(str)

In [88]:
onto_file = "/home/hwatkins/Desktop/neuroNLP_ontologies/ontologies/qs_ontology_concepts.csv"
rel_file = "/home/hwatkins/Desktop/neuroNLP_ontologies/ontologies/qs_ontology_relations.csv"
onto = Ontology(onto_file, rel_file)

In [89]:
cgen = CandidateGenerator(onto)

In [118]:
cands = cgen.get_candidates(["circle of willis"])

In [119]:
for cand in cands[0]:
    print(onto[cand])

restless legs syndrome, cui: QS5163
cerebral palsy, cui: QS3546
hereditary sensory and motor neuropathy, cui: QS4305
lower body negative pressure, cui: QS4249
t1 weighted, cui: QS58


In [139]:
mega_string = " ".join(report_corpus["report_text"].sample(150000).to_list()).lower()

In [140]:
noted_codes = []
for idx, row in tqdm(concept_df.iterrows(), total=len(concept_df)):
    cui = row["cui"]
    name = row["concept_name"]
    if cui not in noted_codes:
        if name in mega_string:
            noted_codes.append(cui)

100%|██████████| 52554/52554 [34:27<00:00, 25.42it/s]  


In [141]:
len(noted_codes)

2597

In [143]:
len(concept_df["cui"].unique())

7948

In [149]:
additional_codes = []
for code in tqdm(noted_codes):
    stack = [code]
    while stack:
        branch_code = stack.pop(0)
        node = onto[branch_code]
        parent_codes = [p.cui for p in node.parents]
        additional_codes.extend(parent_codes)
        stack.extend(parent_codes)
        

100%|██████████| 2597/2597 [00:20<00:00, 128.10it/s]


In [151]:
len(set(additional_codes))

1090

In [153]:
total_codes = set(noted_codes+additional_codes)

In [154]:
len(total_codes)

3067

In [155]:
onto.get_root()

root_entity, cui: QS7989

In [158]:
filtered_concepts = concept_df[concept_df["cui"].isin(total_codes)]

In [161]:
filtered_concepts.to_csv("/home/hwatkins/Desktop/neuroNLP_ontologies/ontologies/qs_ontology_filtered_concepts.csv", index=False)
