In [3]:
import pandas as pd 
import numpy as np

In [129]:
def set_term(column1,column2):
    if not pd.isnull(column1):
        return column1
    if not pd.isnull(column2):
        return column2
    else:
        return np.nan

In [130]:
def set_term_list(column1,column2):
    if isinstance(column1,list):
        return column1
    if isinstance(column2,list):
        return column2
    else:
        return np.nan

In [131]:
def separate_id(column,idx):
    for i in column:
        if idx in i:
            column.remove(i)
            if idx == 'MESH' or idx == 'UMLS_CUI':
                return i.split(':')[-1]
            return i
    return np.nan

# Mesh Diseases

In [132]:
mesh_df = pd.read_csv('data/MESH_with_Semantic_types.csv')
mesh_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,CUI,type
0,C000624633,technetium 99m hydroxyethylene-diphosphonate,99mTc-HDP|99mTc-hydroxyethylene-diphosphonate,C0305030,Inorganic Chemical
1,D014638,Vanadates,Oxyvanadium|Sodium Vanadate|Vanadyl|Decavanada...,C0030103|C0042308|C0037568|C0029366|C0042305|C...,Inorganic Chemical
2,C031895,lead chromate,"lead chromate, Pb(2+) (1:1)|lead chromate yellow",C0064717|C0892125,Inorganic Chemical
3,C403922,bioactive glass S53P4,BAG S53P4|BAG-S53P4,C0912006,Inorganic Chemical
4,C494370,"calcium phosphate, monobasic",calcium superphosphate|CaH4O8P2|monocalcium or...,C1527438,Inorganic Chemical
...,...,...,...,...,...
87074,D007231,"Infant, Newborn","Newborn|Infants, Newborn|Newborn Infants|Neona...",C0021289,Age Group
87075,D015394,Molecular Structure,"Structures, Molecular|Molecular Structures|Str...",C0026383,Molecular Sequence
87076,D005815,Genetic Code,"Code, Genetic|Genetic Codes|Codes, Genetic",C0017380,Molecular Sequence
87077,D020296,"Animals, Congenic","Animals, Coisogenic|Animal, Coisogenic|Coisoge...",C0600529,Vertebrate


## Obtain just diseases or related Semantic types in Mesh

In [133]:
disease_types = ['Indicator, Reagent, or Diagnostic Aid',
 'Virus',
 'Bacterium',
 'Disease or Syndrome',
 'Diagnostic Procedure',
 'Congenital Abnormality',
 'Therapeutic or Preventive Procedure',
 'Pathologic Function',
 'Health Care Activity',
 'Injury or Poisoning',
 'Finding',
 'Neoplastic Process',
 'Mental or Behavioral Dysfunction',
 'Organ or Tissue Function',
 'Anatomical Abnormality',
 'Cell Function',
 'Genetic Function',
 'Phenomenon or Process',
 'Physiologic Function',
 'Sign or Symptom',
 'Mental Process',
 'Cell or Molecular Dysfunction',
 'Acquired Abnormality',
 'Experimental Model of Disease',
 'Biologic Function',
 'Behavior',
 ]

In [134]:
disease_mesh_df = mesh_df[mesh_df.type.isin(disease_types)]
disease_mesh_df

Unnamed: 0,Class ID,Preferred Label,Synonyms,CUI,type
25512,C000623720,Autographa californica multiple nuclear polyhe...,Trichoplusia ni multiple nucleopolyhedrovirus|...,C1629281,Virus
25513,C000624051,Tobacco necrosis virus A,,C1187077,Virus
25514,D000071819,Virophages,Virophage,C4277719,Virus
25515,D018059,Birnaviridae,,C0206505,Virus
25516,C000624066,Sunflower chlorotic mottle virus,,C1062582,Virus
...,...,...,...,...,...
87035,D055809,Illness Behavior,"Behaviors, Sickness|Behavior, Sickness|Behavio...",C2350572,Behavior
87061,D004195,"Disease Models, Animal","Disease Model, Animal|Animal Disease Models|An...",C0012644,Experimental Model of Disease
87062,D004681,"Encephalomyelitis, Autoimmune, Experimental","Allergic Encephalomyelitis, Experimental|Exper...",C0014072,Experimental Model of Disease
87063,D011014,Pneumonia,"Pneumonia, Lobar|Lobar Pneumonias|Lung Inflamm...",C0032300|C0032285|C3714636|C0887898,Experimental Model of Disease


In [135]:
print(disease_mesh_df.shape)

(10681, 5)


In [136]:
disease_mesh_df.rename(columns={'Class ID':'MeshID'},inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


# CTD

In [137]:
important_columns_ctd = {
    'DiseaseName',
    'DiseaseID',
    'AltDiseaseIDs',
    'Synonyms',
    'SlimMappings'
}

In [138]:
ctd_disease_df = pd.read_csv(
    "data/Disease/CTD_diseases.csv",      # relative python path to subdirectory
    sep=',',
    usecols=important_columns_ctd,
    dtype=str
)

print(ctd_disease_df.shape)
ctd_disease_df.head()

(13102, 5)


Unnamed: 0,DiseaseName,DiseaseID,AltDiseaseIDs,Synonyms,SlimMappings
0,10p Deletion Syndrome (Partial),MESH:C538288,,"Chromosome 10, 10p- Partial|Chromosome 10, mon...",Congenital abnormality|Genetic disease (inborn...
1,13q deletion syndrome,MESH:C535484,,Chromosome 13q deletion|Chromosome 13q deletio...,Congenital abnormality|Genetic disease (inborn...
2,15q24 Microdeletion,MESH:C579849,DO:DOID:0060395,15q24 Deletion|15q24 Microdeletion Syndrome|In...,Congenital abnormality|Genetic disease (inborn...
3,16p11.2 Deletion Syndrome,MESH:C579850,,,Congenital abnormality|Genetic disease (inborn...
4,"17,20-Lyase Deficiency, Isolated",MESH:C567076,,"17-Alpha-Hydroxylase-17,20-Lyase Deficiency, C...",Congenital abnormality|Endocrine system diseas...


## Split mixed IDs in Mesh ID and others

In [139]:
ctd_disease_df['MeshID'] = ctd_disease_df['DiseaseID'].apply(lambda x: x if x.startswith('MESH') else '')

In [140]:
ctd_disease_df['Cross_reference'] = ctd_disease_df['DiseaseID'].apply(lambda x: x if not x.startswith('MESH') else '')

In [141]:
ctd_disease_df['MeshID'] = ctd_disease_df['MeshID'].apply(lambda x: x.split(':')[-1])

In [142]:
ctd_disease_df['AltDiseaseIDs'] = ctd_disease_df['AltDiseaseIDs'].apply(lambda x: [f for f in x.split('|')] if isinstance(x,str) else np.nan)

In [143]:
ctd_disease_df['Cross_reference'] = ctd_disease_df['Cross_reference'].replace('',np.nan)

In [144]:
ctd_disease_df['Cross_reference'] = ctd_disease_df['Cross_reference'].apply(lambda x: [x] if isinstance(x,str) else [])

In [145]:
ctd_disease_df['Cross_reference'] = ctd_disease_df['Cross_reference'] + ctd_disease_df['AltDiseaseIDs']

In [146]:
ctd_disease_df.drop(['DiseaseID','AltDiseaseIDs'], axis=1, inplace=True)

In [147]:
ctd_disease_df

Unnamed: 0,DiseaseName,Synonyms,SlimMappings,MeshID,Cross_reference
0,10p Deletion Syndrome (Partial),"Chromosome 10, 10p- Partial|Chromosome 10, mon...",Congenital abnormality|Genetic disease (inborn...,C538288,
1,13q deletion syndrome,Chromosome 13q deletion|Chromosome 13q deletio...,Congenital abnormality|Genetic disease (inborn...,C535484,
2,15q24 Microdeletion,15q24 Deletion|15q24 Microdeletion Syndrome|In...,Congenital abnormality|Genetic disease (inborn...,C579849,[DO:DOID:0060395]
3,16p11.2 Deletion Syndrome,,Congenital abnormality|Genetic disease (inborn...,C579850,
4,"17,20-Lyase Deficiency, Isolated","17-Alpha-Hydroxylase-17,20-Lyase Deficiency, C...",Congenital abnormality|Endocrine system diseas...,C567076,
...,...,...,...,...,...
13097,Zunich neuroectodermal syndrome,"CHIME|CHIME syndrome|COLOBOMA, CONGENITAL HEAR...",Cardiovascular disease|Congenital abnormality|...,C536729,"[DO:DOID:0112152, OMIM:280000]"
13098,Zuska's Disease,Lactation and squamous metaplasia of lactifero...,Pathology (anatomical condition)|Pathology (pr...,C536730,
13099,Zygodactyly 1,ZD1,Congenital abnormality|Musculoskeletal disease,C565223,[OMIM:609815]
13100,Zygomatic Fractures,"Fractures, Zygomatic|Fracture, Zygomatic|Zygom...",Nervous system disease|Wounds and injuries,D015051,


# Merge MESH and CTD vocabs

In [148]:
df_merge = disease_mesh_df.merge(ctd_disease_df,on='MeshID',how='outer')

In [149]:
df_merge.sample(10)

Unnamed: 0,MeshID,Preferred Label,Synonyms_x,CUI,type,DiseaseName,Synonyms_y,SlimMappings,Cross_reference
8862,C564484,"Holoprosencephaly, Ectrodactyly, and Bilateral...",Hartsfield Syndrome,C1845146,Congenital Abnormality,"Holoprosencephaly, Ectrodactyly, and Bilateral...",Hartsfield Syndrome,Congenital abnormality|Genetic disease (inborn...,
4430,C000644507,Novosphingobium malaysiense,,C3907324,Bacterium,,,,
8409,C015011,azidonitrophenylaminobutyryl-ADP,azidonitrophenylaminobutyryl-adenosine diphosp...,C0052782,"Indicator, Reagent, or Diagnostic Aid",,,,
18010,C536052,,,,,Osteolysis syndrome recessive,"Osteolysis, distal, with short stature, mental...",Mental disorder|Musculoskeletal disease|Nervou...,
4935,C000647614,Actinoplanes atraurantiacus,,C3557927,Bacterium,,,,
13080,D001478,,,,,Basal Cell Nevus Syndrome,BCNS|Fifth Phacomatoses|Fifth Phacomatosis|Gor...,Cancer|Congenital abnormality|Genetic disease ...,"[DO:DOID:2512, OMIM:109400]"
2766,C000649083,Aliidiomarina soli,,C4443217,Bacterium,,,,
16945,D008312,,,,,"Malocclusion, Angle Class II","Angle Class II|Angle Class II, Division 1|Angl...",Mouth disease,
14186,C564509,,,,,"Corpus Callosum, Agenesis of, with Mental Reta...","MENTAL RETARDATION, X-LINKED, SYNDROMIC 28|MRXS28",Congenital abnormality|Eye disease|Mental diso...,[OMIM:300472]
1832,C000646641,Kibdelosporangium aridum subsp. aridum,,C5227030,Bacterium,,,,


## Clean resultant DataFrame

In [150]:
df_merge['Term'] = df_merge.apply(lambda x: set_term(x['Preferred Label'],x.DiseaseName),axis=1)

In [151]:
df_merge.drop(['DiseaseName','Preferred Label'],axis=1, inplace=True)

In [152]:
df_merge['Synonyms'] = df_merge.apply(lambda x: set_term(x.Synonyms_x,x.Synonyms_y),axis=1)

In [153]:
df_merge.drop(['Synonyms_x','Synonyms_y'],axis=1, inplace=True)

In [154]:
df_merge['type'] = df_merge.apply(lambda x: set_term(x.SlimMappings,x.type),axis=1)

In [155]:
df_merge.drop(['SlimMappings'],axis=1, inplace=True)

In [156]:
df_merge

Unnamed: 0,MeshID,CUI,type,Cross_reference,Term,Synonyms
0,C000623720,C1629281,Virus,,Autographa californica multiple nuclear polyhe...,Trichoplusia ni multiple nucleopolyhedrovirus|...
1,C000624051,C1187077,Virus,,Tobacco necrosis virus A,
2,D000071819,C4277719,Virus,,Virophages,Virophage
3,D018059,C0206505,Virus,,Birnaviridae,
4,C000624066,C1062582,Virus,,Sunflower chlorotic mottle virus,
...,...,...,...,...,...,...
20430,C536728,,Congenital abnormality|Musculoskeletal disease...,,Zori Stalker Williams syndrome,"Familial short stature, developmental delay, p..."
20431,D031368,,Viral disease,,Zoster Sine Herpete,Zoster Sine Eruptione
20432,C536729,,Cardiovascular disease|Congenital abnormality|...,"[DO:DOID:0112152, OMIM:280000]",Zunich neuroectodermal syndrome,"CHIME|CHIME syndrome|COLOBOMA, CONGENITAL HEAR..."
20433,C565223,,Congenital abnormality|Musculoskeletal disease,[OMIM:609815],Zygodactyly 1,ZD1


# DOID

In [7]:
important_columns_doid = {
    'Preferred Label',
    'Synonyms',
    'Obsolete',
    'database_cross_reference',
    'Class ID'
}

In [8]:
doid_df = pd.read_csv(
    "data/Disease/DOID.csv",      # relative python path to subdirectory
    sep=',',
    usecols=important_columns_doid,
    dtype=str
)

print(doid_df.shape)
doid_df.head()

(17454, 5)


Unnamed: 0,Class ID,Preferred Label,Synonyms,Obsolete,database_cross_reference
0,http://purl.obolibrary.org/obo/DOID_8986,narcolepsy,"Narcolepsy, without cataplexy|paroxysmal sleep",False,EFO:0000614|MESH:D009290|GARD:7162|ICD10CM:G47...
1,http://purl.obolibrary.org/obo/DOID_7233,adult central nervous system embryonal carcinoma,Embryonal carcinoma of the adult central nervo...,False,NCI:C5790|UMLS_CUI:C1370503
2,http://purl.obolibrary.org/obo/HP_0011138,Abnormality of skin adnexa morphology,,False,
3,http://purl.obolibrary.org/obo/DOID_5236,subungual glomus tumor,Subungual Glomus tumour|Subungual Glomus tumor...,False,UMLS_CUI:C1304510|NCI:C36079|SNOMEDCT_US_2020_...
4,http://purl.obolibrary.org/obo/DOID_1934,dysostosis,,False,SNOMEDCT_US_2020_09_01:109420003|NCI:C34560|ME...


In [9]:
doid_df['id'] = doid_df['Class ID'].apply(lambda x: x.split('/')[-1].replace('_',':'))
doid_df = doid_df.drop()

In [10]:
doid_df.loc[doid_df['Preferred Label'] == 'Severe acute respiratory syndrome coronavirus 2']

Unnamed: 0,Class ID,Preferred Label,Synonyms,Obsolete,database_cross_reference,id
9418,http://purl.obolibrary.org/obo/NCBITaxon_2697049,Severe acute respiratory syndrome coronavirus 2,SARS2|Human coronavirus 2019|Wuhan seafood mar...,False,,NCBITaxon:2697049


## Clean DataFrame

In [159]:
doid_df = doid_df[doid_df['Obsolete']=='false']
print(doid_df.shape)

(15002, 5)


In [160]:
doid_df = doid_df.drop(['Obsolete'],axis=1)

In [161]:
doid_df['database_cross_reference'] = doid_df['database_cross_reference'].apply(lambda x: [i for i in x.split('|')] if isinstance(x,str) else np.nan)

In [162]:
doid_df

Unnamed: 0,Preferred Label,Synonyms,database_cross_reference,id
0,narcolepsy,"Narcolepsy, without cataplexy|paroxysmal sleep","[EFO:0000614, MESH:D009290, GARD:7162, ICD10CM...",DOID:8986
1,adult central nervous system embryonal carcinoma,Embryonal carcinoma of the adult central nervo...,"[NCI:C5790, UMLS_CUI:C1370503]",DOID:7233
2,Abnormality of skin adnexa morphology,,,HP:0011138
3,subungual glomus tumor,Subungual Glomus tumour|Subungual Glomus tumor...,"[UMLS_CUI:C1304510, NCI:C36079, SNOMEDCT_US_20...",DOID:5236
4,dysostosis,,"[SNOMEDCT_US_2020_09_01:109420003, NCI:C34560,...",DOID:1934
...,...,...,...,...
17448,keratosis,,"[UMLS_CUI:C0022593, MESH:D007642, SNOMEDCT_US_...",DOID:161
17450,coloboma of optic nerve,Coloboma of optic disc|Morning glory syndrome,"[OMIM:120430, ICD9CM:377.23, GARD:13354, GARD:...",DOID:11975
17451,acute gonococcal salpingitis,"Gonococcal salpingitis, specified as acute","[ICD9CM:098.17, SNOMEDCT_US_2020_09_01:4537700...",DOID:13942
17452,atrial heart septal defect 4,ASD4|atrial septal defect 4,"[ICD10CM:Q21.1, OMIM:611363]",DOID:0110109


## Extract Mesh and CUI IDs

In [163]:
doid_df['MeshID'] = doid_df.apply(lambda x: separate_id(x.database_cross_reference,'MESH') if isinstance(x.database_cross_reference,list) else np.nan,axis=1)

In [164]:
doid_df['CUI'] = doid_df.apply(lambda x: separate_id(x.database_cross_reference,'UMLS_CUI') if isinstance(x.database_cross_reference,list) else np.nan,axis=1)

In [165]:
doid_df

Unnamed: 0,Preferred Label,Synonyms,database_cross_reference,id,MeshID,CUI
0,narcolepsy,"Narcolepsy, without cataplexy|paroxysmal sleep","[EFO:0000614, GARD:7162, ICD10CM:G47.41, OMIM:...",DOID:8986,D009290,C0027404
1,adult central nervous system embryonal carcinoma,Embryonal carcinoma of the adult central nervo...,[NCI:C5790],DOID:7233,,C1370503
2,Abnormality of skin adnexa morphology,,,HP:0011138,,
3,subungual glomus tumor,Subungual Glomus tumour|Subungual Glomus tumor...,"[NCI:C36079, SNOMEDCT_US_2020_09_01:403973004]",DOID:5236,,C1304510
4,dysostosis,,"[SNOMEDCT_US_2020_09_01:109420003, NCI:C34560]",DOID:1934,D004413,C0013393
...,...,...,...,...,...,...
17448,keratosis,,"[SNOMEDCT_US_2020_09_01:254666005, NCI:C34747]",DOID:161,D007642,C0022593
17450,coloboma of optic nerve,Coloboma of optic disc|Morning glory syndrome,"[OMIM:120430, ICD9CM:377.23, GARD:13354, GARD:...",DOID:11975,C535970,C0155299
17451,acute gonococcal salpingitis,"Gonococcal salpingitis, specified as acute","[ICD9CM:098.17, SNOMEDCT_US_2020_09_01:45377007]",DOID:13942,,C0275654
17452,atrial heart septal defect 4,ASD4|atrial septal defect 4,"[ICD10CM:Q21.1, OMIM:611363]",DOID:0110109,,


# Merge DOID with CTD and MESH

In [166]:
df1 = df_merge.merge(doid_df, on='MeshID', how='outer')

In [167]:
df1

Unnamed: 0,MeshID,CUI_x,type,Cross_reference,Term,Synonyms_x,Preferred Label,Synonyms_y,database_cross_reference,id,CUI_y
0,C000623720,C1629281,Virus,,Autographa californica multiple nuclear polyhe...,Trichoplusia ni multiple nucleopolyhedrovirus|...,,,,,
1,C000624051,C1187077,Virus,,Tobacco necrosis virus A,,,,,,
2,D000071819,C4277719,Virus,,Virophages,Virophage,,,,,
3,D018059,C0206505,Virus,,Birnaviridae,,,,,,
4,C000624066,C1062582,Virus,,Sunflower chlorotic mottle virus,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
32276,C537935,,,,,,lattice corneal dystrophy,"familial amyloid neuropathy, Finnish type",[OMIM:122200],DOID:8943,
32277,D003861,,,,,,depersonalization disorder,Neurotic derealization,"[NCI:C94331, SNOMEDCT_US_2020_09_01:70764005, ...",DOID:11038,C0683416
32278,D003863,,,,,,mental depression,,"[ICD10CM:F32.9, SNOMEDCT_US_2020_09_01:4100600...",DOID:1596,C0011570
32279,C537330,,,,,,Shwachman-Diamond syndrome,Shwachman-Diamond type metaphyseal dysplasia|p...,"[OMIM:260400, GARD:4863, ICD10CM:D61.0]",DOID:0060479,


## Cleaning resultant merge

In [168]:
len(df1[df1['CUI_x'].str.len()>0])

10778

In [169]:
len(df1[df1['CUI_y'].str.len()>0])

6260

In [170]:
df1['CUI'] = df1.apply(lambda x: set_term(x.CUI_x,x.CUI_y),axis=1)

In [171]:
len(df1[df1['CUI'].str.len()>0])

16078

In [172]:
df1['Synonyms_x'] = df1['Synonyms_x'].apply(lambda x: [i for i in x.split('|')] if isinstance(x,str) else np.nan)

In [173]:
df1['Synonyms_y'] = df1['Synonyms_y'].apply(lambda x: [i for i in x.split('|')] if isinstance(x,str) else np.nan)

In [174]:
df1['Synonyms_x'] = df1['Synonyms_x'].apply(lambda x: x if isinstance(x,list) else [])
df1['Synonyms_y'] = df1['Synonyms_y'].apply(lambda x: x if isinstance(x,list) else [])

In [175]:
df1['Synonyms'] = df1['Synonyms_x'] + df1['Synonyms_y']

In [176]:
df1['Synonyms'] = df1['Synonyms'].apply(lambda x: x if len(x)> 0 else np.nan )

In [177]:
df1['Term'] = df1.apply(lambda x: set_term(x.Term,x['Preferred Label']),axis=1)

In [178]:
df1.drop(['CUI_x','Synonyms_x','Preferred Label','Synonyms_y','CUI_y'],axis=1,inplace=True)

In [179]:
df1

Unnamed: 0,MeshID,type,Cross_reference,Term,database_cross_reference,id,CUI,Synonyms
0,C000623720,Virus,,Autographa californica multiple nuclear polyhe...,,,C1629281,[Trichoplusia ni multiple nucleopolyhedrovirus...
1,C000624051,Virus,,Tobacco necrosis virus A,,,C1187077,
2,D000071819,Virus,,Virophages,,,C4277719,[Virophage]
3,D018059,Virus,,Birnaviridae,,,C0206505,
4,C000624066,Virus,,Sunflower chlorotic mottle virus,,,C1062582,
...,...,...,...,...,...,...,...,...
32276,C537935,,,lattice corneal dystrophy,[OMIM:122200],DOID:8943,,"[familial amyloid neuropathy, Finnish type]"
32277,D003861,,,depersonalization disorder,"[NCI:C94331, SNOMEDCT_US_2020_09_01:70764005, ...",DOID:11038,C0683416,[Neurotic derealization]
32278,D003863,,,mental depression,"[ICD10CM:F32.9, SNOMEDCT_US_2020_09_01:4100600...",DOID:1596,C0011570,
32279,C537330,,,Shwachman-Diamond syndrome,"[OMIM:260400, GARD:4863, ICD10CM:D61.0]",DOID:0060479,,"[Shwachman-Diamond type metaphyseal dysplasia,..."


In [180]:
df1['database_cross_reference'] = df1['database_cross_reference'].apply(lambda x: x if isinstance(x, list) else [])

In [181]:
df1['Cross_reference'] = df1['Cross_reference'].apply(lambda x: [i for i in x.split('|')] if isinstance(x,str) else [])

In [182]:
df1['id'] = df1['id'].apply(lambda x: [x] if isinstance(x,str) else [])

In [183]:
df1['cross_reference'] = df1['database_cross_reference'] + df1['Cross_reference'] + df1['id']

In [184]:
df1['cross_reference'] = df1['cross_reference'].apply(lambda x: np.nan if x ==[] else x)

In [185]:
df1.drop(['database_cross_reference','id','Cross_reference'],axis=1,inplace=True)

In [186]:
df1.sample(10)

Unnamed: 0,MeshID,type,Term,CUI,Synonyms,cross_reference
9057,D000281,Therapeutic or Preventive Procedure,"Administration, Intranasal",C0001560,"[Intranasal Administration, Nasal Administrati...",
9470,C406583,Therapeutic or Preventive Procedure,VBVP protocol,C0913474,,
12624,D000381,Nervous system disease|Signs and symptoms,Agraphia,C0001825,"[Acquired Agraphia, Acquired Agraphias, Acquir...","[SNOMEDCT_US_2020_09_01:27206009, ICD10CM:R48...."
3838,C000646558,Bacterium,Marinobacterium jannaschii,C1025023,[Oceanospirillum jannaschii],
31435,,,thoracic cavity element,,,[UBERON:0005178]
524,C000653481,Bacterium,Anabaenopsis circinalis,C5227494,,
12798,D000712,Animal disease|Bacterial infection or mycosis,Anaplasmosis,,"[Anaplasma Infection, Anaplasma Infections, An...",
31999,,,3-methylglutaconic aciduria type 3,,"[Iraqi-Jewish optic atrophy plus, autosomal re...","[OMIM:258501, ORDO:67047, DOID:0110004]"
6054,C537472,Genetic disease (inborn)|Nervous system disease,Miles-Carpenter x-linked mental retardation sy...,C1839735,"[Mental Retardation, X-Linked, with Congenital...","[ORDO:85283, OMIM:314580, DOID:0060815]"
6905,C542540,Genetic disease (inborn)|Nervous system disease,Spinocerebellar ataxia 22,C2746067,,


# ICD 10 Ontology

In [187]:
important_columns_icd = ['Class ID','Preferred Label','Synonyms','Obsolete','CUI','Semantic Types']

In [188]:
icd_df = pd.read_csv(
    "data/Disease/ICD10CM.csv",      # relative python path to subdirectory
    sep=',',
    usecols=important_columns_icd,
    dtype=str
)

print(icd_df.shape)
icd_df.head()

(95798, 6)


Unnamed: 0,Class ID,Preferred Label,Synonyms,Obsolete,CUI,Semantic Types
0,http://purl.bioontology.org/ontology/ICD10CM/T...,"Underdosing of stimulant laxatives, subsequent...",,False,C2879738,http://purl.bioontology.org/ontology/STY/T037
1,http://purl.bioontology.org/ontology/ICD10CM/T...,"Underdosing of histamine H2-receptor blockers,...",,False,C2879688,http://purl.bioontology.org/ontology/STY/T037
2,http://purl.bioontology.org/ontology/ICD10CM/J...,Acute and chronic respiratory failure,Acute on chronic respiratory failure,False,C0264491,http://purl.bioontology.org/ontology/STY/T047
3,http://purl.bioontology.org/ontology/ICD10CM/G...,Neuromyelitis optica [Devic],Demyelination in optic neuritis,False,C0027873|C1395170,http://purl.bioontology.org/ontology/STY/T047
4,http://purl.bioontology.org/ontology/ICD10CM/S...,Dislocation of interphalangeal joint of unspec...,,False,C2868672,http://purl.bioontology.org/ontology/STY/T037


In [189]:
icd_df = icd_df[icd_df['Obsolete']=='false']
icd_df.drop('Obsolete',inplace=True,axis=1)
print(icd_df.shape)

(95798, 5)


In [190]:
icd_df['ICD_id'] = icd_df['Class ID'].apply(lambda x: x.split('/')[-1])
icd_df.drop('Class ID',inplace=True,axis=1)

In [191]:
icd_df['Semantic Types'] = icd_df['Semantic Types'].apply(lambda x: x.split('/')[-1] if isinstance(x,str) else np.nan)

In [192]:
icd_df

Unnamed: 0,Preferred Label,Synonyms,CUI,Semantic Types,ICD_id
0,"Underdosing of stimulant laxatives, subsequent...",,C2879738,T037,T47.2X6D
1,"Underdosing of histamine H2-receptor blockers,...",,C2879688,T037,T47.0X6D
2,Acute and chronic respiratory failure,Acute on chronic respiratory failure,C0264491,T047,J96.2
3,Neuromyelitis optica [Devic],Demyelination in optic neuritis,C0027873|C1395170,T047,G36.0
4,Dislocation of interphalangeal joint of unspec...,,C2868672,T037,S93.119
...,...,...,...,...,...
95793,Incomplete lesion of L1 level of lumbar spinal...,Incomplete lesion of lumbar spinal cord level 1,C2838665,T037,S34.121
95794,"Nondisplaced fracture of neck of scapula, righ...",,C2840799,T037,S42.154A
95795,"Poisoning by other viral vaccines, assault",,C2883915,T037,T50.B93
95796,"Other shellfish poisoning, undetermined, subse...",,C2885182,T037,T61.784D


## Add semantic type

In [193]:
semantic_type_df = pd.read_csv(
    "data/STY.csv",      # relative python path to subdirectory
    sep=',',
    dtype=str,
    usecols=['Class ID','Preferred Label']
)
semantic_type_df

Unnamed: 0,Class ID,Preferred Label
0,http://purl.bioontology.org/ontology/STY/T057,Occupational Activity
1,http://purl.bioontology.org/ontology/STY/T047,Disease or Syndrome
2,http://purl.bioontology.org/ontology/STY/T167,Substance
3,http://purl.bioontology.org/ontology/STY/T066,Machine Activity
4,http://purl.bioontology.org/ontology/STY/T184,Sign or Symptom
...,...,...
122,http://purl.bioontology.org/ontology/STY/T194,Archaeon
123,http://purl.bioontology.org/ontology/STY/T012,Bird
124,http://purl.bioontology.org/ontology/STY/T087,Amino Acid Sequence
125,http://purl.bioontology.org/ontology/STY/T122,Biomedical or Dental Material


In [194]:
semantic_type_df.rename(columns={'Class ID': 'Semantic Types','Preferred Label':'type'}, inplace=True)

In [195]:
semantic_type_df['Semantic Types'] = semantic_type_df['Semantic Types'].apply(lambda x: x.split('/')[-1])

In [196]:
semantic_type_df

Unnamed: 0,Semantic Types,type
0,T057,Occupational Activity
1,T047,Disease or Syndrome
2,T167,Substance
3,T066,Machine Activity
4,T184,Sign or Symptom
...,...,...
122,T194,Archaeon
123,T012,Bird
124,T087,Amino Acid Sequence
125,T122,Biomedical or Dental Material


In [197]:
icd_df = pd.merge(icd_df,semantic_type_df,on='Semantic Types',how='inner')

In [198]:
icd_df.drop('Semantic Types',inplace=True,axis=1)

In [199]:
icd_df['Synonyms'] = icd_df['Synonyms'].apply(lambda x: [f for f in x.split('|')] if isinstance(x,str) else np.nan)

In [200]:
icd_df

Unnamed: 0,Preferred Label,Synonyms,CUI,ICD_id,type
0,"Underdosing of stimulant laxatives, subsequent...",,C2879738,T47.2X6D,Injury or Poisoning
1,"Underdosing of histamine H2-receptor blockers,...",,C2879688,T47.0X6D,Injury or Poisoning
2,Dislocation of interphalangeal joint of unspec...,,C2868672,S93.119,Injury or Poisoning
3,Pedestrian on skateboard injured in collision ...,,C2891755,V03.12XA,Injury or Poisoning
4,Water transport accidents (V90-V94),,C4721415,V90-V94,Injury or Poisoning
...,...,...,...,...,...
95666,Hostility,,C0020039,R45.5,Mental Process
95667,Adjustment disorders,"[Culture shock, Hospitalism in children, Grief...",C0221521|C0018235|C0865406|C0001546,F43.2,Mental Process
95668,Transsexualism,"[Gender dysphoria in adolescents and adults, G...",C4268305|C4237136|C0040765,F64.0,Organism Attribute
95669,Transvestic fetishism,"[Fetishistic transvestism, Transvestic disorder]",C0040774|C4237450,F65.1,Organism Attribute


# Merge all ontologies

In [201]:
df = pd.merge(icd_df,df1,on='CUI',how='outer')

In [202]:
df

Unnamed: 0,Preferred Label,Synonyms_x,CUI,ICD_id,type_x,MeshID,type_y,Term,Synonyms_y,cross_reference
0,"Underdosing of stimulant laxatives, subsequent...",,C2879738,T47.2X6D,Injury or Poisoning,,,,,
1,"Underdosing of histamine H2-receptor blockers,...",,C2879688,T47.0X6D,Injury or Poisoning,,,,,
2,Dislocation of interphalangeal joint of unspec...,,C2868672,S93.119,Injury or Poisoning,,,,,
3,Pedestrian on skateboard injured in collision ...,,C2891755,V03.12XA,Injury or Poisoning,,,,,
4,Water transport accidents (V90-V94),,C4721415,V90-V94,Injury or Poisoning,,,,,
...,...,...,...,...,...,...,...,...,...,...
126568,,,C0275654,,,,,acute gonococcal salpingitis,"[Gonococcal salpingitis, specified as acute]","[ICD9CM:098.17, SNOMEDCT_US_2020_09_01:4537700..."
126569,,,C0035404,,,D012203,,Rh isoimmunization,[Rh incompatibility affecting management of mo...,"[NCI:C113150, SNOMEDCT_US_2020_09_01:199580004..."
126570,,,C0683416,,,D003861,,depersonalization disorder,[Neurotic derealization],"[NCI:C94331, SNOMEDCT_US_2020_09_01:70764005, ..."
126571,,,C0011570,,,D003863,,mental depression,,"[ICD10CM:F32.9, SNOMEDCT_US_2020_09_01:4100600..."


In [203]:
df['Term'] = df.apply(lambda x: set_term(x['Preferred Label'],x.Term),axis=1)

In [204]:
df['Synonyms'] = df.apply(lambda x: set_term_list(x.Synonyms_x,x.Synonyms_y),axis=1)

In [205]:
df['type'] = df.apply(lambda x: set_term(x.type_y,x.type_x),axis=1)

In [206]:
df.drop(['Preferred Label','Synonyms_x','Synonyms_y','type_y','type_x'],inplace=True,axis=1)

In [208]:
df['CUI'] = df['CUI'].apply(lambda x: [f for f in x.split('|')] if isinstance(x,str) else np.nan)

In [211]:
df.sample(10)

Unnamed: 0,CUI,ICD_id,MeshID,Term,cross_reference,Synonyms,type
36259,[C2865770],S89.12,,Salter-Harris Type II physeal fracture of lowe...,,,Injury or Poisoning
55736,[C2864761],S85.149,,"Laceration of anterior tibial artery, unspecif...",,,Injury or Poisoning
95767,"[C0520476, C0030392]",,D010224,"Parainfluenza Virus 3, Human",,"[Parainfluenza Virus Type 3, Para Influenza Vi...",Virus
43364,[C2845567],S52.321,,Displaced transverse fracture of shaft of righ...,,,Injury or Poisoning
52862,[C2870568],T20.49XD,,Corrosion of unspecified degree of multiple si...,,,Injury or Poisoning
62342,[C0160950],S70.0,,Contusion of hip,,,Injury or Poisoning
97103,"[C0995372, C1011953]",,D042062,Beijerinckiaceae,,[Beijerinckia],Bacterium
74584,[C2883138],I82.549,,Chronic embolism and thrombosis of unspecified...,,,Disease or Syndrome
109808,,,D004753,"Enteritis, Transmissible, of Turkeys",,"[Bluecomb of Turkeys, Transmissible Enteritis ...",Animal disease|Viral disease
47515,[C2845570],S52.321C,,Displaced transverse fracture of shaft of righ...,,,Injury or Poisoning


In [212]:
df.count()

CUI                110370
ICD_id              95688
MeshID              21019
Term               126573
cross_reference     13858
Synonyms            25578
type               115515
dtype: int64

# To JSON Disease document

In [213]:
df.fillna('',inplace=True)

In [214]:
documents = []
num = 0;
for i in df.itertuples(index=False):
    disease_document = {}
    disease_document['term'] = i[3]
    disease_document['synonyms']= i[5]
    disease_document['mesh_id']= i[2]
    disease_document['cui']= i[0]
    disease_document['ICD10_id'] = i[1]
    disease_document['semantic_type']= i[6]
    disease_document['cross_references']= i[4]   
    documents.append(disease_document) 
    num+=1
    if num%10000 == 0:
        print(num/len(df)*100,'% diseases processed')

7.900579112448942 % diseases processed
15.801158224897884 % diseases processed
23.701737337346827 % diseases processed
31.60231644979577 % diseases processed
39.50289556224471 % diseases processed
47.403474674693655 % diseases processed
55.30405378714259 % diseases processed
63.20463289959154 % diseases processed
71.10521201204048 % diseases processed
79.00579112448942 % diseases processed
86.90637023693837 % diseases processed
94.80694934938731 % diseases processed


In [217]:
len(documents)

126573

In [218]:
documents[537]

{'term': 'Contusion of ovary, unilateral, sequela',
 'synonyms': '',
 'mesh_id': '',
 'cui': ['C2839692'],
 'ICD10_id': 'S37.421S',
 'semantic_type': 'Injury or Poisoning',
 'cross_references': ''}

## Save and load resultant document

In [219]:
with open('data/Disease/diseases.json', 'w') as fout:
    json.dump(documents, fout)

In [220]:
with open('data/Disease/diseases.json', 'r') as fout:
    diseases = json.loads(fout.read())
    print(len(diseases))

126573
