In [1]:
%reset -f

In [2]:
import pandas as pd
from sssom.parsers import parse_sssom_table
from sssom import compare_dataframes
from sssom.parsers import split_dataframe
from sssom.util import MappingSetDataFrame
from os.path import join
from oaklib import OntologyResource
from oaklib.implementations import SqlImplementation
import textdistance

In [3]:
lexmatch_file = "../mappings/mondo-sources-all-lexical.sssom.tsv"
mondo_sssom = "../ontology/tmp/mondo.sssom.tsv"
dir_name = "dataframes"

In [4]:
# Functions

def add_distance(df, col_name, txt_dist_pkg):
    df\
    .insert(\
            len(df.columns),\
            col_name,\
            df.apply\
            (\
                lambda x: txt_dist_pkg\
                 (\
                    x.subject_label.lower(), x.object_label.lower()\
                    if pd.notnull(x.object_label) else "99"\
                 ),\
                axis=1,\
            )\
           )

def print_prefixes(df):
    object_prefixes = df['object_id'].str.split(':').apply(lambda x: x[0] ).drop_duplicates()
    subject_prefixes = df['subject_id'].str.split(':').apply(lambda x: x[0] ).drop_duplicates()
    predicate_ids = df['predicate_id'].drop_duplicates()

    print(f"subject_prefixes:\n {subject_prefixes} \n \
          object_prefixes:\n {object_prefixes} \n \
          predicate_ids:\n {predicate_ids} ")

    
def flip_predicate(predicate_id):
    flip_dict = {
        "skos:closeMatch": "skos:relatedMatch",
        "skos:relatedMatch": "skos:closeMatch",
        "skos:narrowMatch" : "skos:broadMatch",
        "skos:broadMatch" : "skos:narrowMatch",
        "skos:exactMatch" : "skos:exactMatch"
    }
    
    return flip_dict[predicate_id]

def compare_and_comment_df(mondo_df, lex_df):
    df = compare_dataframes(mondo_df, lex_df).combined_dataframe
    df['comment'] = df['comment'].str.replace('UNIQUE_1', "MONDO_MAPPINGS")
    df['comment'] = df['comment'].str.replace('UNIQUE_2', "LEXMATCH")
    return df

def get_unmapped_df(comparison_df):
    mappings = ["LEXMATCH", "MONDO_MAPPINGS"]
    unmapped_df = comparison_df[
        (comparison_df['comment'].str.contains("|".join(mappings)))
    ]
    return unmapped_df

def export_unmatched_exact(unmapped_df, match_type, fn):
    # Determine which list to filter off of.
    if match_type == "LEXMATCH":
        filter_list = in_lex_but_not_mondo_list
    else:
        filter_list = in_mondo_but_not_lex_list
        
    unmapped_exact = unmapped_df[(unmapped_df['comment'] == match_type) & (unmapped_df['predicate_id'] == 'skos:exactMatch')]
    unmapped_exact = unmapped_exact[unmapped_exact['object_id'].str.contains("|".join(filter_list))]
    unmapped_exact.to_csv(join(dir_name, fn), sep='\t', index = False)
    return unmapped_exact.head()

def make_msdf(comparison_df, prefix_map, meta):
    combined_msdf = MappingSetDataFrame(df=comparison_df, prefix_map=prefix_map, metadata=meta)
    return combined_msdf

def mapped_curie_list(df):
    all_id_df = pd.DataFrame(pd.concat([df['subject_id'], df['object_id']]))\
                .drop_duplicates(ignore_index=True)
    all_id_df = all_id_df[~all_id_df[0].str.startswith('MONDO')]
    return all_id_df[0].to_list()


In [5]:
%%time
msdf_lex = parse_sssom_table(lexmatch_file)
msdf_mondo = parse_sssom_table(mondo_sssom)

# Use OAK to get 'object_label'
ontology_resource = OntologyResource(slug='../ontology/tmp/merged.db', local=True)
oi = SqlImplementation(ontology_resource)

CPU times: user 3min 22s, sys: 2.74 s, total: 3min 25s
Wall time: 3min 27s


In [6]:
%%time
all_lex_ids = mapped_curie_list(msdf_lex.df)
all_mondo_ids = mapped_curie_list(msdf_mondo.df)

in_lex_but_not_mondo_list = [x for x in all_lex_ids if x not in all_mondo_ids]
in_mondo_but_not_lex_list = [x for x in all_mondo_ids if x not in all_lex_ids]


In [7]:
print_prefixes(msdf_mondo.df)

subject_prefixes:
 0    MONDO
Name: subject_id, dtype: object 
           object_prefixes:
 0           SCTID
1            MESH
2            DOID
3        Orphanet
4            NCIT
5            UMLS
11         OMIMPS
51        ICD10CM
172          OMIM
604        MedDRA
657      ICD10WHO
25506      MEDGEN
Name: object_id, dtype: object 
           predicate_ids:
 0      skos:exactMatch
111    skos:broadMatch
Name: predicate_id, dtype: object 


In [8]:
%%time
# msdf_mondo.df[msdf_mondo.df['object_id'].str.contains('ICD')]
# "ICD10CM", "MONDO"
# msdf_mondo.df = msdf_mondo.df[(condition_1 & condition_2) | (condition_3 & condition_4)]
# msdf_mondo.df['object_label'] = msdf_mondo.df['object_id'].apply(lambda x: oi.label(x))

msdf_mondo.df['object_label'] = msdf_mondo.df['object_id'].apply(lambda x: oi.label(x))
msdf_mondo.df.head()


CPU times: user 1min 14s, sys: 5.16 s, total: 1min 19s
Wall time: 1min 20s


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label
0,MONDO:0000001,disease or disorder,skos:exactMatch,SCTID:64572001,Unspecified,
1,MONDO:0000001,disease or disorder,skos:exactMatch,MESH:D004194,Unspecified,
2,MONDO:0000001,disease or disorder,skos:exactMatch,DOID:4,Unspecified,disease
3,MONDO:0000001,disease or disorder,skos:exactMatch,Orphanet:377788,Unspecified,Disease
4,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C2991,Unspecified,Disease or Disorder


In [9]:
condition_1 = msdf_mondo.df['subject_id'].str.contains("MONDO")
condition_2 = msdf_mondo.df['object_id'].str.contains("ICD10CM")
condition_3 = msdf_mondo.df['object_id'].str.contains('|'.join((["OMIM","OMIMPS"])))
condition_4 = msdf_mondo.df['object_id'].str.contains("Orphanet")
condition_5 = msdf_mondo.df['object_id'].str.contains("DOID")

mondo_icd_df = msdf_mondo.df[condition_1 & condition_2]
mondo_omim_df = msdf_mondo.df[condition_1 & condition_3]
mondo_ordo_df = msdf_mondo.df[condition_1 & condition_4]
mondo_doid_df = msdf_mondo.df[condition_1 & condition_5]

mondo_icd_df.head()
mondo_omim_df.head()
mondo_ordo_df.head()
# mondo_doid_df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label
3,MONDO:0000001,disease or disorder,skos:exactMatch,Orphanet:377788,Unspecified,Disease
20,MONDO:0000023,infantile liver failure,skos:exactMatch,Orphanet:464724,Unspecified,Fever-associated acute infantile liver failure...
26,MONDO:0000044,hereditary hypophosphatemic rickets,skos:exactMatch,Orphanet:437,Unspecified,Hypophosphatemic rickets
33,MONDO:0000050,isolated congenital growth hormone deficiency,skos:exactMatch,Orphanet:631,Unspecified,Non-acquired isolated growth hormone deficiency
49,MONDO:0000087,polymicrogyria,skos:exactMatch,Orphanet:35981,Unspecified,Polymicrogyria


In [10]:
print_prefixes(msdf_lex.df)

subject_prefixes:
 0             DOID
151307     ICD10CM
166651    ICD10WHO
169860       MONDO
425468        NCIT
458795        OMIM
474649    Orphanet
Name: subject_id, dtype: object 
           object_prefixes:
 0         MONDO
7          NCIT
8      Orphanet
11     ICD10WHO
20         DOID
37      ICD10CM
199        OMIM
Name: object_id, dtype: object 
           predicate_ids:
 0       skos:broadMatch
1       skos:exactMatch
18      skos:closeMatch
176    skos:narrowMatch
Name: predicate_id, dtype: object 


### Flipping subject_id and object_id if MONDO not subject_id prefix

**Predicate impact**

- skos:closeMatch <=> skos:relatedMatch
- skos:narrowMatch < => skos:broadMatch

In [11]:
condition_1 = msdf_lex.df['subject_id'].str.contains("MONDO")
condition_2 = msdf_lex.df['object_id'].str.contains("ICD10CM")
condition_3 = msdf_lex.df['object_id'].str.contains('|'.join((["OMIM","OMIMPS"])))
condition_4 = msdf_lex.df['object_id'].str.contains('|'.join((["ORDO","Orphanet"])))
condition_5 = msdf_lex.df['object_id'].str.contains("DOID")
condition_mondo_obj = msdf_lex.df['object_id'].str.contains("MONDO")
non_mondo_subjects_df =  pd.DataFrame(msdf_lex.df[(~condition_1 & condition_mondo_obj)])
mondo_subjects_df = pd.DataFrame(msdf_lex.df[(condition_1 & ~condition_mondo_obj)])
print(len(mondo_subjects_df))
non_mondo_subjects_df.head()


44371


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
0,DOID:0001816,angiosarcoma,skos:broadMatch,MONDO:0003022,pediatric angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasBroadSynonym,angiosarcoma
1,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,mesh:d006394
2,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma
3,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:RegularExpressionReplacement,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma
4,DOID:0001816,angiosarcoma,skos:exactMatch,MONDO:0016982,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,hemangiosarcoma


In [12]:
desired_sequence = ['subject_id', 'subject_label', 'predicate_id', 'object_id',
       'object_label', 'mapping_justification', 'mapping_tool', 'confidence',
       'subject_match_field', 'object_match_field', 'match_string']

new_subjects_df = non_mondo_subjects_df.rename(columns={
    'subject_id':'object_id', 
    'subject_label':'object_label',
    'object_id':'subject_id', 
    'object_label':'subject_label'
    
})


new_subjects_df = new_subjects_df[desired_sequence]
new_subjects_df["predicate_id"] = new_subjects_df["predicate_id"].apply(lambda x: flip_predicate(x))
print(len(new_subjects_df))
new_subjects_df.head()


103462


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
0,MONDO:0003022,pediatric angiosarcoma,skos:narrowMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasBroadSynonym,angiosarcoma
1,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,mesh:d006394
2,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma
3,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:RegularExpressionReplacement,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma
4,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,hemangiosarcoma


### Combine dfs where subject_id prefix is MONDO

In [13]:
lex_df = pd.concat([mondo_subjects_df,new_subjects_df], ignore_index=True)
print(len(lex_df))

lex_df.head()

147833


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
0,MONDO:0000001,disease or disorder,skos:exactMatch,Orphanet:557493,disorder,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,disorder
1,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C25457,Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,condition
2,MONDO:0000001,disease or disorder,skos:exactMatch,NCIT:C156809,Medical Condition,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,medical condition
3,MONDO:0000004,adrenocortical insufficiency,skos:exactMatch,NCIT:C113211,Hypocortisolemia,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,rdfs:label,hypocortisolemia
4,MONDO:0000022,nocturnal enuresis,skos:exactMatch,OMIM:600631,"enuresis, nocturnal, 1",semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasExactSynonym,bedwetting


In [14]:
# msdf_lex.df[msdf_lex.df['object_id'].str.contains('ICD')]
condition_1 = lex_df['subject_id'].str.contains("MONDO")
condition_2 = lex_df['object_id'].str.contains("ICD10CM")
condition_3 = lex_df['object_id'].str.contains('|'.join((["OMIM","OMIMPS"])))
condition_4 = lex_df['object_id'].str.contains("Orphanet")
condition_5 = lex_df['object_id'].str.contains("DOID")


mondo_icd_lex_df = lex_df[(condition_1 & condition_2)]
mondo_omim_lex_df = lex_df[(condition_1 & condition_3)]
mondo_ordo_lex_df = lex_df[(condition_1 & condition_4)]
mondo_doid_lex_df = lex_df[(condition_1 & condition_5)]

mondo_icd_lex_df.head()
mondo_omim_lex_df.head() # NO ROWS
mondo_ordo_lex_df.head() # NO ROWS
mondo_doid_lex_df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string
968,MONDO:0004990,breast tumor luminal A or B,skos:narrowMatch,DOID:0060548,luminal breast carcinoma A,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,luminal breast cancer
969,MONDO:0004990,breast tumor luminal A or B,skos:narrowMatch,DOID:0060548,luminal breast carcinoma A,semapv:LexicalMatching,oaklib,0.8,oio:hasExactSynonym,oio:hasBroadSynonym,breast tumor luminal
44371,MONDO:0003022,pediatric angiosarcoma,skos:narrowMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasBroadSynonym,angiosarcoma
44372,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,mesh:d006394
44373,MONDO:0016982,angiosarcoma,skos:exactMatch,DOID:0001816,angiosarcoma,semapv:LexicalMatching,oaklib,0.8,rdfs:label,oio:hasExactSynonym,angiosarcoma


In [15]:
%%time
# comparison_ms_diff = compare_dataframes(msdf_mondo.df, msdf_lex.df)
# comparison_df = comparison_ms_diff.combined_dataframe
# comparison_df['comment'] = comparison_df['comment'].str.replace('UNIQUE_1', "MONDO_MAPPINGS")
# comparison_df['comment'] = comparison_df['comment'].str.replace('UNIQUE_2', "LEXMATCH")
# comparison_df.head()

# Comparisons
icd_comparison_df = compare_and_comment_df(mondo_icd_df, mondo_icd_lex_df)
omim_comparison_df = compare_and_comment_df(mondo_omim_df, mondo_omim_lex_df)
ordo_comparison_df = compare_and_comment_df(mondo_ordo_df, mondo_ordo_lex_df)
doid_comparison_df = compare_and_comment_df(mondo_doid_df, mondo_doid_lex_df)


CPU times: user 1min 20s, sys: 179 ms, total: 1min 20s
Wall time: 1min 20s


In [16]:
icd_comparison_df
omim_comparison_df
ordo_comparison_df
doid_comparison_df

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
20210,MONDO:0006872,obsolete nut allergic reaction,skos:exactMatch,DOID:4379,Unspecified,nut allergy,MONDO_MAPPINGS,,,,,
12972,MONDO:0004626,obsolete Hodgkin's paragranuloma,skos:exactMatch,DOID:8642,Unspecified,Hodgkin's paragranuloma,MONDO_MAPPINGS,,,,,
4712,MONDO:0001872,obsolete vestibular nystagmus,skos:exactMatch,DOID:14070,Unspecified,vestibular nystagmus,MONDO_MAPPINGS,,,,,
1342,MONDO:0000857,obsolete Charcot-Marie-Tooth disease type 7,skos:exactMatch,DOID:0080069,Unspecified,Charcot-Marie-Tooth disease type 7,MONDO_MAPPINGS,,,,,
49222,MONDO:0014709,obsolete Heimler syndrome 2,skos:exactMatch,DOID:0080624,Unspecified,Heimler syndrome 2,MONDO_MAPPINGS,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
106025,MONDO:0011698,glycine N-methyltransferase deficiency,skos:exactMatch,DOID:0111037,semapv:LexicalMatching,glycine N-methyltransferase deficiency,COMMON_TO_BOTH,oaklib,0.800000,oio:hasExactSynonym,oio:hasExactSynonym,hypermethioninemia due to glycine n-methyltran...
106026,MONDO:0011698,glycine N-methyltransferase deficiency,skos:exactMatch,DOID:0111037,semapv:LexicalMatching,glycine N-methyltransferase deficiency,COMMON_TO_BOTH,oaklib,0.800000,oio:hasExactSynonym,oio:hasExactSynonym,hypermethioninemia due to gnmt deficiency
106027,MONDO:0011698,glycine N-methyltransferase deficiency,skos:exactMatch,DOID:0111037,semapv:LexicalMatching,glycine N-methyltransferase deficiency,COMMON_TO_BOTH,oaklib,0.800000,rdfs:label,oio:hasExactSynonym,glycine n-methyltransferase deficiency
106028,MONDO:0011698,glycine N-methyltransferase deficiency,skos:exactMatch,DOID:0111037,semapv:LexicalMatching,glycine N-methyltransferase deficiency,COMMON_TO_BOTH,oaklib,0.849779,rdfs:label,rdfs:label,glycine n-methyltransferase deficiency


In [17]:
icd_comparison_df['comment'].drop_duplicates()

53478     MONDO_MAPPINGS
142345          LEXMATCH
61789     COMMON_TO_BOTH
Name: comment, dtype: object

### Split into unmapped dataframes

In [18]:
unmapped_icd_df = get_unmapped_df(icd_comparison_df)
unmapped_omim_df = get_unmapped_df(omim_comparison_df)
unmapped_ordo_df = get_unmapped_df(ordo_comparison_df)
unmapped_doid_df = get_unmapped_df(doid_comparison_df)

unmapped_icd_df.head()


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
53478,MONDO:0016289,malignant germ cell tumor of cervix uteri,skos:broadMatch,ICD10CM:C53.1,Unspecified,Malignant neoplasm of exocervix,MONDO_MAPPINGS,,,,,
16153,MONDO:0005580,esophageal squamous cell carcinoma,skos:broadMatch,ICD10CM:C15.4,Unspecified,Malignant neoplasm of middle third of esophagus,MONDO_MAPPINGS,,,,,
51392,MONDO:0015504,larynx anomaly,skos:broadMatch,ICD10CM:Q31.8,Unspecified,Other congenital malformations of larynx,MONDO_MAPPINGS,,,,,
2348,MONDO:0001176,lens disorder,skos:exactMatch,ICD10CM:H25-H28,Unspecified,Disorders of lens (H25-H28),MONDO_MAPPINGS,,,,,
52378,MONDO:0015894,obsolete rare hyperthyroidism,skos:broadMatch,ICD10CM:E05.2,Unspecified,Thyrotoxicosis with toxic multinodular goiter,MONDO_MAPPINGS,,,,,


In [19]:
print_prefixes(unmapped_doid_df)

subject_prefixes:
 20210    MONDO
Name: subject_id, dtype: object 
           object_prefixes:
 20210    DOID
Name: object_id, dtype: object 
           predicate_ids:
 20210       skos:exactMatch
46166     skos:relatedMatch
122422      skos:broadMatch
128542     skos:narrowMatch
Name: predicate_id, dtype: object 


In [20]:
# Let reviewers check if this makes sense or no.
export_unmatched_exact(unmapped_icd_df, "LEXMATCH", "unmapped_icd_lex.tsv")
export_unmatched_exact(unmapped_omim_df, "LEXMATCH", "unmapped_omim_lex.tsv")
export_unmatched_exact(unmapped_ordo_df, "LEXMATCH", "unmapped_ordo_lex.tsv")
export_unmatched_exact(unmapped_doid_df, "LEXMATCH", "unmapped_doid_lex.tsv")


Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
73989,MONDO:0016006,Cockayne syndrome,skos:exactMatch,DOID:0080911,semapv:LexicalMatching,cerebrooculofacioskeletal syndrome 1,LEXMATCH,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,omim:214150
111940,MONDO:0032618,"mitochondrial complex 1 deficiency, nuclear ty...",skos:exactMatch,DOID:0112076,semapv:LexicalMatching,nuclear type mitochondrial complex I deficienc...,LEXMATCH,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,omim:618235
112898,MONDO:0009605,methemoglobinemia type 4,skos:exactMatch,DOID:0112316,semapv:LexicalMatching,methemoglobinemia and ambiguous genitalia,LEXMATCH,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,omim:250790
73600,MONDO:0019354,Stickler syndrome,skos:exactMatch,DOID:0080676,semapv:LexicalMatching,Stickler syndrome 1,LEXMATCH,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,omim:108300
70109,MONDO:0014131,hypohidrosis-enamel hypoplasia-palmoplantar ke...,skos:exactMatch,DOID:0070141,semapv:LexicalMatching,autosomal recessive cutis laxa type II classic...,LEXMATCH,oaklib,0.941176,oio:hasDbXref,oio:hasDbXref,icd10cm:q82.8


In [21]:
# Inspect why these are missing from SSSOM mappings
export_unmatched_exact(unmapped_icd_df, "MONDO_MAPPINGS", "unmapped_icd_mondo.tsv")
export_unmatched_exact(unmapped_omim_df, "MONDO_MAPPINGS", "unmapped_omim_mondo.tsv")
export_unmatched_exact(unmapped_ordo_df, "MONDO_MAPPINGS", "unmapped_ordo_mondo.tsv")
export_unmatched_exact(unmapped_doid_df, "MONDO_MAPPINGS", "unmapped_doid_mondo.tsv")

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
4712,MONDO:0001872,obsolete vestibular nystagmus,skos:exactMatch,DOID:14070,Unspecified,vestibular nystagmus,MONDO_MAPPINGS,,,,,
1342,MONDO:0000857,obsolete Charcot-Marie-Tooth disease type 7,skos:exactMatch,DOID:0080069,Unspecified,Charcot-Marie-Tooth disease type 7,MONDO_MAPPINGS,,,,,
862,MONDO:0000568,autoimmune disorder of central nervous system,skos:exactMatch,DOID:0060004,Unspecified,autoimmune disease of central nervous system,MONDO_MAPPINGS,,,,,
1248,MONDO:0000793,obsolete rainbow trout allergy,skos:exactMatch,DOID:0060518,Unspecified,rainbow trout allergy,MONDO_MAPPINGS,,,,,
8791,MONDO:0003192,rete ovarii neoplasm,skos:exactMatch,DOID:4895,Unspecified,rete ovarii benign neoplasm,MONDO_MAPPINGS,,,,,


In [22]:
# # Add distances
# # Add Levenshtein distance [ 0: Perfect match]
# add_distance(unmapped_mondo_exact, "levenshtein_dist", textdistance.levenshtein.distance)
# # Add Jaccard Index [ 0: Perfect match]
# add_distance(unmapped_mondo_exact, "jaccard_index", textdistance.jaccard.distance)
# # Add Monge-Elkan Distance [ 0: Perfect match]
# add_distance(unmapped_mondo_exact, "monge_elkan", textdistance.monge_elkan.distance)
# unmapped_mondo_exact.to_csv(join(dir_name, "unmapped_mondo_exact.tsv"), sep='\t', index = False)
# unmapped_mondo_exact.head()


In [23]:
combined_msdf_icd = make_msdf(icd_comparison_df, msdf_lex.prefix_map, msdf_lex.metadata)
combined_msdf_omim = make_msdf(omim_comparison_df, msdf_lex.prefix_map, msdf_lex.metadata)
combined_msdf_ordo = make_msdf(ordo_comparison_df, msdf_lex.prefix_map, msdf_lex.metadata)
combined_msdf_doid = make_msdf(doid_comparison_df, msdf_lex.prefix_map, msdf_lex.metadata)

combined_msdf_icd.df.head()

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,mapping_justification,object_label,comment,mapping_tool,confidence,subject_match_field,object_match_field,match_string
53478,MONDO:0016289,malignant germ cell tumor of cervix uteri,skos:broadMatch,ICD10CM:C53.1,Unspecified,Malignant neoplasm of exocervix,MONDO_MAPPINGS,,,,,
16153,MONDO:0005580,esophageal squamous cell carcinoma,skos:broadMatch,ICD10CM:C15.4,Unspecified,Malignant neoplasm of middle third of esophagus,MONDO_MAPPINGS,,,,,
51392,MONDO:0015504,larynx anomaly,skos:broadMatch,ICD10CM:Q31.8,Unspecified,Other congenital malformations of larynx,MONDO_MAPPINGS,,,,,
2348,MONDO:0001176,lens disorder,skos:exactMatch,ICD10CM:H25-H28,Unspecified,Disorders of lens (H25-H28),MONDO_MAPPINGS,,,,,
52378,MONDO:0015894,obsolete rare hyperthyroidism,skos:broadMatch,ICD10CM:E05.2,Unspecified,Thyrotoxicosis with toxic multinodular goiter,MONDO_MAPPINGS,,,,,


In [24]:
# %%time
# df_dict = split_dataframe(combined_msdf)

In [25]:
# mondo_icd_list = [x for x in list(df_dict.keys()) if 'mondo' in x and "icd10" in x]
# mondo_icd_list

In [26]:
# for match in mondo_icd_list:
#     fn = match + ".tsv"
#     df_dict[match].df.to_csv(join(dir_name,fn), sep='\t', index = False)

In [27]:
# df_dict['mondo_exactmatch_icd10cm'].df