# Extract ICD10 terms to map

The goal of this notebook is to create a file of ICD10CM terms that contains _only_ the codes from the DS-Determined data where an existing ICD10 mapping did not exist on a Mondo term. The result of the initial mapping of the ICD codes to Mondo is at: https://www.synapse.org/Synapse:syn63923531


In [1]:
# Imports

import pandas as pd

pd.set_option('display.max_colwidth', None)

In [2]:
# Read file of all ICD10 terms extracted using a SPARQL query as described in the README in this folder.

icd10_df = pd.read_csv('../data/icd10_labels.tsv', sep='\t')
display(icd10_df.head())

Unnamed: 0,?curie,?label,?definition,?synonym
0,ICD10CM:A00,Cholera,,Cholera
1,ICD10CM:A00-A09,Intestinal infectious diseases (A00-A09),,Intestinal infectious diseases (A00-A09)
2,ICD10CM:A00-A09,Intestinal infectious diseases (A00-A09),,intestinal infectious diseases
3,ICD10CM:A00-B99,Certain infectious and parasitic diseases (A00-B99),,Certain infectious and parasitic diseases (A00-B99)
4,ICD10CM:A00-B99,Certain infectious and parasitic diseases (A00-B99),,certain infectious and parasitic diseases


In [3]:
# Read file of DS-Determined ICD10 codes

dsd_icd10_df = pd.read_csv('../data/ds-determined-icd10_icd-mondo_mappings.tsv', sep='\t')

display(dsd_icd10_df.head())
display(dsd_icd10_df.nunique())

Unnamed: 0,icd_code,icd_version,mondo_curie,mondo_label,mondo_xref,mondo_is_obsolete,mondo_has_equivalentTo
0,ICD10CM:G93.2,10,,,,,
1,ICD10CM:I31.39,10,,,,,
2,ICD10CM:M08.90,10,,,,,
3,ICD10CM:F63.81,10,MONDO:0001521,intermittent explosive disorder,ICD10CM:F63.81,False,True
4,ICD10CM:F02.80,10,,,,,


icd_code                  1701
icd_version                  1
mondo_curie                114
mondo_label                114
mondo_xref                  64
mondo_is_obsolete            2
mondo_has_equivalentTo       2
dtype: int64

In [4]:
# Filter DS-Determined ICD-Mondo mapping dataframe to only rows with no value for `mondo_curie`

filtered_dsd_icd10_df = dsd_icd10_df[dsd_icd10_df['mondo_curie'].isna()]

display(filtered_dsd_icd10_df.head())
display(filtered_dsd_icd10_df.nunique())

Unnamed: 0,icd_code,icd_version,mondo_curie,mondo_label,mondo_xref,mondo_is_obsolete,mondo_has_equivalentTo
0,ICD10CM:G93.2,10,,,,,
1,ICD10CM:I31.39,10,,,,,
2,ICD10CM:M08.90,10,,,,,
4,ICD10CM:F02.80,10,,,,,
6,ICD10CM:Q21.0,10,,,,,


icd_code                  1637
icd_version                  1
mondo_curie                  0
mondo_label                  0
mondo_xref                   0
mondo_is_obsolete            0
mondo_has_equivalentTo       0
dtype: int64

In [5]:
# Join `filtered_dsd_icd10_df` to `icd10_df` to get the term labels and other annotations for these codes from DS-Determined
# that do not have an existing Mondo database cross-reference to ICD10

combined_df = filtered_dsd_icd10_df.merge(icd10_df, left_on='icd_code', right_on='?curie', how='left')

display(combined_df.head())
display(combined_df.nunique())

Unnamed: 0,icd_code,icd_version,mondo_curie,mondo_label,mondo_xref,mondo_is_obsolete,mondo_has_equivalentTo,?curie,?label,?definition,?synonym
0,ICD10CM:G93.2,10,,,,,,ICD10CM:G93.2,Benign intracranial hypertension,,Benign intracranial hypertension
1,ICD10CM:G93.2,10,,,,,,ICD10CM:G93.2,Benign intracranial hypertension,,Pseudotumor
2,ICD10CM:I31.39,10,,,,,,,,,
3,ICD10CM:M08.90,10,,,,,,ICD10CM:M08.90,"Juvenile arthritis, unspecified, unspecified site",,"Juvenile arthritis, unspecified, unspecified site"
4,ICD10CM:F02.80,10,,,,,,ICD10CM:F02.80,Dementia in other diseases classified elsewhere without behavioral disturbance,,Dementia in other diseases classified elsewhere NOS


icd_code                  1637
icd_version                  1
mondo_curie                  0
mondo_label                  0
mondo_xref                   0
mondo_is_obsolete            0
mondo_has_equivalentTo       0
?curie                    1583
?label                    1579
?definition                  0
?synonym                  2819
dtype: int64

In [6]:
# Let's only keep columns of interest

cols_to_keep = [
    "icd_code",
    "?label",
    "?definition",
    "?synonym"
]

filtered_combined_df = combined_df[cols_to_keep]

display(filtered_combined_df.head())
display(filtered_combined_df.nunique())

Unnamed: 0,icd_code,?label,?definition,?synonym
0,ICD10CM:G93.2,Benign intracranial hypertension,,Benign intracranial hypertension
1,ICD10CM:G93.2,Benign intracranial hypertension,,Pseudotumor
2,ICD10CM:I31.39,,,
3,ICD10CM:M08.90,"Juvenile arthritis, unspecified, unspecified site",,"Juvenile arthritis, unspecified, unspecified site"
4,ICD10CM:F02.80,Dementia in other diseases classified elsewhere without behavioral disturbance,,Dementia in other diseases classified elsewhere NOS


icd_code       1637
?label         1579
?definition       0
?synonym       2819
dtype: int64

In [7]:
# Group by synonyms to have rows for only unique codes and labels

# Group and join unique synonyms as a string
grouped_filtered_combined_df = filtered_combined_df.groupby('icd_code')['?synonym'] \
    .apply(lambda x: ', '.join(sorted(set(x.dropna().astype(str))))) \
    .reset_index(name='grouped_synonym')

# Drop duplicates based on icd_code and keep the first row
deduped_df = filtered_combined_df.drop_duplicates(subset='icd_code', keep='first')

# Merge grouped synonyms into the deduplicated DataFrame
final_df = deduped_df.merge(grouped_filtered_combined_df, on='icd_code', how='left')

# Rename columns
final_df = final_df.rename(columns={
    '?label': 'label',
    '?definition': 'definition',
    '?synonym': 'synonym'
})

display(final_df.head())
display(final_df.nunique())

Unnamed: 0,icd_code,label,definition,synonym,grouped_synonym
0,ICD10CM:G93.2,Benign intracranial hypertension,,Benign intracranial hypertension,"Benign intracranial hypertension, Pseudotumor"
1,ICD10CM:I31.39,,,,
2,ICD10CM:M08.90,"Juvenile arthritis, unspecified, unspecified site",,"Juvenile arthritis, unspecified, unspecified site","Juvenile arthritis, unspecified, unspecified site"
3,ICD10CM:F02.80,Dementia in other diseases classified elsewhere without behavioral disturbance,,Dementia in other diseases classified elsewhere NOS,"Dementia in other diseases classified elsewhere NOS, Dementia in other diseases classified elsewhere without behavioral disturbance, Major neurocognitive disorder in other diseases classified elsewhere"
4,ICD10CM:Q21.0,Ventricular septal defect,,Roger's disease,"Roger's disease, Ventricular septal defect"


icd_code           1637
label              1579
definition            0
synonym            1580
grouped_synonym    1584
dtype: int64

In [8]:
# Save to file
final_df.to_csv('icd10_code_to_map.tsv', sep='\t', index=False)