This notebook contains the code to map SNOMED codes to ICD chapters for coloring the visualizations presented in the paper.

In [None]:
# imports
import string
import pandas as pd
from collections import Counter
from tqdm import tqdm

# ICD chapters: code range -> chapter title, kept in the order listed here
ICD_CHAPTERS = dict((
    ("A00-B99", "Certain infectious and parasitic diseases"),
    ("C00-D49", "Neoplasms"),
    ("D50-D89", "Diseases of the blood and blood-forming organs"),
    ("E00-E89", "Endocrine, nutritional and metabolic diseases"),
    ("F01-F99", "Mental, behavioral and neurodevelopmental disorders"),
    ("G00-G99", "Diseases of the nervous system"),
    ("H00-H59", "Diseases of the eye and adnexa"),
    ("H60-H95", "Diseases of the ear and mastoid process"),
    ("I00-I99", "Diseases of the circulatory system"),
    ("J00-J99", "Diseases of the respiratory system"),
    ("K00-K95", "Diseases of the digestive system"),
    ("L00-L99", "Diseases of the skin and subcutaneous tissue"),
    ("M00-M99", "Diseases of the musculoskeletal system and connective tissue"),
    ("N00-N99", "Diseases of the genitourinary system"),
    ("O00-O9A", "Pregnancy, childbirth and the puerperium"),
    ("P00-P96", "Certain conditions originating in the perinatal period"),
    ("Q00-Q99", "Congenital malformations, deformations and chromosomal abnormalities"),
    ("R00-R99", "Symptoms, signs and abnormal clinical and laboratory findings"),
    ("S00-T88", "Injury, poisoning and certain other consequences of external causes"),
    ("V00-Y99", "External causes of morbidity"),
    ("Z00-Z99", "Factors influencing health status and contact with health services"),
    ("U00-U85", "Codes for special purposes"),
))

# load ICD-9 to ICD-10 conversion dictionary
# available from https://www.nber.org/research/data/icd-9-cm-and-icd-10-cm-and-icd-10-pcs-crosswalk-or-general-equivalence-mappings
# read both code columns as text: left to dtype inference, pandas may parse all-digit
# codes as integers (dropping any leading zeros) or produce a mixed int/str column,
# and the dictionary is later looked up with string keys
icd9to10 = pd.read_csv("<conversion table CSV>", dtype={"icd9cm": str, "icd10cm": str})
# maps each ICD-9 code to an ICD-10 code; if an ICD-9 code occurs on several rows the last row wins
icd9to10 = icd9to10[["icd9cm", "icd10cm"]].set_index("icd9cm").to_dict()["icd10cm"]

# first letter of an ICD-10 code -> chapter, for letters that belong to one chapter only
# ("D" and "H" are each split across two chapters and are resolved by the following digit)
_ICD10_LETTER_CHAPTERS = {
    "A": "A00-B99", "B": "A00-B99",
    "C": "C00-D49",
    "E": "E00-E89",
    "F": "F01-F99",
    "G": "G00-G99",
    "I": "I00-I99",
    "J": "J00-J99",
    "K": "K00-K95",
    "L": "L00-L99",
    "M": "M00-M99",
    "N": "N00-N99",
    "O": "O00-O9A",
    "P": "P00-P96",
    "Q": "Q00-Q99",
    "R": "R00-R99",
    "S": "S00-T88", "T": "S00-T88",
    "U": "U00-U85",
    "V": "V00-Y99", "W": "V00-Y99", "X": "V00-Y99", "Y": "V00-Y99",
    "Z": "Z00-Z99",
}

# translation table that deletes every ASCII punctuation character (e.g. the "." in "250.00")
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _icd10_chapter_from_prefix(code):
    """Return the chapter key for an upper-cased ICD-10-style code, or None.

    Only strings that start with an ASCII letter followed by an ASCII digit are
    treated as ICD-10 codes; anything else (e.g. "NoDx", "D", "") yields None.
    """
    if len(code) < 2 or code[0] not in string.ascii_uppercase or code[1] not in string.digits:
        return None

    letter, digit = code[0], int(code[1])
    if letter == "D":
        # D00-D49 belong to the neoplasm chapter, D50 onwards to the blood chapter
        return "C00-D49" if digit < 5 else "D50-D89"
    if letter == "H":
        # H00-H59 are eye codes, H60 onwards are ear codes
        return "H00-H59" if digit < 6 else "H60-H95"
    return _ICD10_LETTER_CHAPTERS.get(letter)


# given code string, return ICD chapter (if it maps)
def get_icd10_chapter(code, icd9_map=None):
    """Map a diagnosis code to its ICD-10 chapter key.

    The code is first interpreted as ICD-10 (letter followed by a digit, case and
    surrounding whitespace ignored). If that fails, punctuation is removed and the
    result is looked up in the ICD-9 -> ICD-10 dictionary; the converted code is
    then interpreted as ICD-10. Conversion is attempted once only, so a dictionary
    value that is not itself an ICD-10-style code (for example a "no mapping"
    placeholder) results in None instead of being classified by its first letter.

    NOTE: any code of the form letter + digit is taken to be ICD-10, so an ICD-9
    code written in that form is classified by its letter rather than converted.

    Args:
        code: code string to classify; non-string or empty input returns None.
        icd9_map: optional dict of punctuation-free ICD-9 code -> ICD-10 code.
            Defaults to the module-level ``icd9to10`` dictionary.

    Returns:
        A chapter key such as "E00-E89", or None if the code cannot be mapped.
    """
    if not isinstance(code, str):
        return None
    cleaned = code.strip()
    if not cleaned:
        return None

    # try to read the code directly as ICD-10
    chapter = _icd10_chapter_from_prefix(cleaned.upper())
    if chapter is not None:
        return chapter

    # see if it's an ICD-9 code, convert if so
    lookup = icd9to10 if icd9_map is None else icd9_map
    converted = lookup.get(cleaned.translate(_PUNCTUATION_TABLE))
    if not isinstance(converted, str):
        # missing key, or a missing value (e.g. NaN) in the conversion table
        return None
    return _icd10_chapter_from_prefix(converted.strip().upper())

# read the aggregate results and the SNOMED to ICD-10 map from the National Library of Medicine
# map available from https://www.nlm.nih.gov/research/umls/mapping_projects/snomedct_to_icd10cm.html
results = pd.read_csv("<aggregate results filepath>")
mapping = pd.read_csv("<path to downloaded TSV>", sep="\t")

# concept name -> set of all map targets listed for it; rows without a target are ignored
snomed_to_source = (
    mapping.dropna(subset=["mapTarget"])
    .groupby("referencedComponentName")["mapTarget"]
    .apply(set)
    .to_dict()
)

# look-up from SNOMED concept ID to concept name, taken from the results table
snomed_to_name = results.set_index("snomed")["name"].to_dict()

# convert to chapters
conversion_table = []
failed_concepts = []
for s in tqdm(results["snomed"]):

    # check if we have source values for this concept
    try:
        source_values = snomed_to_source[snomed_to_name[s]]
    except KeyError:
        failed_concepts.append(s)
        continue

    # convert each source value and count how many of them land in each chapter;
    # the counts must be taken before de-duplicating, otherwise every chapter has
    # a count of one and there is no majority to speak of
    chapter_votes = Counter(
        chapter
        for chapter in (get_icd10_chapter(c) for c in source_values)
        if chapter is not None
    )

    # if there's none, conversion failed
    if not chapter_votes:
        failed_concepts.append(s)
        continue

    # record the chapter with the most votes (with a single chapter this is that chapter);
    # ties go to the alphabetically first chapter key so the result is the same on every run
    # instead of depending on set iteration order
    winner, _ = min(chapter_votes.items(), key=lambda vote: (-vote[1], vote[0]))
    conversion_table.append({
        "snomed": s,
        "chapter": winner,
        "chapter_name": ICD_CHAPTERS[winner]
    })

# make and save DataFrame
# columns are named explicitly so the CSV still gets a header row when nothing was converted
conversion_table = pd.DataFrame.from_records(
    conversion_table, columns=["snomed", "chapter", "chapter_name"]
)
conversion_table.to_csv("<SNOMED to ICD-10 chapter filepath>", index=False)