# 0. Data Preparation

1. Transform all the text information into utf8 format
2. Make dictionaries to map concept, ordo_id, and corresponding synonyms together
3. Prepare the relationship of the snomed terms

In [2]:
import pandas as pd
import numpy as np
import re
import json
from queue import LifoQueue
from fuzzywuzzy import fuzz
import os
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize
import nltk
nltk.download('stopwords')
import requests
import getpass
from bs4 import BeautifulSoup
import sys

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/liangyuzhao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 0-I Get the ORDO information

In [64]:
# ## Read vocab csv file into dataframe
ORDO = pd.read_csv("Vocabularies/ORDO.csv", sep = ',')

SNOMED = pd.read_csv("Vocabularies/SnomedCT_USEditionRF2_PRODUCTION_20190901T120000Z/Full/Terminology/sct2_Description_Full-en_US1000124_20190901.txt",
                    sep = '\t')
## Rename ORDO headers by stripping URLs

ORDOcols = list(ORDO.columns)

for i, colname in enumerate(ORDOcols):
    # Check for URL
    if "http" in colname:
        # Reverse split (get only the substring behind the last / in the URL)
        newcolname = colname.rsplit('/', 1)[1]
        # Rename the column in ORDO dataframe
        ORDO = ORDO.rename(columns = {colname: newcolname})
## Extract ORDO disease terms (might expand to other classes later)
"""
'Parents' column has URL for 2nd level classes
Clinical entity subclasses:
    Biological anomaly: http://www.orpha.net/ORDO/Orphanet_377790
    Clinical subtype: http://www.orpha.net/ORDO/Orphanet_377796
    Clinical syndrome: http://www.orpha.net/ORDO/Orphanet_377792
    Disease: http://www.orpha.net/ORDO/Orphanet_377788
    Etiological subtype: http://www.orpha.net/ORDO/Orphanet_377795
    Group of disorders: http://www.orpha.net/ORDO/Orphanet_377794
    Histopathological subtype: http://www.orpha.net/ORDO/Orphanet_377797
    Malformation syndrome: http://www.orpha.net/ORDO/Orphanet_377789
    Morphological anomaly: http://www.orpha.net/ORDO/Orphanet_377791
    Particular clinical situation in disease/syndrome: http://www.orpha.net/ORDO/Orphanet_377793   
"""
ORDOdiseases = ORDO[ORDO["Parents"] == "http://www.orpha.net/ORDO/Orphanet_377788"]
print("The total number of ORDO disease terms: " + str(len(ORDOdiseases["Class ID"].unique())))
## Total number of SNOMED disease terms
SNOMEDdiseases = SNOMED[SNOMED['term'].str.contains("\(disorder\)") == True]

unique_SNOMED = SNOMEDdiseases.drop_duplicates(subset = 'term')

print("The total number of SNOMED disease terms: " + str(len(unique_SNOMED)))

The total number of ORDO disease terms: 3859
The total number of SNOMED disease terms: 115396


In [65]:
## Write all ORDO disease terms and SNOMED disease terms to files
basepath = "./Vocabularies/"
ORDOdiseases.to_csv(basepath + "ORDO_diseases.csv")
unique_SNOMED.to_csv(basepath + "SNOMED_diseases.csv")  

# I. Find Exact Match & UMLS Matches

### A. Find the Exact Match between ORDO and SNOMED

In [66]:
## Number of ORDO disease terms with/without SNOMED similarity
SNOMEDterms = list(unique_SNOMED['term'])
SNOMEDlower = [(term.replace(" (disorder)", "")).lower() for term in SNOMEDterms]

columns = ['ORDO_ID', 'ORDO_term', 'ORDO_syn', 'ORDO_definition', 'ORDO_mappings', 
          'SNOMED_ID', 'SNOMED_term']
st_dict = dict((key,[]) for key in columns)
dt_dict = dict((key,[]) for key in columns)

for i, term in enumerate(ORDOdiseases["Preferred Label"]):
    if term.lower() in SNOMEDlower:
        # Get SNOMED ID for term and add to similar terms dictionary
        snomedIdx = SNOMEDlower.index(term.lower())
        snomedID = unique_SNOMED.iloc[snomedIdx]['conceptId']
        snomedTerm = unique_SNOMED.iloc[snomedIdx]['term']
        st_dict['SNOMED_ID'] += [snomedID]
        st_dict['SNOMED_term'] += [snomedTerm]
        
        # Add to ORDO info to similar terms dictionary
        ORDO_ID, ORDO_term = ORDOdiseases.iloc[i]['Class ID'], ORDOdiseases.iloc[i]['Preferred Label']
        ORDO_syn, ORDO_def, ORDO_mappings = ORDOdiseases.iloc[i]['Synonyms'], ORDOdiseases.iloc[i]['Definitions'], ORDOdiseases.iloc[i]['oboInOwl#hasDbXref']
        st_dict['ORDO_ID'] += [ORDO_ID]
        st_dict['ORDO_term'] += [ORDO_term] 
        st_dict['ORDO_syn'] += [ORDO_syn]
        st_dict['ORDO_definition'] += [ORDO_def]
        st_dict['ORDO_mappings'] += [ORDO_mappings]
    
    else:
        # Add to ORDO info to dissimilar terms dictionary
        ORDO_ID, ORDO_term = ORDOdiseases.iloc[i]['Class ID'], ORDOdiseases.iloc[i]['Preferred Label']
        ORDO_syn, ORDO_def, ORDO_mappings = ORDOdiseases.iloc[i]['Synonyms'], ORDOdiseases.iloc[i]['Definitions'], ORDOdiseases.iloc[i]['oboInOwl#hasDbXref']
        dt_dict['ORDO_ID'] += [ORDO_ID]
        dt_dict['ORDO_term'] += [ORDO_term] 
        dt_dict['ORDO_syn'] += [ORDO_syn]
        dt_dict['ORDO_definition'] += [ORDO_def]
        dt_dict['ORDO_mappings'] += [ORDO_mappings]
        dt_dict['SNOMED_ID'] += [""]
        dt_dict['SNOMED_term'] += [""]
        
# Convert to dataframes
similar_terms = pd.DataFrame.from_dict(st_dict)
dissimilar_terms = pd.DataFrame.from_dict(dt_dict)

print("Number of ORDO terms matched to SNOMED by string matching: " + str(len(similar_terms)))
print("Number of ORDO terms not matched to SNOMED by string matching: " + str(len(dissimilar_terms)))

Number of ORDO terms matched to SNOMED by string matching: 1526
Number of ORDO terms not matched to SNOMED by string matching: 2333


In [67]:
ordo_id = []
concept = []
score = []
snomedid = []
snomedterm = []
for i in range(len(similar_terms.ORDO_ID.tolist())):
    ordo = similar_terms.ORDO_ID.tolist()[i]
    ordo = ordo.split('/')[-1].split('_')[1]
    ordo_id.append(ordo)
    score.append(1000)
    concept.append(similar_terms.ORDO_term.tolist()[i])
    snomedid.append(similar_terms.SNOMED_ID.tolist()[i])
    snomedterm.append(similar_terms.SNOMED_term.tolist()[i])
df_Mapped_ORDO_SNOMED_Direct = pd.DataFrame({'ORDO-ID': ordo_id, 
                                      'Concept':concept,
                                     'Mapped_Term': concept,
                                     'SNOMED_ID': snomedid,
                                     'SNOMED_Term': snomedterm,
                                     'Score': score})

In [68]:
df_Mapped_ORDO_SNOMED_Direct.to_csv("Results/Mapped_Results/Mapped_ORDO_SNOMED_Direct.txt",sep="\t")

dissimilar_terms.to_csv('ORDO_info/ORDO-SNOMED-not-matched.csv')

### B. Find the ORDO-SNOMED Map through UMLS

In [70]:
## For ORDO terms unmatched to SNOMED: Which and how many have UMLS mappings to SNOMED?
### NOTE: This will take a while to run since regex is slow, and each API request takes a few seconds ### 

UMLS_mapped = dissimilar_terms[dissimilar_terms['ORDO_mappings'].str.contains("UMLS", na = False) == True]
UMLS_unmapped = dissimilar_terms[dissimilar_terms['ORDO_mappings'].str.contains("UMLS", na = False) == False]

CUIpattern = re.compile("C\d{7}")

# UMLS API requires getting TGT every 8 hours: https://documentation.uts.nlm.nih.gov/rest/authentication.html
# Or just get every run (below)
headers = {"Content-Type": "application/x-www-form-urlencoded"}

user = input("Please enter your username: ")
pw = getpass.getpass("Please enter your password: ")
params = {"username" : user, 
         "password" : pw}

TGT_URL = "https://utslogin.nlm.nih.gov/cas/v1/tickets"

response = (requests.post(TGT_URL, headers = headers, params = params)).text
ticketgetter = BeautifulSoup(response)
TGT = ticketgetter.form['action']

# For service ticket request
headers_ST = {"Content-Type": "application/x-www-form-urlencoded"}
params = {"service": "http://umlsks.nlm.nih.gov"}

# Check CUIs for each mapping in UMLS_mapped
columns = ['ORDO_ID', 'ORDO_term', 'ORDO_syn', 'ORDO_definition', 'ORDO_mappings', 
          'SNOMED_ID', 'SNOMED_term']
mapped_dict = dict((key,[]) for key in columns)
unmapped_dict = dict((key,[]) for key in columns)
counter = 0
for i, mapping in enumerate(UMLS_mapped['ORDO_mappings']):
    CUI_list = re.findall(CUIpattern, str(mapping))
    CUI = CUI_list[0]
    # Request UMLS concept info from API         
    # Every request requires a service ticket (use TGT to get service ticket)
    ST = requests.post(TGT, headers = headers_ST, params = params)
    URL = "https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{}/atoms?sabs=SNOMEDCT_US&ticket={}".format(CUI, ST.text)
    response = requests.get(URL)    

    # Check if SNOMED-CT is mapped; get SNOMED ID
    if response.text[0] == "{":     # "{" indicates start of response with AUIs; "<" indicates page not found (no AUIs)
        atom_dict = json.loads(response.text)
        atom_list = atom_dict['result']
        # Atom list contains 2 SNOMED atoms, but both are same concept/code. Can use second one which is Fully-specified name
        atom = atom_list[0]
        SNOMED_term = atom['name']
        SNOMED_code = atom['code'].rsplit('/', 1)[1]
        # Add term info to mapped_dict
        mapped_dict['ORDO_ID'] += [UMLS_mapped.iloc[i]['ORDO_ID']]
        mapped_dict['ORDO_term'] += [UMLS_mapped.iloc[i]['ORDO_term']] 
        mapped_dict['ORDO_syn'] += [UMLS_mapped.iloc[i]['ORDO_syn']]
        mapped_dict['ORDO_definition'] += [UMLS_mapped.iloc[i]['ORDO_definition']]
        mapped_dict['ORDO_mappings'] += [UMLS_mapped.iloc[i]['ORDO_mappings']]
        mapped_dict['SNOMED_ID'] += [SNOMED_code]
        mapped_dict['SNOMED_term'] += [SNOMED_term]

    else:
        # Add term info to unmapped_dict
        unmapped_dict['ORDO_ID'] += [UMLS_mapped.iloc[i]['ORDO_ID']]
        unmapped_dict['ORDO_term'] += [UMLS_mapped.iloc[i]['ORDO_term']] 
        unmapped_dict['ORDO_syn'] += [UMLS_mapped.iloc[i]['ORDO_syn']]
        unmapped_dict['ORDO_definition'] += [UMLS_mapped.iloc[i]['ORDO_definition']]
        unmapped_dict['ORDO_mappings'] += [UMLS_mapped.iloc[i]['ORDO_mappings']]
        unmapped_dict['SNOMED_ID'] += [""]
        unmapped_dict['SNOMED_term'] += [""]
                
    counter += 1
    if counter % 100 == 0:
        runs_left = len(UMLS_mapped) - counter
        print("{} runs left".format(runs_left))


mapped = pd.DataFrame.from_dict(mapped_dict)
unmapped = pd.DataFrame.from_dict(unmapped_dict)
unmapped = pd.concat([UMLS_unmapped, unmapped])

print("Number of ORDO terms mapped to SNOMED through UMLS: " + str(len(mapped)))
print("Number of ORDO terms not mapped to UMLS or to SNOMED through UMLS: " + str(len(unmapped)))



Please enter your username: ilyia1997
Please enter your password: ········
1137 runs left
1037 runs left
937 runs left
837 runs left
737 runs left
637 runs left
537 runs left
437 runs left
337 runs left
237 runs left
137 runs left
37 runs left
Number of ORDO terms mapped to SNOMED through UMLS: 909
Number of ORDO terms not mapped to UMLS or to SNOMED through UMLS: 1424


In [71]:
ordo_id = []
concept = []
score = []
snomedid = []
snomedterm = []
for i in range(len(mapped.ORDO_ID.tolist())):
    ordo = mapped.ORDO_ID.tolist()[i]
    ordo = ordo.split('/')[-1].split('_')[1]
    ordo_id.append(ordo)
    score.append(1000)
    concept.append(mapped.ORDO_term.tolist()[i])
    snomedid.append(mapped.SNOMED_ID.tolist()[i])
    snomedterm.append(mapped.SNOMED_term.tolist()[i])
df_Mapped_ORDO_SNOMED_UMLS = pd.DataFrame({'ORDO-ID': ordo_id, 
                                      'Concept':concept,
                                     'Mapped_Term': concept,
                                     'SNOMED_ID': snomedid,
                                     'SNOMED_Term': snomedterm,
                                     'Score': score})
df_Mapped_ORDO_SNOMED_UMLS.to_csv("Results/Mapped_Results/Mapped_ORDO_SNOMED_UMLS.txt",sep="\t")



In [72]:
import sys
all_need_metamap = unmapped[['ORDO_ID','ORDO_term','ORDO_syn']]
ordo_id = []
for i in range(len(all_need_metamap.ORDO_ID.tolist())):
    ordo = all_need_metamap.ORDO_ID.tolist()[i]
    ordo = ordo.split('/')[-1].split('_')[1]
    ordo_id.append(ordo)
all_need_metamap = all_need_metamap.reset_index()
del all_need_metamap['index']
all_need_metamap['ORDO_ID'] = ordo_id
all_need_metamap = all_need_metamap.fillna(0)
all_need_metamap.to_csv("ORDO_info/all_need_metamap.txt",sep='\t')

## A. Transform the 1424 Concepts into utf8 Format

In [73]:
!java -jar replace_utf8.jar ORDO_info/all_need_metamap.txt > ORDO_info/all_need_metamap.txt.utf8

## B. Make 6 Dictionaries for Concepts, IDs and Synonyms

In [74]:
df_all_metamap = pd.read_csv('ORDO_info/all_need_metamap.txt.utf8', sep='\t')

#dict_id_concept is the dictionary with 
        #the key of id, and value of concept
dict_id_concept = dict(zip(df_all_metamap.ORDO_ID,df_all_metamap.ORDO_term))
#dict_concept_id is the dictionary with 
        #the key of concept, and value of id
dict_concept_id = dict(zip(df_all_metamap.ORDO_term,df_all_metamap.ORDO_ID))
#dict_concept_syn is the dictionary with
        #the key of concept, and the value of syn list
dict_concept_syn = {}
#dict_id_syn is the dictionary with
    #the key of id, and the value of syn list
dict_id_syn = {}
#dict_syn_concept is the dictionary with
        # the key of each syn terms, and the value of corresponding concept
dict_syn_concept = {}
#dict_syn_id is the dictionary with
        # the key of each syn terms, and the value of corresponding id
dict_syn_id = {}
#because some do not have syn, so only keep the one has syns in the dict
for i in range(len(list(df_all_metamap.ORDO_ID))):
    concept = list(df_all_metamap.ORDO_term)
    synonym = list(df_all_metamap.ORDO_syn)
    ordo_id = list(df_all_metamap.ORDO_ID)
    cpt = concept[i]
    syn = synonym[i].split('|')
    o_id = ordo_id[i]
    if cpt not in dict_concept_syn:
        if '0' not in syn:
            dict_concept_syn[cpt] = syn
    if o_id not in dict_id_syn:
        if '0' not in syn:
            dict_id_syn[o_id] = syn
    if '0' not in syn:
        for syn_term in syn:
            if syn_term not in dict_syn_concept:
                dict_syn_concept[syn_term] = cpt
            if syn_term not in dict_syn_id:
                dict_syn_id[syn_term] = o_id

In [75]:
print("Here is the number of concepts: "+ str(len(dict_concept_id)))
print("Here is the total number of the synonyms for these 1424 concepts: "+ str(len(dict_syn_concept)))

Here is the number of concepts: 1424
Here is the total number of the synonyms for these 1424 concepts: 2122


## C. Prepare the Relationship Between Snomed-terms

### (1) Get the snomed_term information form 'sct2_Relationship_Full' file

In [6]:
df_relationship = pd.read_csv('Vocabularies/SnomedCT_USEditionRF2_PRODUCTION_20190901T120000Z/Full/Terminology/sct2_Relationship_Full_US1000124_20190901.txt',sep='\t')
df_relationship = df_relationship[df_relationship.active != 0]
df_relationship_is = df_relationship.copy(deep = True)
df_relationship_is = df_relationship_is[df_relationship_is['typeId'] == 116680003]
df_relationship_is.shape

(1011286, 10)

### (2) Filter Out the Ordo_terms that Contains ICD information

In [7]:
!java -jar replace_utf8.jar ORDO_info/UMLS-unmapped.txt > ORDO_info/UMLS-unmapped.txt.utf8

In [8]:
def extract_icd_terms(file):
    everything = pd.read_csv(file, sep='\t')
    IDs = everything["ORDO_ID"].str.extract('_([0-9]+)')
    everything["ORDO_ID"] = IDs.values.astype(int)
    everything = everything[everything.ORDO_mappings.str.contains("ICD-10:")]
    ICD_list = []
    for items in list(everything.ORDO_mappings):
        items = items.split('|')
        multi = []
        for item in items:
            if item.startswith("ICD"):
                multi.append(item.split(":")[1])
        ICD_list.append(";".join(multi))
    everything.ORDO_mappings = ICD_list
    new_dataframe = everything[["ORDO_ID","ORDO_term", "ORDO_syn", "ORDO_mappings"]]
    new_dataframe.columns = ["OR-ID", "Concepts", "Synonyms", "ICD-ID"]
    new_dataframe.to_csv("ORDO_info/unmapped_ORDO_with_ICD.txt.utf8", sep = '\t')
    return new_dataframe

In [9]:
file_path = "ORDO_info/UMLS-unmapped.txt.utf8"
df_Ordo_ICD = extract_icd_terms(file_path)
concept_icd = dict(zip(df_Ordo_ICD.Concepts, df_Ordo_ICD['ICD-ID']))

### (3) Get the Relationship of Snomed Term's Concept, ID and Semantic Types

In [10]:
description_file = "Vocabularies/SnomedCT_USEditionRF2_PRODUCTION_20190901T120000Z/Full/Terminology/sct2_Description_Full-en_US1000124_20190901.txt"
df_description = pd.read_csv(description_file,sep='\t')
df_description = df_description[df_description.active != 0]
df_description = df_description[['conceptId','typeId','term']]
df_description = df_description[df_description.typeId == 900000000000003001]
df_description["Pharse"] = df_description.term.str.extract("(\([^\)]*\)$)")
df_description = df_description[df_description.Pharse.notnull()]
snomedid_pharse = dict(zip(df_description.conceptId,df_description.Pharse))
snomedid_pharse[10743271000119104]='(disorder)'
snomedid_term = dict(zip(df_description.conceptId,df_description.term))
snomedid_term[10743271000119104]='Immunoglobulin G4 related disease (disorder)'

In [11]:
snomedid_pharse[723363009] = '(disorder)'
snomedid_term[723363009] = 'Hypotrichosis, lymphedema, telangiectasia, renal defect syndrome (disorder)'

# II. MetaMap for String Match

### A. Metamap for the 1424 Concept, Only with the Parameter (WSD + SNOMED )

##### (1) Output the file that contains 1424 concepts for metamapping

In [12]:
with open("Results/concepts_wsd.txt.utf8",'w') as output_file:
    for key in list(df_all_metamap.ORDO_term):
        output_file.write(key+'\n')
output_file.close()

##### (2) Run the local MetaMap 
install information: https://metamap.nlm.nih.gov/Installation.shtml
###### ! ./Vocabularies/public_mm/bin/skrmedpostctl start
###### ! ./Vocabularies/public_mm/bin/wsdserverctl start
###### ! python2 Vocabularies/public_mm/call_metamap.py Results/concepts_wsd.txt.utf8 Results/concepts_wsd_out.txt.utf8 concept_metamap_result/json  Vocabularies/public_mm/bin/metamap18

In [11]:
! ./Vocabularies/public_mm/bin/skrmedpostctl start

Starting skrmedpostctl: 
started.


In [12]:
! ./Vocabularies/public_mm/bin/wsdserverctl start

Starting wsdserverctl: 
started.


In [37]:
#! python2 Vocabularies/public_mm/call_metamap.py Results/concepts_wsd.txt.utf8 Results/concepts_wsd_out.txt.utf8 Results/concepts_metamap_result/json Vocabularies/public_mm/bin/metamap18 '-AIyf -R SNOMEDCT_US -J bpoc,clas,clna,cgab,dsyn,fndg,ftcn,genf,mobd,neop,ortf,phsf,qlco,qnco,spco,tmco --JSONf 2 -V USAbase'

##### (3) Read the Metamap Result in json Format

In [17]:
## check if there is mapping in the metamap output
def mapping_exist(metamap):
    phrases = metamap["AllDocuments"][0]["Document"]["Utterances"][0]["Phrases"]
    has_mapping = False
    for phrase in phrases:
        if len(phrase["Mappings"]) != 0:
            has_mapping = True
    return has_mapping

## check if there is mapping for every phrase in the metamap output
def mapping_exist_for_all_phrases(metamap):
    phrases = metamap["AllDocuments"][0]["Document"]["Utterances"][0]["Phrases"]
    has_mapping = True
    for phrase in phrases:
        if len(phrase["Mappings"]) == 0:
            has_mapping = False
    return has_mapping

## check if a mapping has multiple candidates
def multiple_candidates(mapping):
    for candidate in mapping["MappingCandidates"]:
        if mapping["MappingScore"] != candidate["CandidateScore"]:
            #print("major problem!!!")
            #print(mapping)
            pass
    return (len(mapping["MappingCandidates"]) > 1)
## count match terms
def count_match(phrase):
    
    tokenizer = RegexpTokenizer(r'\w+')
    result = tokenizer.tokenize(phrase["PhraseText"])
    stop_words = set(stopwords.words('english'))
    new_phrase =[]
    
    for w in result:
        if w not in stop_words: new_phrase.append(w)

    num_of_words = len(new_phrase)
    
    num_of_matches = 0
    for mapping in phrase["Mappings"]:
        for candidate in mapping["MappingCandidates"]:
            num_of_matches += len(candidate["MatchedWords"])
    
    return (num_of_matches, num_of_words)

def read_metamap_result_json(file,num):
    all_concept_list = []
    all_CUI_list = []
    all_score_list = []
    all_term_list = []
    all_exist = 0
    exist = 0
    for i in range(0,num):
        file_to_read = file +'json%d.txt'%(i+1)
        with open(file_to_read) as file_handler:
            CUI_list = []
            score_list = []
            term_list = []
            file_handler.readline()
            metamap = json.load(file_handler)
            all_concept_list.append(metamap["AllDocuments"][0]["Document"]["Utterances"][0]["UttText"])
    
            if mapping_exist(metamap):
                exist += 1
            
            if mapping_exist_for_all_phrases(metamap):
                all_exist += 1
                phrases = metamap["AllDocuments"][0]["Document"]["Utterances"][0]["Phrases"]
                matches, words = 0, 0
                for phrase in phrases:
                    phrase_matches, phrase_words = count_match(phrase)
                    matches, words = matches + phrase_matches, words + phrase_words
                    
                    phrase_CUI_list = []
                    phrase_score_list = []
                    phrase_term_list = []
                    mappings = phrase["Mappings"]
                    fraxe = 0
                    for mapping in mappings:
                        candidates = mapping["MappingCandidates"]
                        for candidate in candidates:
                            if candidate["CandidateMatched"] == "FRAXE":
                                fraxe = 1
                            phrase_CUI_list.append(candidate["CandidateCUI"])
                            phrase_score_list.append(candidate["CandidateScore"])
                            phrase_term_list.append(candidate["CandidateMatched"])
                                            
                    CUI_list.append(phrase_CUI_list)
                    score_list.append(phrase_score_list)
                    term_list.append(phrase_term_list)
                    if fraxe == 1:
                        pass
                        #print(phrase_CUI_list)
                if matches/words < 0.5:
                    CUI_list = []
                    score_list = []
                    term_list = []
                
        all_CUI_list.append(CUI_list)
        all_score_list.append(score_list)
        all_term_list.append(term_list)
    return all_concept_list,all_CUI_list,all_score_list,all_term_list

In [18]:
file_path = 'Results/concepts_metamap_result/'
num_of_terms = len(os.listdir("Results/concepts_metamap_result/"))
all_concept_list,all_CUI_list,all_score_list,all_term_list = read_metamap_result_json(file_path,num_of_terms)
df_concept_metamap = pd.DataFrame({"Concepts": all_concept_list, "Matched_CUIs": all_CUI_list, "Matched_Scores": all_score_list, "Matched_Terms": all_term_list})
df_concept_metamap_1000 = df_concept_metamap[df_concept_metamap.Matched_Scores.map(lambda d: d == [['-1000']])]
print("Here is the number of concept that has 1000 score in Metamap: "+str(len(df_concept_metamap_1000.Concepts.tolist())))

Here is the number of concept that has 1000 score in Metamap: 262


##### (4) Make a dictionary of mapped concepts with corresponding cui;  Save the cuis to these concept to a file "cui_concept_list.txt"

In [19]:
mapped_cui = []
mapped_term = []
for cuis in df_concept_metamap_1000.Matched_CUIs.tolist():
    mapped_cui.append(cuis[0][0])
for terms in df_concept_metamap_1000.Matched_Terms.tolist():
    mapped_term.append(terms[0][0])
mapped_concept_cui = dict(zip(df_concept_metamap_1000.Concepts, mapped_cui))
mapped_concept_term = dict(zip(df_concept_metamap_1000.Concepts, mapped_term))
with open("Results/cui_concept_list.txt",'w') as output_file:
    for key in mapped_concept_cui:
        cui = mapped_concept_cui[key]
        output_file.write(key + '\t' + cui+'\n')
output_file.close()

### B. Metamap for the Synonyms of the rest 1156 Concept, Only with the Parameter (WSD + SNOMED )

##### (1) Output the file that only contains the synonyms of the rest 1156 concept  for metamapping

In [20]:
def get_remain_concept(df):
    remain_concept_synonyms = []
    for concept in df.ORDO_term.tolist():
        remain_concept_synonyms.append(concept)
        if concept in dict_concept_syn:
            synonyms = dict_concept_syn[concept]
            for synonym in synonyms:
                remain_concept_synonyms.append(synonym)
    return remain_concept_synonyms

In [21]:
def get_remain_synonyms(df):
    remain_concept_synonyms = []
    for concept in df.ORDO_term.tolist():
        if concept in dict_concept_syn:
            synonyms = dict_concept_syn[concept]
            for synonym in synonyms:
                remain_concept_synonyms.append(synonym)
    return remain_concept_synonyms

In [22]:
df_concept_metamap_fail = df_all_metamap[~df_all_metamap.ORDO_term.map(lambda d: d in df_concept_metamap_1000.Concepts.tolist() )]
synonyms_wsd = get_remain_synonyms(df_concept_metamap_fail)
print("Here is the total number of synonyms for metmap: "+ str(len(synonyms_wsd)))

with open("Results/synonyms_wsd.txt.utf8",'w') as output_file:
    for key in synonyms_wsd:
        output_file.write(key + '\n')
output_file.close()

Here is the total number of synonyms for metmap: 1662


##### (2) Run the local MetaMap 

###### ! python2 Vocabularies/public_mm/call_metamap.py Results/synonyms_wsd_out.txt.utf8 Results/synonyms_wsd.txt.utf8 Results/synonyms_metamap_result/json Vocabularies/public_mm/bin/metamap18 '-AIyf -R SNOMEDCT_US -J bpoc,clas,clna,cgab,dsyn,fndg,ftcn,genf,mobd,neop,ortf,phsf,qlco,qnco,spco,tmco --JSONf 2 -V USAbase'

###### argv[1] the file call metamap
###### argv[2] the file to run metamap
###### argv[3] the output overall file
###### argv[4] the where the outfiles of each term in the input file(argv[2]) stored, and named as json#
###### argv[5] the parameter for metamap



In [23]:
#! python2 Vocabularies/public_mm/call_metamap.py Results/synonyms_wsd.txt.utf8 Results/synonyms_wsd_out.txt.utf8 Results/synonyms_metamap_result/json Vocabularies/public_mm/bin/metamap18 '-AIyf -R SNOMEDCT_US -J bpoc,clas,clna,cgab,dsyn,fndg,ftcn,genf,mobd,neop,ortf,phsf,qlco,qnco,spco,tmco --JSONf 2 -V USAbase'

##### (3) Read the Metamap Result in json Format

In [24]:
file_path = 'Results/synonyms_metamap_result/'
num_of_terms = len(os.listdir('Results/synonyms_metamap_result'))
all_concept_list,all_CUI_list,all_score_list,all_term_list = read_metamap_result_json(file_path,num_of_terms)
df_synonyms_metamap = pd.DataFrame({"Concepts": all_concept_list, "Matched_CUIs": all_CUI_list, "Matched_Scores": all_score_list, "Matched_Terms": all_term_list})
df_synonyms_metamap_1000 = df_synonyms_metamap[df_synonyms_metamap.Matched_Scores.map(lambda d: d == [['-1000']])]
print("Here is the number of synonyms that has 1000 score in Metamap: "+str(len(df_synonyms_metamap_1000.Concepts.tolist())))

Here is the number of synonyms that has 1000 score in Metamap: 202


In [25]:
mapped_concept_by_syn = []
for syn in df_synonyms_metamap_1000.Concepts.tolist():
    if syn in dict_syn_concept:
        cpt = dict_syn_concept[syn]
        mapped_concept_by_syn.append(cpt)
    else:
        print("Error")
        print(syn)
mapped_concept_by_syn = list(set(mapped_concept_by_syn))
print("Here is the number of concepts that has 1000 score in Metamap with its synonyms: "+str(len(mapped_concept_by_syn)))

Here is the number of concepts that has 1000 score in Metamap with its synonyms: 138


##### (4) Make a dictionary of mapped concepts through synonyms with corresponding cui; Save the cuis to these concept to a file "cui_synonyms_list.txt"

In [26]:
mapped_cui = []
mapped_term = []
for cuis in df_synonyms_metamap_1000.Matched_CUIs.tolist():
    mapped_cui.append(cuis[0][0])
for terms in df_synonyms_metamap_1000.Matched_Terms.tolist():
    mapped_term.append(terms[0][0])
mapped_synonym_cui = dict(zip(df_synonyms_metamap_1000.Concepts, mapped_cui))
mapped_synonym_term = dict(zip(df_synonyms_metamap_1000.Concepts, mapped_term))
with open("Results/cui_synonym_list.txt",'w') as output_file:
    for key in mapped_synonym_cui:
        cui = mapped_synonym_cui[key]
        output_file.write(key + '\t' + cui+'\n')
output_file.close()

## C. Filter Out the MetMap Mapping without SNOMED_IDs

##### (1) Concat the Two cui_files

In [27]:
! cat Results/cui_concept_list.txt Results/cui_synonym_list.txt > Results/all_cui_metamap_list.txt

##### (2) Get the SNOMED_ID for each CUI

In [30]:
def UMLS_to_SNOMED(file):
    df = pd.read_csv(file, names=["Concept","UMLS_CUI"], sep='\t')
    # UMLS API requires getting TGT every 8 hours: https://documentation.uts.nlm.nih.gov/rest/authentication.html
    # Or just get every run (below)
    headers = {"Content-Type": "application/x-www-form-urlencoded"}

    user = input("Please enter your username: ")
    pw = getpass.getpass("Please enter your password: ")
    params = {"username" : user,
             "password" : pw}

    TGT_URL = "https://utslogin.nlm.nih.gov/cas/v1/tickets"

    response = (requests.post(TGT_URL, headers = headers, params = params)).text
    ticketgetter = BeautifulSoup(response, features="lxml")
    TGT = ticketgetter.form['action']

    # For service ticket request
    headers_ST = {"Content-Type": "application/x-www-form-urlencoded"}
    params = {"service": "http://umlsks.nlm.nih.gov"}

    # Check CUIs for each mapping in UMLS_mapped
    counter = 0
    Slist = []
    for i, CUI in enumerate(df['UMLS_CUI']):

        # Request UMLS concept info from API
        # Every request requires a service ticket (use TGT to get service ticket)
        ST = requests.post(TGT, headers = headers_ST, params = params)
        URL = "https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{}/atoms?sabs=SNOMEDCT_US&ticket={}".format(CUI, ST.text)
        response = requests.get(URL)

        # Check if SNOMED-CT is mapped; get SNOMED ID
        if response.text[0] == "{":     # "{" indicates start of response with AUIs; "<" indicates page not found (no AUIs)
            atom_dict = json.loads(response.text)
            atom_list = atom_dict['result']
            # Atom list contains 2 SNOMED atoms, but both are same concept/code. Can use second one which is Fully-specified name
            atom = atom_list[0]
            SNOMED_term = atom['name']
            SNOMED_code = atom['code'].rsplit('/', 1)[1]
            # Add term info to mapped_dict
            Slist.append(SNOMED_code)

        else:
            # Add term info to unmapped_dict
            Slist.append("")

        counter += 1
        if counter % 100 == 0:
            runs_left = len(df["UMLS_CUI"]) - counter
            print("{} runs left".format(runs_left))
    df["SNOMED"] = Slist
    return df
    

In [32]:
file_path = "Results/all_cui_metamap_list.txt"
df_cui_snomed_metamap =  UMLS_to_SNOMED(file_path)
df_cui_snomed_metamap.to_csv('Results/cui_snomed_metamap.txt', sep='\t')

Please enter your username: ilyia1997
Please enter your password: ········
364 runs left
264 runs left
164 runs left
64 runs left


##### (3) Read the File that has the SNOMED_ID Mapped to the CUIs

In [33]:
df_cui_snomed_metamap = pd.read_csv('Results/cui_snomed_metamap.txt',sep='\t')
del df_cui_snomed_metamap['Unnamed: 0']
df_cui_snomed_metamap[df_cui_snomed_metamap.SNOMED.isnull()]

Unnamed: 0,Concept,UMLS_CUI,SNOMED
173,Ameloblastic carcinoma,C1314678,
196,Loeffler endocarditis,C0206143,
259,Hereditary diffuse gastric cancer,C1708349,
320,Familial cerebral amyloid angiopathy,C0268393,
442,Familial Alzheimer disease,C0276496,
454,WDM,C0221054,


##### (4) Save the Mapped Concept to Mapped_Results

In [34]:
snomed_inactive = [801000000000,107691000000000,26261000000000,91521000000000]

df_cui_snomed_metamap = df_cui_snomed_metamap[df_cui_snomed_metamap.SNOMED.notnull()]
df_cui_snomed_metamap.SNOMED = df_cui_snomed_metamap.SNOMED.astype(int)

df_cui_snomed_metamap = df_cui_snomed_metamap[~df_cui_snomed_metamap.SNOMED.map(lambda d: d in snomed_inactive)]

cptsyn_with_snomed = dict(zip(df_cui_snomed_metamap.Concept, df_cui_snomed_metamap.SNOMED))
mapped_term = []
mapped_concept = []
mapped_cui = []
mapped_snomedid = []
mapped_snomedterm = []
mapped_score = []
mapped_ordoid = []
for i in range(len(df_cui_snomed_metamap.Concept.tolist())):
    mappedterm = df_cui_snomed_metamap.Concept.tolist()[i]
    if mappedterm in list(df_all_metamap.ORDO_term):
        cpt = mappedterm
    elif mappedterm in dict_syn_concept:
        cpt = dict_syn_concept[mappedterm]
    else:
        print('Error')
    ordoid = dict_concept_id[cpt]
    cui = df_cui_snomed_metamap.UMLS_CUI.tolist()[i]
    snomedid = df_cui_snomed_metamap.SNOMED.tolist()[i]
    snomedterm = snomedid_term[int(snomedid)]
    score = 1000
    mapped_term.append(mappedterm)
    mapped_concept.append(cpt)
    mapped_cui.append(cui)
    mapped_snomedid.append(snomedid)
    mapped_snomedterm.append(snomedterm)
    mapped_score.append(score)
    mapped_ordoid.append(ordoid)
df_metamap_1000_mapped = pd.DataFrame({'ORDO_ID': mapped_ordoid,
                                      'Concept': mapped_concept,
                                      'Mapped_Term': mapped_term,
                                      'SNOMED_ID': mapped_snomedid,
                                      'SNOMED_Term': mapped_snomedterm,
                                      'Score': mapped_score})
df_metamap_1000_mapped = df_metamap_1000_mapped.sort_values('ORDO_ID', ascending = False)
df_metamap_1000_mapped = df_metamap_1000_mapped.drop_duplicates(subset = 'Concept', keep = 'first')
df_metamap_1000_mapped.to_csv("Results/Mapped_Results//Mapped_Results_Metamap1000.txt",sep="\t")
print("Here is the total number of concepts that mapped by MetaMap Score of 1000: "+ str(len(df_metamap_1000_mapped.Concept.tolist())))

Here is the total number of concepts that mapped by MetaMap Score of 1000: 394


## D. For the Remaining 1029 Concepts, Fuzzy Mapping them and their Synonyms by ICD-SNOMED mapping

##### (1) Get the Concepts have ICD Mapping from the Remaining 1029 Concepts

In [38]:
concept_with_icd = df_Ordo_ICD.Concepts.tolist()
df_concept_metamap_fail = df_all_metamap[~df_all_metamap.ORDO_term.map(lambda d: d in df_metamap_1000_mapped.Concept.tolist() )]
df_fuzzymap = df_concept_metamap_fail[df_concept_metamap_fail.ORDO_term.map(lambda d: d in concept_with_icd)]
del df_fuzzymap['Unnamed: 0']
df_fuzzymap.head()
df_fuzzymap.columns = ['OR-ID','Concepts','Synonyms']
print("Here is the number of concepts from the remaining 1023 that has icd mapping: "+str(len(df_fuzzymap.Concepts.tolist())))

Here is the number of concepts from the remaining 1023 that has icd mapping: 742


##### (2) Keep the Concept with ICD-SNOMED Mapping

In [39]:
with open('ORDO_info/ICD_SNOMED_mappings.json','r') as f:
    dict_icd_snomed = json.load(f)

##### (3) Get the Fuzzy Mapping Terms 

In [40]:
ICD_list = []
for concept in df_fuzzymap['Concepts'].tolist():
    icd = concept_icd[concept]
    ICD_list.append(icd)
df_fuzzymap = df_fuzzymap.reset_index()
del df_fuzzymap['index']
df_fuzzymap['ICD-ID'] = ICD_list

df_ordo_snomed = df_fuzzymap.copy(deep=True)

ICD_SNOME_NOMAP = []
SNOMED_list=[]
count=0
ICD_list = df_ordo_snomed['ICD-ID'].tolist()
for icd_id in ICD_list:
    icd_ids = icd_id.split(';')
    snomeds=''
    for icd in icd_ids:
        if icd in dict_icd_snomed:
            snomed = dict_icd_snomed[icd]
            if snomeds != '':
                snomeds += ';'
            snomeds += ';'.join(snomed)
    
    SNOMED_list.append(snomeds)
df_ordo_snomed['SNOMED-ID'] = SNOMED_list
print(sum(df_ordo_snomed['SNOMED-ID']==''))
df_ordo_snomed_exist = df_ordo_snomed.copy(deep=True)
df_ordo_snomed_exist = df_ordo_snomed_exist[df_ordo_snomed_exist['SNOMED-ID'] != '']
df_ordo_snomed_notmap = df_ordo_snomed.copy(deep=True)
df_ordo_snomed_notmap = df_ordo_snomed_notmap[df_ordo_snomed_notmap['SNOMED-ID'] == '']

292


In [41]:
df_ordo_snomed_exist.shape

(450, 5)

##### (4) Fuzzy Mapping the 1705 terms (Concept + Synonyms)

In [42]:
description_file = "Vocabularies/SnomedCT_USEditionRF2_PRODUCTION_20190901T120000Z/Full/Terminology/sct2_Description_Full-en_US1000124_20190901.txt"
df_description_1 = pd.read_csv(description_file,sep='\t')
df_description_1 = df_description_1[df_description_1.active != 0]
df_description_1 = df_description_1[['conceptId','term']]

In [43]:
def catch_children(sid):
    sid_int = int(sid)
    source = df_relationship_is.loc[df_relationship_is['destinationId']==sid_int]['sourceId']
    return source.tolist()

def find_all_chil(snomed_ids):
    stack = LifoQueue(maxsize=9999)
    snomed_id_list = snomed_ids.split(";")
    for snomed_id in snomed_id_list:
        stack.put(snomed_id)
    everything = set()
    while not stack.empty():
        sid = stack.get()
        if sid not in everything:
            everything.add(sid)
        else:
            continue
        source = catch_children(sid)
        for item in source:
            stack.put(str(item))
    return ';'.join(everything)
def find_snomed_id(term):
    id_list = df_description_1.loc[df_description_1['term']==term,'conceptId'].tolist()
    if len(id_list) == 0:
        id_list.append(np.nan)
    return  list(set(id_list))
snomed_term_id_dic={}
def SnomedID_to_Terms(SnomedID):
    snomedIDs = SnomedID.split(";")
    terms=[]
    for snomedid in snomedIDs:
        snomedid_int = int(snomedid)
        term = df_description_1.loc[df_description_1['conceptId']==snomedid_int,'term'].tolist()
        for t in term:
            if t not in snomed_term_id_dic:
                key = []
                key.append(snomedid_int)
                snomed_term_id_dic[t] = key
                #print(t)
            else:
                #print("t in dic already")
                key = snomed_term_id_dic[t]
                #print(key)
                key.append(snomedid_int)
                key = list(set(key))
                snomed_term_id_dic[t]=key
            terms.append(t)
    return terms  

In [44]:
SNOMED_ALL=[]
for item in df_ordo_snomed_exist['SNOMED-ID'].tolist():
    SNOMED_ALL.append(find_all_chil(item))
df_ordo_snomed_exist['SNOMED_ALL'] = SNOMED_ALL



SNOMED_ALL_TERM=[]
for item in df_ordo_snomed_exist['SNOMED_ALL'].tolist():
    SNOMED_ALL_TERM.append(SnomedID_to_Terms(item))
df_ordo_snomed_exist['SNOMED_ALL_TERM'] = SNOMED_ALL_TERM

concepts_list = df_ordo_snomed_exist['Concepts'].tolist()
synonyms_list = df_ordo_snomed_exist['Synonyms'].tolist()
con_syn_list = []
for i in range(len(concepts_list)):
    concept = concepts_list[i]
    if synonyms_list[i] == '0':
        synonyms = ['']
    else:
        synonyms = synonyms_list[i].split('|')
    con_and_syn = []
    con_and_syn.append(concept)
    for j in range(len(synonyms)):
        con_and_syn.append(synonyms[j])
    con_syn_list.append(con_and_syn)
snomed_term_list= df_ordo_snomed_exist['SNOMED_ALL_TERM'].tolist()
Best_score=[]
Best_match=[]
Best_match_synorcon=[]
Best_matched_snomed_id=[]
for i in range(len(con_syn_list)):
    con_and_syn = con_syn_list[i]
    max_score = 0
    max_term = None
    con_syn_choice = None
    best_snm_id = None
    for concept in con_and_syn:
        options = snomed_term_list[i]
        for term in options:
            score = fuzz.token_sort_ratio(concept,term)
            if score >= max_score:
                max_score = score
                max_term = term
                con_syn_choice = concept
                        
    Best_score.append(max_score)
    Best_match.append(max_term)
    Best_match_synorcon.append(con_syn_choice)

df_ordo_snomed_exist['Best_match']=Best_match
df_ordo_snomed_exist['Best_score']=Best_score
df_ordo_snomed_exist['Best_match_of_choice_in_synorcon']=Best_match_synorcon
for matches in Best_match:
    if matches in snomed_term_id_dic:
        Best_matched_snomed_id.append(snomed_term_id_dic[matches])
    else:
        Best_matched_snomed_id.append(np.nan)
df_ordo_snomed_exist['Best_matched_snomed_id']=Best_matched_snomed_id
df_ordo_snomed_exist['Concepts_or_Syn'] = np.where(df_ordo_snomed_exist['Best_match_of_choice_in_synorcon']==df_ordo_snomed_exist['Concepts'], 'concepts', 'Synonyms')
df_ordo_snomed_exist.groupby('Concepts_or_Syn').count()

Unnamed: 0_level_0,OR-ID,Concepts,Synonyms,ICD-ID,SNOMED-ID,SNOMED_ALL,SNOMED_ALL_TERM,Best_match,Best_score,Best_match_of_choice_in_synorcon,Best_matched_snomed_id
Concepts_or_Syn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Synonyms,146,146,146,146,146,146,146,146,146,146,146
concepts,304,304,304,304,304,304,304,304,304,304,304


##### (5) Output the Concept that has Fuzzy score > 80

In [45]:
df_stringmatch_80 = df_ordo_snomed_exist[['OR-ID','Concepts','Best_match_of_choice_in_synorcon','Best_matched_snomed_id','Best_match','Best_score']]
df_stringmatch_80 = df_stringmatch_80[df_ordo_snomed_exist.Best_score.map(lambda d: d >=80)]
df_stringmatch_80.columns = ['ORDO-ID','Concept','Mapped_Term','SNOMED_ID','SNOMED_Term','Score']
snomed_id_list = []
for snomed in df_stringmatch_80.SNOMED_ID:
    snomed_id_list.append(snomed[0])
df_stringmatch_80['SNOMED_ID'] = snomed_id_list
df_stringmatch_80.to_csv("Results/Mapped_Results//Mapped_Results_FuzzyMap80.txt",sep="\t")
print("Here is the number of concept has fuzzy score over 80: " + str(len(snomed_id_list)))

Here is the number of concept has fuzzy score over 80: 277


# III MetaMap for Post-Coordinations 

## A. Get the remaining 750 Concepts and their Synonyms for MetaMap

In [46]:
df_concept_metamap_fail = df_all_metamap[~df_all_metamap.ORDO_term.map(lambda d: d in df_metamap_1000_mapped.Concept.tolist() )]
df_concept_stringmatch_fail = df_concept_metamap_fail[~df_concept_metamap_fail.ORDO_term.map(lambda d: d in df_stringmatch_80.Concept.tolist())]
remain_concept_synonyms = get_remain_concept(df_concept_stringmatch_fail)
print("Here is the total number of concepts and synonyms run for metamap: "+ str(len(remain_concept_synonyms)))
with open("Results/concept_n_synonyms_for_post.txt.utf8",'w') as output_file:
    for key in remain_concept_synonyms:
        output_file.write(key + '\n')
output_file.close()

Here is the total number of concepts and synonyms run for metamap: 1590


## B. Run the MetaMap

#! python2 Vocabularies/public_mm/call_metamap.py Results/concept_n_synonyms_for_post.txt.utf8 Results/concept_n_synonyms_for_post_out.txt.utf8 Results/concepts_and_synonyms_metamap_for_postCoordination_result/json Vocabularies/public_mm/bin/metamap18 '-AIiyf --threshold 500 -R SNOMEDCT_US -J bpoc,clas,clna,cgab,dsyn,fndg,ftcn,genf,mobd,neop,ortf,phsf,qlco,qnco,spco,tmco --JSONf 2 -V USAbase'

In [47]:
#! python2 Vocabularies/public_mm/call_metamap.py Results/concept_n_synonyms_for_post.txt.utf8 Results/concept_n_synonyms_for_post_out.txt.utf8 Results/concepts_and_synonyms_metamap_for_postCoordination_result/json Vocabularies/public_mm/bin/metamap18 '-AIiyf --threshold 500 -R SNOMEDCT_US -J bpoc,clas,clna,cgab,dsyn,fndg,ftcn,genf,mobd,neop,ortf,phsf,qlco,qnco,spco,tmco --JSONf 2 -V USAbase'

## C. Read the MetaMap Result, with score over 500

##### (1) Read the result of MetaMap and filter out the terms with Score Lower than 500

In [48]:
file_path = 'Results/concepts_and_synonyms_metamap_for_postCoordination_result/'
num_of_terms = len(os.listdir("Results/concepts_and_synonyms_metamap_for_postCoordination_result/"))-1
all_concept_list,all_CUI_list,all_score_list,all_term_list = read_metamap_result_json(file_path,num_of_terms)
df_metamap_post = pd.DataFrame({"Concepts": all_concept_list, "Matched_CUIs": all_CUI_list, "Matched_Scores": all_score_list, "Matched_Terms": all_term_list})
df_metamap_700 = df_metamap_post[df_metamap_post.Matched_CUIs.map(lambda d: d!=[])]

##### (2) Get the CUI list of all the terms

In [49]:
pre_post = df_metamap_700.copy(deep = True)
all_cui_pre_post = []
for i in range(len(list(pre_post.Concepts))):
    cpt = pre_post.Concepts.tolist()[i]
    cuis = pre_post.Matched_CUIs.tolist()[i]
    for phase in cuis:
        for ele in phase:
            all_cui_pre_post.append(ele)
    

all_cui_pre_post = list(set(all_cui_pre_post))
print("Here is the total number of cuis: "+ str(len(all_cui_pre_post)))

with open("Results/all_cui_prepost_list.txt",'w') as output_file:
    for key in all_cui_pre_post:
        output_file.write("term"+'\t'+key+'\n')
output_file.close()

Here is the total number of cuis: 1041


In [50]:
file_path = "Results/all_cui_prepost_list.txt"
df_cui_snomed_prepost =  UMLS_to_SNOMED(file_path)
df_cui_snomed_prepost.to_csv('Results/cui_snomed_prepost.txt', sep='\t')

Please enter your username: ilyia1997
Please enter your password: ········
941 runs left
841 runs left
741 runs left
641 runs left
541 runs left
441 runs left
341 runs left
241 runs left
141 runs left
41 runs left


In [51]:
df_cui_snomed_prepost = df_cui_snomed_prepost[df_cui_snomed_prepost.SNOMED.map(lambda d: d != '')]
cui_snomed = dict(zip(df_cui_snomed_prepost.UMLS_CUI,df_cui_snomed_prepost.SNOMED))

##### (3) Only Keep the terms that all cuis has SNOMED_ID

In [52]:
SNOMED_IDs_post = []
SNOMED_Type_post = []
SNOMED_Term_post = []
for i in range(len(list(pre_post.Concepts))):
    cpt = pre_post.Concepts.tolist()[i]
    cuis = pre_post.Matched_CUIs.tolist()[i]
    smds = []
    types = []
    terms = []
    for phase in cuis:
        smd = []
        typ = []
        term = []
        for ele in phase:
            if ele in cui_snomed:
                smd.append(int(cui_snomed[ele]))
                typ.append(snomedid_pharse[int(cui_snomed[ele])])
                term.append(snomedid_term[int(cui_snomed[ele])])
            else:
                smd = []
                typ = []
                term = []
        smds.append(smd)
        types.append(typ)
        terms.append(term)
    SNOMED_IDs_post.append(smds)
    SNOMED_Type_post.append(types)
    SNOMED_Term_post.append(terms)
pre_post = pre_post.reset_index()
del pre_post['index']
pre_post['SNOMED_ID'] = SNOMED_IDs_post
pre_post['SNOMED_Term'] = SNOMED_Term_post
pre_post['Semantic_Type'] = SNOMED_Type_post
pre_post_no_snomed = pre_post[pre_post.SNOMED_ID.map(lambda d: [] in d)]
pre_post = pre_post[pre_post.SNOMED_ID.map(lambda d: [] not in d)]
term_snomedid = dict(zip(pre_post.Concepts,pre_post.SNOMED_ID))
term_score = dict(zip(pre_post.Concepts,pre_post.Matched_Scores))
snomedid_type = dict(zip(pre_post.Concepts,pre_post.Semantic_Type))
term_term = dict(zip(pre_post.Concepts,pre_post.SNOMED_Term))
print("Here is the number of terms that has snomed mapping: "+ str(len(pre_post.SNOMED_ID.tolist())))

Here is the number of terms that has snomed mapping: 1163


In [None]:
pre_post_no_snomed.shape()

##### (4) Prepare the Data for Post-Coordination

In [53]:
term_post = pre_post.Concepts.tolist()
concept_for_post ={}
concept_type = {}
concept_choice = {}
term_choice = {}
concept_score = {}
for term in term_post:
    if term in dict_syn_concept:
        concept = dict_syn_concept[term]
    elif term in df_concept_stringmatch_fail.ORDO_term.tolist():
        concept = term
    else:
        print("Error")
    max_score = 0
    max_snomed = None
    choice_term = None
    sterm = None
    if concept not in concept_for_post:
        score = int(term_score[term][0][0][1:])
        snomed = term_snomedid[term]
        max_score = score
        max_snomed = snomed
        choice_term = term
        sterm = term_term[term]
        concept_for_post[concept] = max_snomed
        concept_type[concept] = snomedid_type[term]
        concept_choice[concept] = choice_term
        term_choice[concept] = sterm
        concept_score[concept] = max_score
    else:
        score = int(term_score[term][0][0][1:])
        snomed = term_snomedid[term]
        if score > max_score:
            max_score = score
            max_snomed = snomed
            choice_term = term
            sterm = term_term[term]
            concept_for_post[concept] = max_snomed
            concept_type[concept] = snomedid_type[term]
            concept_choice[concept] = choice_term
            term_choice[concept] = sterm
            concept_score[concept] = max_score
    
Choice = []
Type = []
Term_choice = []
STerm = []
Mscore = []
for concept in df_concept_stringmatch_fail.ORDO_term.tolist():
    if concept in concept_for_post:
        Choice.append(concept_for_post[concept])
        Type.append(concept_type[concept])
        Term_choice.append(concept_choice[concept])
        STerm.append(term_choice[concept])
        Mscore.append(concept_score[concept])
    else:
        Choice.append(np.nan)
        Type.append(np.nan)
        Term_choice.append(np.nan)
        STerm.append(np.nan)
        Mscore.append(np.nan)
        
post = pd.DataFrame({'Concepts':df_concept_stringmatch_fail.ORDO_term.tolist(),'Mapped_Score': Mscore,'Mapped_Terms':Term_choice , "SNOMED_Terms": STerm,"SNOMED_ID": Choice , 'Semantic_Type':Type})
post = post[post.SNOMED_ID.notnull()]
post.shape

(664, 6)

##### (5) Get the Post-coordination Terms only mapped to 'Disease (disorder)' and filter them out

In [54]:
post_remove = post[post.SNOMED_Terms.map(lambda d: d  == [['Disease (disorder)']])]
post = post[~post.SNOMED_Terms.map(lambda d: d  == [['Disease (disorder)']])]
post_remove.shape

(27, 6)

##### (6) Get the Post-coordination Terms that only mapped to one term and filter them out

In [55]:
domain_set={'(disorder)' ,'(morphologic abnormality)','(finding)','(observable entity)'}
post_single = post[post.SNOMED_Terms.map(lambda d: len(d) == 1 )]
post_single = post_single[post_single.SNOMED_Terms.map(lambda d: len(d[0]) == 1)]
post = post[post.SNOMED_Terms.map(lambda d: len(d) != 1 or len(d[0]) != 1)]
post_single.shape

(97, 6)

In [56]:
post_single_keep = post_single[post_single.Mapped_Score.map(lambda d: d > 900)]
post_single_keep = post_single_keep[post_single_keep.Semantic_Type.map(lambda d: d[0][0] in domain_set)]
post_single_keep.shape

(21, 6)

In [57]:
Ordo_id = []
for cpt in post_single_keep.Concepts.tolist():
    if cpt in dict_concept_id:
        Ordo_id.append(dict_concept_id[cpt])
    else:
        print("Error")

snomed = []
for snomedid in post_single_keep.SNOMED_ID.tolist():
    snomed.append(int(snomedid[0][0]))
post_single_keep.SNOMED_ID = snomed
snomedt = []
for snomedid in post_single_keep.SNOMED_Terms.tolist():
    snomedt.append(snomedid[0][0])
post_single_keep.SNOMED_Terms = snomedt
        
df_stringmatch_900 = pd.DataFrame({'ORDO-ID': Ordo_id, 
                                      'Concept':post_single_keep.Concepts.tolist(),
                                     'Mapped_Term': post_single_keep.Mapped_Terms.tolist(),
                                     'SNOMED_ID': post_single_keep.SNOMED_ID.tolist(),
                                     'SNOMED_Term': post_single_keep.SNOMED_Terms.tolist(),
                                     'Score': post_single_keep.Mapped_Score.tolist()})

In [58]:
df_stringmatch_900.to_csv('Results/Mapped_Results/Mapped_MetaMap900.txt',sep='\t')

##### (7) Do the Post-Corrdination on the remaining 67 terms

In [59]:
def find_qualifier(tp):
    idx = []
    for i in range(len(tp)):
        if tp[i] in qualifier_set:
            idx.append(i)
    return idx
def find_site(tp):
    idx = []
    for i in range(len(tp)):
        if tp[i] in findingsite_set:
            idx.append(i)
    return idx
def find_attribute(tp):
    idx = []
    for i in range(len(tp)):
        if tp[i] in attribute_set:
            idx.append(i)
    return idx
def find_domain(tp):
    idx = []
    for i in range(len(tp)):
        if tp[i] in domain_set:
            idx.append(i)
    return idx
def nearest_domain(tdx, sdx):
    st = []
    for t in tdx:
        d = [abs(s-t) for s in sdx]
        min_d = min(d)
        st.append(sdx[d.index(min_d)])
    return st
def concat(parts):
    return '+'.join(parts)
def find_post(cpt, sid, st, tp):
    if len(find_domain(tp)) == 0:
        return "Not Found"
    
    parts = []
    site_index = find_site(tp)
    qualifier_index = find_qualifier(tp)
    attribute_index = find_attribute(tp)
    domain_index = find_domain(tp)
    
    site_add_l = [""]*len(tp)
    if len(site_index) > 0:
        site_add = nearest_domain(site_index, domain_index)
        for i in range(len(site_add)):
            site_add_l[site_add[i]] += ":363698007|Finding Site|="+str(sid[site_index[i]]) + '|' + st[site_index[i]] + '|'
     
    qualifier_add_l = [""]*len(tp)
    if len(qualifier_index) > 0:
        qualifier_add = nearest_domain(qualifier_index, domain_index)
        for i in range(len(qualifier_add)):
            qualifier_add_l[qualifier_add[i]] += ":362981000|Qualifier Value|="+str(sid[qualifier_index[i]]) + '|' + st[qualifier_index[i]] + '|'
    
    attribute_add_l = [""]*len(tp)
    if len(attribute_index) > 0:
        attribute_add = nearest_domain(attribute_index, domain_index)
        for i in range(len(attribute_add)):
            attribute_add_l[attribute_add[i]] += ":246061005|Attribute|="+str(sid[attribute_index[i]]) + '|' + st[attribute_index[i]] + '|'
    
    all_domain_terms = []
    for i in range(len(domain_index)):
        domain_terms = str(sid[domain_index[i]]) + '|' + st[domain_index[i]] + '|'
        domain_terms += site_add_l[domain_index[i]]
        domain_terms += qualifier_add_l[domain_index[i]]
        domain_terms += attribute_add_l[domain_index[i]]
        all_domain_terms.append(domain_terms)
    return "+".join(all_domain_terms)

In [60]:
domain_set={'(disorder)' ,'(morphologic abnormality)','(finding)','(observable entity)'}
attribute_set = {'(attribute)': 246061005}
qualifier_set = {'(qualifier value)': 362981000}
findingsite_set = {'(body structure)': 363698007}

post_domain = post[post.Semantic_Type.map(lambda d: domain_set.intersection(set([t for s in d for t in s])) != None)]

post_domain.columns=['Concept','Score','Mapped_Terms','SNOMED_Terms','SNOMED_IDs','Semantic_Type']

all_type=[]
cpt_post={}
for i in range(len(post_domain.SNOMED_IDs.tolist())):
    cpt = post_domain.Concept.tolist()[i]
    snid = post_domain.SNOMED_IDs.tolist()[i]
    sterm = post_domain.SNOMED_Terms.tolist()[i]
    semtype = post_domain.Semantic_Type.tolist()[i]
    sid = []
    st = []
    tp = []
    for snids in snid:
        for ele in snids:
            sid.append(ele)
    for terms in sterm:
        for term in terms:
            st.append(term)
    for stypes in semtype:
        for ele in stypes:
            tp.append(ele)
            all_type.append(ele)
    if cpt not in cpt_post:
        cpt_post[cpt] = find_post(cpt, sid, st, tp)
    else:
        print('Error')

In [61]:
Post_Coordination = []
Ordo_id = []
for cpt in post_domain.Concept.tolist():
    if cpt in dict_concept_id:
        Ordo_id.append(dict_concept_id[cpt])
    else:
        print("Error")
    if cpt in cpt_post:
        Post_Coordination.append(cpt_post[cpt])
    else:
        print("Error")
df_post_coordination = pd.DataFrame({'ORDO-ID': Ordo_id, 
                                      'Concept':post_domain.Concept.tolist(),
                                     'Mapped_Term': post_domain.Mapped_Terms.tolist(),
                                     'SNOMED_ID': post_domain.SNOMED_IDs.tolist(),
                                     'SNOMED_Term': Post_Coordination,
                                     'Score': post_domain.Score.tolist()})
df_post_coordination = df_post_coordination[df_post_coordination.SNOMED_Term.map(lambda d: d != 'Not Found')]

In [62]:
df_post_coordination.to_csv('Results/Mapped_Results/Mapped_PostCoordiniation500.txt',sep='\t')

In [63]:
df_post_coordination.shape

(510, 6)

# Concatenation of all the output

In [76]:
df_post_coordination.head(1)

df_stringmatch_900.head(1)

df_stringmatch_80.head(1)

df_metamap_1000_mapped.head(1)

df_Mapped_ORDO_SNOMED_Direct.head(1)

df_Mapped_ORDO_SNOMED_UMLS.head(1)

frames = [df_Mapped_ORDO_SNOMED_Direct,df_Mapped_ORDO_SNOMED_UMLS,df_metamap_1000_mapped,df_stringmatch_80,df_stringmatch_900,df_post_coordination]
All_Mapped_ORDO_SNOMED = pd.concat(frames,sort=False)
del All_Mapped_ORDO_SNOMED['Score']
All_Mapped_ORDO_SNOMED = All_Mapped_ORDO_SNOMED.reset_index()
del All_Mapped_ORDO_SNOMED['index']
All_Mapped_ORDO_SNOMED.to_csv("Results/Mapped_Results/Mapped_ALL_ORDO_SNOMED.txt", sep="\t")