In [1]:
from quickumls import QuickUMLS
import pandas as pd
import tqdm


In [2]:
# Location of the prepared QuickUMLS data files and the similarity cut-off
# handed to the matcher, kept as named constants so they are easy to tune.
QUICKUMLS_DATA_PATH = "./umls-data/out"
SIMILARITY_THRESHOLD = 0.7

matcher = QuickUMLS(QUICKUMLS_DATA_PATH, threshold=SIMILARITY_THRESHOLD)

In [3]:
# Read the full table from disk, then show its row count as the cell output.
final = pd.read_csv("final_mimic.csv")
final.shape[0]

220429

In [4]:
# The source table repeats each note many times (it carries multiple
# prescriptions per patient within the same admission/stay). Narrow it to the
# subject id, admission id and note text, then collapse exact duplicates so
# each distinct (subject_id, hadm_id, text) combination is kept once.
note_columns = ["subject_id", "hadm_id", "text"]
final = final.loc[:, note_columns].drop_duplicates()
final.shape[0]

14953

In [5]:
#get all notes
all_notes = []
for i in tqdm.tqdm(range(len(final))):
    temp_dict = {}
    temp_dict["subject_id"] = note = final.iloc[i]["subject_id"]
    temp_dict["hadm_id"] = note = final.iloc[i]["hadm_id"]

    text = final.iloc[i]["text"]
    temp_dict["text"] = text

    temp_dict["matched_results"] = matcher.match(text, best_match=True, ignore_syntax=False)

    all_notes.append(temp_dict)
    
len(all_notes)

100%|██████████| 14953/14953 [7:34:51<00:00,  1.83s/it]      


14953

In [7]:
# Display the first record to check the structure of the stored matcher output.
# NOTE(review): the saved output below shows no 'text' key although the loop that
# builds all_notes stores one, and the execution counter jumps from In [5] to
# In [7]; the output may predate the current loop code — re-run to confirm.
all_notes[0]

{'subject_id': np.int64(10002013),
 'hadm_id': np.int64(21763296),
 'matched_results': [[{'start': 11718,
    'end': 11741,
    'ngram': 'oxycodone-acetaminophen',
    'term': 'oxycodone-acetaminophen',
    'cui': 'C0717368',
    'similarity': 1.0,
    'semtypes': {'T121'},
    'preferred': 1}],
  [{'start': 12490,
    'end': 12512,
    'ngram': 'Level of Consciousness',
    'term': 'Level of Consciousness',
    'cui': 'C0234425',
    'similarity': 1.0,
    'semtypes': {'T033'},
    'preferred': 1},
   {'start': 12490,
    'end': 12512,
    'ngram': 'Level of Consciousness',
    'term': 'MMSE - Level of Consciousness',
    'cui': 'C4526510',
    'similarity': 0.7407407407407407,
    'semtypes': {'T170'},
    'preferred': 1},
   {'start': 12490,
    'end': 12512,
    'ngram': 'Level of Consciousness',
    'term': 'Level of consciousness',
    'cui': 'C4050479',
    'similarity': 0.7391304347826086,
    'semtypes': {'T201'},
    'preferred': 0},
   {'start': 12490,
    'end': 12512,
    

In [None]:
import pickle

# Persist the annotated notes so the multi-hour matching pass above does not
# have to be repeated to get them back.
output_path = "all_notes_with_quickumls_results.pkl"
with open(output_path, "wb") as pkl_out:
    pickle.dump(all_notes, pkl_out)

## Example: QuickUMLS matches on sample sentences

In [None]:
# A handful of short clinical-style sentences used to eyeball what the matcher
# returns on small inputs.
example_sentences = [
    "He had a CXR that demonstrated possible left base consolidation",
    "She did not have fevers or chills until the day prior to admission when she noted chills",
    "At least moderate pulmonary hypertension",
    "He was then brsought to the [**Hospital118**] ED for further management",
    "Metastatic osteogenic sarcoma",
]

for text in example_sentences:
    # Sentence first, followed by blank lines, then the raw match structure.
    print(text, "\n", sep="\n")
    matches = matcher.match(text, best_match=True, ignore_syntax=False)
    print(matches, "\n\n", sep="\n")


He had a CXR that demonstrated possible left base consolidation


[[{'start': 31, 'end': 39, 'ngram': 'possible', 'term': 'possible', 'cui': 'C0332149', 'similarity': 1.0, 'semtypes': {'T033'}, 'preferred': 1}], [{'start': 9, 'end': 12, 'ngram': 'CXR', 'term': 'cxr', 'cui': 'C0039985', 'similarity': 1.0, 'semtypes': {'T060'}, 'preferred': 1}], [{'start': 50, 'end': 63, 'ngram': 'consolidation', 'term': 'Consolidation', 'cui': 'C0521530', 'similarity': 0.8333333333333334, 'semtypes': {'T047'}, 'preferred': 1}], [{'start': 18, 'end': 30, 'ngram': 'demonstrated', 'term': 'Not demonstrated', 'cui': 'C4697740', 'similarity': 0.7142857142857143, 'semtypes': {'T033'}, 'preferred': 1}]]



She did not have fevers or chills until the day prior to admission when she noted chills


[[{'start': 57, 'end': 66, 'ngram': 'admission', 'term': 'admission', 'cui': 'C0184666', 'similarity': 1.0, 'semtypes': {'T058'}, 'preferred': 1}, {'start': 57, 'end': 66, 'ngram': 'admission', 'term': 'admissions', 'c