In [8]:
from collections import defaultdict
from tqdm import trange
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification
from openai import OpenAI

import numpy as np
import json
import re

In [2]:
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu


In [3]:
path = '/data1/angela/MSLAB-Research/MIMC/0.1_10.log'
data = json.load(open(path,"r"))

In [6]:
index = 60
print(data['gt'][index])
print("-"*35+"[Ground Truth]"+"-"*35)
gt_dict = dict()
for ent in pipe(data['gt'][index]):
    gt_dict[ent['entity_group']] = ent['word']
    print(ent['word'], ",", ent['entity_group'])
print("") 
print(data['pred'][index])
print("-"*35+"[Prediction]"+"-"*35)
pred_dict = dict()
for ent in pipe(data['pred'][index]):
    pred_dict[ent['entity_group']] = ent['word']
    print(ent['word'], ",", ent['entity_group'])

66-year-old female with history of hypertension, nonobstructive cardiomyopathy with diastolic dysfunction, status post catheterization.
-----------------------------------[Ground Truth]-----------------------------------
66 - year - old , Age
female , Sex
hyper , History
nonobstructive cardiomyopathy , History
dia , History
status , History

66-year-old female with hypertension, nonobstructive cardiomyopathy with diastolic dysfunction, post catheterization.
-----------------------------------[Prediction]-----------------------------------
66 - year - old , Age
female , Sex
hyper , History
nonobstructive cardiomyopathy , History
dia , Detailed_description


In [4]:
def find_age(word):
    template = re.compile(r'[0-9]+')
    age = template.findall(word)
    if len(age) != 0:
        return age[0]
    else:
        return -1

def find_gender(word):
    if word == "man" or word == "gentleman" or word == "gentlemen" or word == "boy" or  word == "dad" or word == "male":
        return "male"
    if word == "woman" or word == "female" or word == "lady" or word == "mother" or  word == "girl":
        return "female"
    return "none"

def synonym(word1, word2):
    client = OpenAI()

    completion = client.chat.completions.create(
        model="gpt-3.5-turbo",
        temperature=0.8,
        messages=[
            {"role": "system", "content": "You are a useful assistant."},
            {"role": "user", "content": "If two words or sentences are same, please output yes. If two words or sentences have the similar meaning, please output yes."},
            {"role": "user", "content": word1},
            {"role": "user", "content": word2},
        ]
    )

    ans = completion.choices[0].message.content

    return ans

In [29]:
w1 = "##patic cirrhosis"
w2 = "##patic cirrhosis"
ans = synonym(w1, w2)
print(ans)
print(("Yes" in ans) or ("yes" in ans))

Yes.
True


In [None]:
hit = defaultdict(list)

for index in trange(len(data['gt'])):
    gt_dict = dict()
    for ent in pipe(data['gt'][index]):
        gt_dict[ent['entity_group']] = ent['word']

    pred_dict = dict()
    for ent in pipe(data['pred'][index]):
        pred_dict[ent['entity_group']] = ent['word']

    for k, v in gt_dict.items():
        if k == "Age" and k in pred_dict:
            gt_age = find_age(v)
            if gt_age == -1:
                continue
            if find_age(pred_dict[k]) == gt_age:
                hit[k].append(1)
            else:
                hit[k].append(0)
        elif k == "Sex" and k in pred_dict:
            gt_sex = find_gender(v)
            if gt_sex == "None":
                continue
            if find_gender(pred_dict[k]) == gt_sex:
                hit[k].append(1)
            else:
                hit[k].append(0)
        elif (k == "Biological_structure" or k == "Sign_symptom" or k == "Disease_disorder" or k == "History") and k in pred_dict:
            ans = synonym(v, pred_dict[k])
            if ("Yes" in ans) or ("yes" in ans):
                hit[k].append(1)
            else:
                hit[k].append(0)

In [29]:
for k in hit.keys():
    print(f"{k}, {np.mean(hit[k]):.4f}")

Age, 0.9492
Sex, 0.9731
History, 0.4271
Detailed_description, 0.4740
Clinical_event, 0.7196
Sign_symptom, 0.4950
Lab_value, 0.4270
Subject, 0.6184
Biological_structure, 0.7390
Disease_disorder, 0.5307
Medication, 0.5758
Therapeutic_procedure, 0.4491
Occupation, 0.3784
Personal_background, 0.6133
Nonbiological_location, 0.6385
Activity, 0.4701
Duration, 0.6891
Date, 0.5540
Frequency, 0.3182
Coreference, 0.3800
Distance, 0.5789
Diagnostic_procedure, 0.4473
Severity, 0.6729
Family_history, 0.4474
Time, 0.5714
Quantitative_concept, 0.3182
Qualitative_concept, 0.3333
Outcome, 0.0000
Color, 0.3333
Dosage, 0.1429
Volume, 0.5000
Area, 0.0000
Personal_[back](Biological_structure, 0.0000
Other_event, 0.4000
Administration, 1.0000
Texture, 1.0000
