In [355]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from concurrent.futures import ThreadPoolExecutor
import pickle

In [2]:
df = pd.read_csv('patients.csv')
df.dropna(inplace = True)

**BERT question answering**

In [181]:
# initialize model and tokenizer
bert_model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
bert_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

In [298]:
def bert_qa(question, text):
    '''
    Takes two strings: the first is the question string, the second is the text it searches for the answer. 
    Returns what it thinks the answer is.
    '''
    inputs = bert_tokenizer.encode_plus(question, text, add_special_tokens = True, return_tensors = 'pt')
    input_ids = inputs["input_ids"].tolist()[0]
    
    text_tokens = bert_tokenizer.convert_ids_to_tokens(input_ids)
    sep_index = text_tokens.index('[SEP]') + 1
    answer_start_scores, answer_end_scores = bert_model(**inputs)
    
    if torch.max(answer_start_scores) and torch.max(answer_end_scores) < 5:
        return(np.nan)
    
    answer_start = torch.argmax(answer_start_scores[0][sep_index:])
    answer_end = torch.argmax(answer_end_scores[0][sep_index:]) + 1
    answer_start += sep_index 
    answer_end += sep_index
    
    answer = bert_tokenizer.convert_tokens_to_string(bert_tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    
    return(answer)

**BioBERT question answering**

In [191]:
# initialize model and tokenizer
biobert_model = AutoModelForQuestionAnswering.from_pretrained("gdario/biobert_bioasq")
biobert_tokenizer = AutoTokenizer.from_pretrained("gdario/biobert_bioasq")

In [342]:
def biobert_qa(question, text):
    '''
    Takes two strings: the first is the question string, the second is the text it searches for the answer. 
    Returns what it thinks the answer is.
    '''
    inputs = biobert_tokenizer.encode_plus(question, text, add_special_tokens = True, return_tensors = 'pt')
    input_ids = inputs["input_ids"].tolist()[0]
    
    text_tokens = biobert_tokenizer.convert_ids_to_tokens(input_ids)
    sep_index = text_tokens.index('[SEP]') + 1
    answer_start_scores, answer_end_scores = biobert_model(**inputs)
    
    if torch.max(answer_start_scores) and torch.max(answer_end_scores) < 1:
        return(np.nan)
    
    answer_start = torch.argmax(answer_start_scores[0][sep_index:])
    answer_end = torch.argmax(answer_end_scores[0][sep_index:]) + 1
    answer_start += sep_index 
    answer_end += sep_index
    
    answer = biobert_tokenizer.convert_tokens_to_string(biobert_tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    
    return(answer)

In [201]:
bert_qa('Does the patient have any allergies?', 'The patient agreed to the surgery. And likely will update his presentation. No known allergies.')

('no known allergies', 5.076744556427002, 2.9227135181427)

**Looping through the data**

In [None]:
questions = {'covid': ['Does the patient have covid-19?', 'Does the patient have coronavirus?', 'Does the patient have corona?'],
            'age': ['How old is the patient?', 'What is the patient’s age?', 'How many yo is the patient?'],
            'sex': ['Is the patient male or female?', 'Is the patient a woman?', 'Is the patient a boy?', 'Is the patient m or f?'],
            'consent': ['Has the patient consented?', 'Has the patient agreed?'],
            'bmi': ['What is the patient’s bmi?', 'What is the patient’s body mass index?', 'Is the patient obese?'],
            'height': ['How tall is the patient?', 'What is the patient’s height?', 'How many ft is the patient?'],
            'weight': ['How much does the patient weigh?', 'How many lbs is the patient?', 'How many kg is the patient?', 'What is the patient’s weight?'],
            'allergy': ['Does the patient have any allergies?', 'Is the patient allergic?', 'Has the patient had any allergic reactions?'],
            'hiv': ['Does the patient have hiv?', 'Does the patient have human immunodeficiency virus?'],
            'hepatitis': ['Does the patient have hepatitis?', 'Does the patient have any problems with their liver?', 'Does the patient suffer from any hepatic diesease?'],
            'alt_ast': ['Does the patient have ALT/AST > 4 times ULN?', 'How does patient’s ALT/AST compare to upper normal limit?', 'How does patient’s ALT/AST compare to ULN?'],
            'opiods': ['Does the patient use opioids?', 'Is the patient using any sedatives?', 'Is the patient using any painkillers?']}

In [326]:
questions = {'consent': ['Has the patient consented?'],
            'allergy': ['Does the patient have any allergies?'],
            'hiv': ['Does the patient have HIV?'],
            'hepatitis': ['Does the patient have liver problems?'],
            'opiods': ['Does the patient use opioids?']}

In [327]:
bert_utility_tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
biobert_utility_tokenizer = AutoTokenizer.from_pretrained("gdario/biobert_bioasq")

In [328]:
results = pd.DataFrame(columns = ['patient_id', 'consent', 'allergy', 'hiv', 'hepatitis', 'opiods'])

**BERT**

In [337]:
for patient, note, i in zip(df['patient_id'], df['notes'], range(len(df))):
    print('Questioning patient', patient)
    # ============ For notes greater than 450 tokens long ================
    # if the note length is greater than the 450 tokens we'll split it and work on each split separately
    note_tokenized = bert_utility_tokenizer.tokenize(note)
    note_length = len(note_tokenized)
    if note_length > 450:
        # split the data into =<450-sized chunks
        split_n = (note_length // 450) + 1
        split_start = 0
        split_end = 450
        split_text = bert_utility_tokenizer.convert_tokens_to_string(note_tokenized[split_start:split_end])
        
        # initialize scorer
        scorer = {'patient_id':[patient], 'consent':[[]], 'allergy':[[]], 'hiv':[[]], 'hepatitis':[[]],
                 'opiods':[[]]}
        for criteria, index in zip(questions, range(len(questions))):
            if index == 0:
                continue
            for question in questions.get(criteria):
                mention = bert_qa(question, split_text)
                if not pd.isna(mention):
                    scorer[criteria][0].append(mention)   
        
#         for n in range(split_n):
#             # if we're in the last split
#             if n == (split_n - 1):
#                 split_text = bert_utility_tokenizer.convert_tokens_to_string(note_tokenized[note_length - (note_length % 450): ])
#                 for criteria, index in zip(questions, range(len(questions))):
#                     if index == 0:
#                         continue
#                     for question in questions.get(criteria):
#                         mention = bert_qa(question, split_text)
#                         if not pd.isna(mention):
#                             scorer[criteria][0].append(mention)
#             # otherwise, for each successive split
#             elif n != 0:
#                 split_start += 450
#                 split_end += 450
#                 split_text = bert_utility_tokenizer.convert_tokens_to_string(note_tokenized[split_start:split_end])
#                 for criteria, index in zip(questions, range(len(questions))):
#                     if index == 0:
#                         continue
#                     for question in questions.get(criteria):
#                         mention = bert_qa(question, split_text)
#                         if not pd.isna(mention):
#                             scorer[criteria][0].append(mention)
#             # for the first split
#             elif n == 0:
#                 for criteria, index in zip(questions, range(len(questions))):
#                     if index == 0:
#                         continue
#                     for question in questions.get(criteria):
#                         mention = bert_qa(question, split_text)
#                         if not pd.isna(mention):
#                             scorer[criteria][0].append(mention)
        # record results
        scorer_df = pd.DataFrame(scorer)
        results = results.append(scorer_df, ignore_index = True)
        print('Added results for patient', patient)
        if i % 25 == 0:
            print('checkpoint!')
            results.to_csv('checkpoint_results.csv')
    # =========== For notes less than 450 tokens long ===================
    elif note_length <= 450:
        # set up the scorer dict
        scorer = {'patient_id':[patient], 'consent':[[]], 'allergy':[[]], 'hiv':[[]], 'hepatitis':[[]],
                 'opiods':[[]]}
        for criteria, index in zip(questions, range(len(questions))):
            if index == 0:
                continue
            for question in questions.get(criteria):
                mention = bert_qa(question, note)
                if not pd.isna(mention):
                    scorer[criteria][0].append(mention)

        # record results in the dictionary
        scorer_df = pd.DataFrame(scorer)
        results = results.append(scorer_df, ignore_index = True)
        print('Added results for patient', patient)
        if i % 25 == 0:
            print('checkpoint!')
            results.to_csv('checkpoint_results.csv')

Questioning patient 1081.0
Added results for patient 1081.0
checkpoint!
Questioning patient 1080.0
Added results for patient 1080.0
Questioning patient 1079.0
Added results for patient 1079.0
Questioning patient 1078.0
Added results for patient 1078.0
Questioning patient 1077.0
Added results for patient 1077.0
Questioning patient 1076.0
Added results for patient 1076.0
Questioning patient 1075.0
Added results for patient 1075.0
Questioning patient 1074.0
Added results for patient 1074.0
Questioning patient 1073.0
Added results for patient 1073.0
Questioning patient 1072.0
Added results for patient 1072.0
Questioning patient 1071.0
Added results for patient 1071.0
Questioning patient 1070.0
Added results for patient 1070.0
Questioning patient 1069.0
Added results for patient 1069.0
Questioning patient 1068.0
Added results for patient 1068.0
Questioning patient 1067.0
Added results for patient 1067.0
Questioning patient 1066.0
Added results for patient 1066.0
Questioning patient 1065.0
A

In [349]:
results.rename(columns = {'patient_id':'id', 'consent':'c4_qa', 'allergy':'c8_qa', 'hiv':'c9_qa', 'hepatitis':'c10_qa', 'opiods':'c12_qa'},
              inplace = True)

In [350]:
results.to_csv('bert_qa_results.csv', index = False)

In [361]:
results

Unnamed: 0,id,c4_qa,c8_qa,c9_qa,c10_qa,c12_qa
0,1081.0,[],[],[],[],[]
1,1080.0,[],[clindamycin],[],[],[]
2,1079.0,[],[codeine / percocet / erythromycin base / aspi...,[],[],[]
3,1078.0,[],[codeine / percocet / erythromycin base / aspi...,[],[],[]
4,1077.0,[],[no known drug allergies],[],[],[]
...,...,...,...,...,...,...
1076,5.0,[],[],[],[],[]
1077,4.0,[],[no known drug allergies],[],[],[]
1078,3.0,[],[],[],[],[]
1079,2.0,[],[],[],[],[]


In [356]:
with open('bert_qa_results.pkl', 'wb') as f:
    pickle.dump(results, f)

In [357]:
with open('bert_qa_results.pkl', 'rb') as f:
    test = pickle.load(f)

In [360]:
test['c8_qa'][1]

['clindamycin']

In [351]:
test = pd.read_csv('bert_qa_results.csv')

In [354]:
test['c8_qa'][1]

"['clindamycin']"

**BMI stuff**

In [343]:
bmi_results = pd.DataFrame(columns = ['patient_id', 'bmi', 'height', 'weight'])
bmi_questions = questions = {
            'bmi': ['What is the patient’s body mass index?', 'Is the patient obese?'],
            'height': ['What is the patient’s height?'],
            'weight': ['What is the patient’s weight?']}

In [344]:
for patient, note, i in zip(df['patient_id'], df['notes'], range(len(df))):
    print('Questioning patient', patient)
    # ============ For notes greater than 450 tokens long ================
    # if the note length is greater than the 450 tokens we'll split it and work on each split separately
    note_tokenized = biobert_utility_tokenizer.tokenize(note)
    note_length = len(note_tokenized)
    if note_length > 450:
        # split the data into =<450-sized chunks
        split_n = (note_length // 450) + 1
        split_start = 0
        split_end = 450
        split_text = biobert_utility_tokenizer.convert_tokens_to_string(note_tokenized[split_start:split_end])
        
        # initialize scorer
        scorer = {'patient_id':[patient], 'bmi':[[]], 'height':[[]], 'weight':[[]]}  
        
        for n in range(split_n):
            # if we're in the last split
            if n == (split_n - 1):
                split_text = biobert_utility_tokenizer.convert_tokens_to_string(note_tokenized[note_length - (note_length % 450): ])
                for criteria, index in zip(bmi_questions, range(len(bmi_questions))):
                    if index == 0:
                        continue
                    for question in bmi_questions.get(criteria):
                        mention = biobert_qa(question, split_text)
                        if not pd.isna(mention):
                            scorer[criteria][0].append(mention)
            # otherwise, for each successive split
            elif n != 0:
                split_start += 450
                split_end += 450
                split_text = biobert_utility_tokenizer.convert_tokens_to_string(note_tokenized[split_start:split_end])
                for criteria, index in zip(bmi_questions, range(len(bmi_questions))):
                    if index == 0:
                        continue
                    for question in bmi_questions.get(criteria):
                        mention = biobert_qa(question, split_text)
                        if not pd.isna(mention):
                            scorer[criteria][0].append(mention)
            # for the first split
            elif n == 0:
                for criteria, index in zip(bmi_questions, range(len(bmi_questions))):
                    if index == 0:
                        continue
                    for question in bmi_questions.get(criteria):
                        mention = biobert_qa(question, split_text)
                        if not pd.isna(mention):
                            scorer[criteria][0].append(mention)
        # record results
        scorer_df = pd.DataFrame(scorer)
        bmi_results = bmi_results.append(scorer_df, ignore_index = True)
        print('Added results for patient', patient)
        if i % 25 == 0:
            print('checkpoint!')
            bmi_results.to_csv('bmi_checkpoint_results.csv')
    # =========== For notes less than 450 tokens long ===================
    elif note_length <= 450:
        # set up the scorer dict
        scorer = {'patient_id':[patient], 'bmi':[[]], 'height':[[]], 'weight':[[]]}  
        for criteria, index in zip(bmi_questions, range(len(bmi_questions))):
            if index == 0:
                continue
            for question in bmi_questions.get(criteria):
                mention = biobert_qa(question, note)
                if not pd.isna(mention):
                    scorer[criteria][0].append(mention)

        # record results in the dictionary
        scorer_df = pd.DataFrame(scorer)
        bmi_results = bmi_results.append(scorer_df, ignore_index = True)
        print('Added results for patient', patient)
        if i % 25 == 0:
            print('checkpoint!')
            bmi_results.to_csv('bmi_checkpoint_results.csv')

Questioning patient 1081.0
Added results for patient 1081.0
checkpoint!
Questioning patient 1080.0
Added results for patient 1080.0
Questioning patient 1079.0
Added results for patient 1079.0
Questioning patient 1078.0
Added results for patient 1078.0
Questioning patient 1077.0
Added results for patient 1077.0
Questioning patient 1076.0
Added results for patient 1076.0
Questioning patient 1075.0
Added results for patient 1075.0
Questioning patient 1074.0
Added results for patient 1074.0
Questioning patient 1073.0
Added results for patient 1073.0
Questioning patient 1072.0
Added results for patient 1072.0
Questioning patient 1071.0
Added results for patient 1071.0
Questioning patient 1070.0
Added results for patient 1070.0
Questioning patient 1069.0
Added results for patient 1069.0
Questioning patient 1068.0
Added results for patient 1068.0
Questioning patient 1067.0
Added results for patient 1067.0
Questioning patient 1066.0
Added results for patient 1066.0
Questioning patient 1065.0
A

KeyboardInterrupt: 

In [345]:
test_case = ['loss of height  5mm nodule',
 'loss of height suspicious for metastatic',
 'vertebral body height of l1 and',
 'loss of height minimal disc space',
 'vertebral body height  tspine plain',
 'maintain normal height multilevel degenerative endplate',
 'loss of height is similar in',
 'lv function height in 66 weight',
 '9884 nsr1409220 height 5 weight 244lbs',
 'left 13082 height 62 weight 4lb',
 'of disc height there is no',
 'examination her height is 5 feet',
 'physical exam height 6 feet 3',
 'mildly atrophic height 68 inches weight',
 'and a height of 5 feet',
 'on presentation height was 5 feet',
 'and her height was 5 2',
 'left 13795 height 64 weight 200',
 '90 pounds height is 61 inches',
 'on presentation height of 5 feet',
 'heart disease height in 69 weight',
 'rate 20 height 52 weight 141',
 'aortic aneurysm height in 60 weight',
 'patient is height was 5 feet',
 '11 osteopenia height loss of 3',
 'cabg procedure height in 58 weight',
 'on admission height is 5 feet',
 'left 15590 height 57 weight 150',
 'examination her height was 5 feet',
 'left 12884 height five feet eight',
 'on admission height 59 weight 104',
 '48 kg height 5 foot 3',
 '12476 left height 5 weight 259',
 '99 kilograms height 5 feet 10',
 'vital signs height 5 ft 7',
 'of vertebral height on',
 'pulse 65 height 5 weight 210',
 'aorta repair height in 75 weight',
 'and body height was preserved hospital',
 'intraop tee height in 61 weight',
 'for avrcabg height in 67 weight',
 'pressure 18285 height 5 feet 10',
 'appropriate carinal height on',
 'mid rca height in 67 weight',
 'presentation her height was 5 feet',
 'on presentation height was 6 feet',
 'of breath height in 67 weight',
 'ventricular function height in 67 weight',
 'sinus rhythm height 59 inch general',
 'physical examination height 6 feet weight']

In [347]:
for text in test_case:
    print(bert_qa('What is the height?', text))

5mm
nan
l1
nan
vertebral body height
nan
nan
nan
height 5
62
nan
5 feet
6 feet 3
68 inches
5 feet
5 feet
5 2
64
61 inches
5 feet
heart disease
52
aortic aneurysm
nan
nan
nan
5 feet
height 57
5 feet
five feet eight
59
5 foot 3
5
5 feet 10
5 ft 7
nan
5
aorta repair height in 75 weight
nan
nan
nan
5 feet 10
nan
nan
5 feet
6 feet
nan
nan
59 inch general
6 feet
