## RAG for Medical Question Answering

In [22]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, BertModel

from openai import OpenAI
import os
from dotenv import load_dotenv

In [19]:
from openai import OpenAI
import os
from dotenv import load_dotenv

In [2]:
path = '../0__Documents/Documents/'

In [3]:
# load BERT tokenizer (uncased)
transformer_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(transformer_name)

# load pretrained BERT model
model = BertModel.from_pretrained(transformer_name)

# assign device (cuda if possible)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# load to device
model = model.to(device)

In [4]:
def getBertCls(text):
  '''
  this function takes the following input:
  text to be represented by the BERT CLS token
  and gives the following output:
  a numpy array representing the text
  '''
  tok_text = tokenizer(text[:512],
                       return_tensors='pt').to(device)
  mod_output = model(**tok_text,
                     output_hidden_states=True)
  last_hidden_states = mod_output.hidden_states[-1]
  return last_hidden_states[:,0,:].cpu().detach().numpy()[0]

In [13]:
def calCosSim(emb1, emb2):
  '''
  return the cosine similarity of the
  2 input numpy array
  '''
  result = emb1 @ emb2.T
  result /= (np.linalg.norm(emb1)*np.linalg.norm(emb2))
  return result

## Sepsis

In [5]:
Q_Sepsis = "Among patients with septic shock and relative adrenal insufficiency, do corticosteroids reduce 28-day mortality?"
Q_Sepsis_emb = getBertCls(Q_Sepsis)

In [6]:
SepsisDocNames = ['doc_8.txt', 'doc_6.txt', 'doc_9.txt', 'doc_11.txt', 'doc_13.txt']
SepsisDocs = list()
for SDN in SepsisDocNames:
    with open(path+SDN, "r") as file:
        content = file.read()
    SepsisDocs.append(content)

In [None]:
df1_sepsis = pd.DataFrame(zip(SepsisDocNames[:1], SepsisDocs[:1]), columns=['Document Name', 'Document'])
df2_sepsis = pd.DataFrame(zip(SepsisDocNames[:2], SepsisDocs[:2]), columns=['Document Name', 'Document'])
df3_sepsis = pd.DataFrame(zip(SepsisDocNames[:3], SepsisDocs[:3]), columns=['Document Name', 'Document'])
df4_sepsis = pd.DataFrame(zip(SepsisDocNames[:4], SepsisDocs[:4]), columns=['Document Name', 'Document'])
df5_sepsis = pd.DataFrame(zip(SepsisDocNames[:5], SepsisDocs[:5]), columns=['Document Name', 'Document'])

Unnamed: 0,Document Name,Document
0,doc_8.txt,Abstract\nContext Septic shock may be associat...
1,doc_6.txt,Abstract\nBackground\nWhether hydrocortisone r...
2,doc_9.txt,Abstract\nBackground\nSeptic shock is characte...
3,doc_11.txt,Abstract\nBackground\nHydrocortisone is widely...
4,doc_13.txt,Abstract\nImportance Adjunctive hydrocortison...


In [14]:
df1_sepsis['DocEmb'] = df1_sepsis['Document'].apply(getBertCls)
df2_sepsis['DocEmb'] = df2_sepsis['Document'].apply(getBertCls)
df3_sepsis['DocEmb'] = df3_sepsis['Document'].apply(getBertCls)
df4_sepsis['DocEmb'] = df4_sepsis['Document'].apply(getBertCls)
df5_sepsis['DocEmb'] = df5_sepsis['Document'].apply(getBertCls)

In [15]:
df1_sepsis['Sim'] = df1_sepsis['DocEmb'].apply(lambda x: calCosSim(Q_Sepsis_emb, x))
df2_sepsis['Sim'] = df2_sepsis['DocEmb'].apply(lambda x: calCosSim(Q_Sepsis_emb, x))
df3_sepsis['Sim'] = df3_sepsis['DocEmb'].apply(lambda x: calCosSim(Q_Sepsis_emb, x))
df4_sepsis['Sim'] = df4_sepsis['DocEmb'].apply(lambda x: calCosSim(Q_Sepsis_emb, x))
df5_sepsis['Sim'] = df5_sepsis['DocEmb'].apply(lambda x: calCosSim(Q_Sepsis_emb, x))

In [17]:
Sepsis_dfs = [df1_sepsis, df2_sepsis, df3_sepsis, df4_sepsis, df5_sepsis]
Sepsis_Results = pd.DataFrame(columns=['NumberOfDocs', 'Document Name', 'Document', 'Sim'])

for df_s in Sepsis_dfs:
    df_s['NumberOfDocs'] = df_s.shape[0]
    df_s = df_s.sort_values(by='Sim', ascending=False).head(1)
    df_s = df_s[['NumberOfDocs', 'Document Name', 'Document', 'Sim']]
    Sepsis_Results = pd.concat([Sepsis_Results, df_s])

  Sepsis_Results = pd.concat([Sepsis_Results, df_s])


In [None]:
Sepsis_Results

Unnamed: 0,NumberOfDocs,Document Name,Document,Sim
0,1,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227
0,2,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227
0,3,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227
0,4,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227
4,5,doc_13.txt,Abstract\nImportance Adjunctive hydrocortison...,0.772474


In [23]:
# get key
load_dotenv()
client = OpenAI(
    api_key = os.environ.get("OPENAI_API_KEY"),
)

In [27]:
def GetAnswerSepsis(row):
    document = row['Document']
    prompt = Q_Sepsis + f"\n\nPlease answer the question based on the following information:: \n {document}"
    try:
        completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
        {"role": "system", "content": prompt}
        ]
        )
        return(completion.choices[0].message.content)
    except:
        return None

Sepsis_Results['Answer'] = Sepsis_Results.apply(GetAnswerSepsis, axis=1)

In [28]:
Sepsis_Results

Unnamed: 0,NumberOfDocs,Document Name,Document,Sim,Answer
0,1,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227,"Yes, corticosteroids do reduce 28-day mortalit..."
0,2,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227,"Yes, corticosteroids do reduce 28-day mortalit..."
0,3,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227,"Yes, corticosteroids do reduce 28-day mortalit..."
0,4,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227,"Yes, corticosteroids do reduce 28-day mortalit..."
4,5,doc_13.txt,Abstract\nImportance Adjunctive hydrocortison...,0.772474,The study referenced in the abstract does not ...


In [29]:
def GetRatingSepsis(ans):
    GroundTruth = "Among patients with septic shock and relative adrenal insufficiency, \
        administration of corticosteroids reduces 28-day mortality, although this finding \
        was not confirmed in the follow-up CORTICUS, HYPRESS, or ADRENAL trials, but confirmed in APROCCHSS."

    prompt = "Based on the reference information above, on a scale from 0 to 1, how well do you think generated answer\
             compare to the reference? Keep your answer concise." 
    
    try:
        completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
        {"role": "system", "content": GroundTruth + "\n\n" + prompt},
        {"role": "user", "content": ans}
        ]
        )
        return(completion.choices[0].message.content)
    except:
        return None
Sepsis_Results['GPT-4Rating'] = Sepsis_Results['Answer'].apply(GetRatingSepsis)

In [44]:
Sepsis_Results.to_csv('../3__Output/Sepsis_Results_RAG_11292024.csv', index = False)
Sepsis_Results

Unnamed: 0,NumberOfDocs,Document Name,Document,Sim,Answer,GPT-4Rating
0,1,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227,"Yes, corticosteroids do reduce 28-day mortalit...",0.7
0,2,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227,"Yes, corticosteroids do reduce 28-day mortalit...",0.5
0,3,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227,"Yes, corticosteroids do reduce 28-day mortalit...",0.8
0,4,doc_8.txt,Abstract\nContext Septic shock may be associat...,0.759227,"Yes, corticosteroids do reduce 28-day mortalit...",0.8
4,5,doc_13.txt,Abstract\nImportance Adjunctive hydrocortison...,0.772474,The study referenced in the abstract does not ...,0.5


## ARDS

In [31]:
Q_ARDS = "In patients with moderate to severe ARDS, does the early use of continuous neuromuscular blockade with cisatracurium improve mortality when used with current light sedation protocols?"
Q_ARDS_emb = getBertCls(Q_ARDS)

In [32]:
ARDSDocNames = ['doc_5.txt', 'doc_0.txt', 'doc_1.txt', 'doc_2.txt', 'doc_3.txt']
ARDSDocs = list()

for ADN in ARDSDocNames:
    with open(path+ADN, "r") as file:
        content = file.read()
    ARDSDocs.append(content)

In [33]:
df1_ARDS = pd.DataFrame(zip(ARDSDocNames[:1], ARDSDocs[:1]), columns=['Document Name', 'Document'])
df2_ARDS = pd.DataFrame(zip(ARDSDocNames[:2], ARDSDocs[:2]), columns=['Document Name', 'Document'])
df3_ARDS = pd.DataFrame(zip(ARDSDocNames[:3], ARDSDocs[:3]), columns=['Document Name', 'Document'])
df4_ARDS = pd.DataFrame(zip(ARDSDocNames[:4], ARDSDocs[:4]), columns=['Document Name', 'Document'])
df5_ARDS = pd.DataFrame(zip(ARDSDocNames[:5], ARDSDocs[:5]), columns=['Document Name', 'Document'])

In [34]:
df1_ARDS['DocEmb'] = df1_ARDS['Document'].apply(getBertCls)
df2_ARDS['DocEmb'] = df2_ARDS['Document'].apply(getBertCls)
df3_ARDS['DocEmb'] = df3_ARDS['Document'].apply(getBertCls)
df4_ARDS['DocEmb'] = df4_ARDS['Document'].apply(getBertCls)
df5_ARDS['DocEmb'] = df5_ARDS['Document'].apply(getBertCls)

In [35]:
df1_ARDS['Sim'] = df1_ARDS['DocEmb'].apply(lambda x: calCosSim(Q_ARDS_emb, x))
df2_ARDS['Sim'] = df2_ARDS['DocEmb'].apply(lambda x: calCosSim(Q_ARDS_emb, x))
df3_ARDS['Sim'] = df3_ARDS['DocEmb'].apply(lambda x: calCosSim(Q_ARDS_emb, x))
df4_ARDS['Sim'] = df4_ARDS['DocEmb'].apply(lambda x: calCosSim(Q_ARDS_emb, x))
df5_ARDS['Sim'] = df5_ARDS['DocEmb'].apply(lambda x: calCosSim(Q_ARDS_emb, x))

In [36]:
ARDS_dfs = [df1_ARDS, df2_ARDS, df3_ARDS, df4_ARDS, df5_ARDS]
ARDS_Results = pd.DataFrame(columns=['NumberOfDocs', 'Document Name', 'Document', 'Sim'])

for df_a in ARDS_dfs:
    df_a['NumberOfDocs'] = df_a.shape[0]
    df_a = df_a.sort_values(by='Sim', ascending=False).head(1)
    df_a = df_a[['NumberOfDocs', 'Document Name', 'Document', 'Sim']]
    ARDS_Results = pd.concat([ARDS_Results, df_a])

  ARDS_Results = pd.concat([ARDS_Results, df_a])


In [37]:
ARDS_Results

Unnamed: 0,NumberOfDocs,Document Name,Document,Sim
0,1,doc_5.txt,Abstract\nBackground\nThe benefits of early co...,0.823358
1,2,doc_0.txt,Abstract\nBackground\nIn patients undergoing m...,0.857619
1,3,doc_0.txt,Abstract\nBackground\nIn patients undergoing m...,0.857619
3,4,doc_2.txt,Background\nOptimal fluid management in patien...,0.861967
3,5,doc_2.txt,Background\nOptimal fluid management in patien...,0.861967


In [38]:
def GetAnswerARDS(row):
    document = row['Document']
    prompt = Q_ARDS + f"\n\nPlease answer the question based on the following information:: \n {document}"
    try:
        completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
        {"role": "system", "content": prompt}
        ]
        )
        return(completion.choices[0].message.content)
    except:
        return None

ARDS_Results['Answer'] = ARDS_Results.apply(GetAnswerARDS, axis=1)

In [39]:
ARDS_Results

Unnamed: 0,NumberOfDocs,Document Name,Document,Sim,Answer
0,1,doc_5.txt,Abstract\nBackground\nThe benefits of early co...,0.823358,The early use of continuous neuromuscular bloc...
1,2,doc_0.txt,Abstract\nBackground\nIn patients undergoing m...,0.857619,"Yes, the early use of continuous neuromuscular..."
1,3,doc_0.txt,Abstract\nBackground\nIn patients undergoing m...,0.857619,"Yes, the early use of continuous neuromuscular..."
3,4,doc_2.txt,Background\nOptimal fluid management in patien...,0.861967,The provided background information focuses on...
3,5,doc_2.txt,Background\nOptimal fluid management in patien...,0.861967,The provided information does not directly add...


In [40]:
def GetRatingARDS(ans):
    GroundTruth = "Not only does early neuromuscular blockade not reduce mortality when using modern \
                    light sedation protocols, it was associated with an increase in ICU-acquired weakness \
                    and serious adverse cardiovascular events."

    prompt = "Based on the reference information above, on a scale from 0 to 1, how well do you think generated answer\
             compare to the reference? Keep your answer concise." 
    
    try:
        completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
        {"role": "system", "content": GroundTruth + "\n\n" + prompt},
        {"role": "user", "content": ans}
        ]
        )
        return(completion.choices[0].message.content)
    except:
        return None
ARDS_Results['GPT-4Rating'] = ARDS_Results['Answer'].apply(GetRatingARDS)

In [43]:
ARDS_Results

Unnamed: 0,NumberOfDocs,Document Name,Document,Sim,Answer,GPT-4Rating
0,1,doc_5.txt,Abstract\nBackground\nThe benefits of early co...,0.823358,The early use of continuous neuromuscular bloc...,0.8
1,2,doc_0.txt,Abstract\nBackground\nIn patients undergoing m...,0.857619,"Yes, the early use of continuous neuromuscular...",0
1,3,doc_0.txt,Abstract\nBackground\nIn patients undergoing m...,0.857619,"Yes, the early use of continuous neuromuscular...",0. The generated answer contradicts the refere...
3,4,doc_2.txt,Background\nOptimal fluid management in patien...,0.861967,The provided background information focuses on...,0.
3,5,doc_2.txt,Background\nOptimal fluid management in patien...,0.861967,The provided information does not directly add...,0.


In [None]:
ARDS_Results.to_csv('../3__Output/ARDS_Results_RAG_11292024.csv', index = False)