In [1]:
import os
import openai
import langchain
from langchain.prompts import ChatPromptTemplate
from langchain.vectorstores import FAISS
from langchain.document_loaders import CSVLoader
from langchain.embeddings.base import Embeddings
import pandas as pd
import tiktoken
import json
import warnings
warnings.filterwarnings("ignore")

# The key is read from the environment so it never lands in the notebook itself;
# a missing variable raises KeyError right here instead of on the first request.
openai_api_key = os.environ['OPENAI_API_KEY']

# Single shared client: every chat completion and embedding call below goes
# through the endpoint configured in base_url.
client = openai.OpenAI(base_url="https://cmu.litellm.ai", api_key=openai_api_key)

In [2]:
class CustomOpenAIEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter that calls an OpenAI-style client directly.

    Each text is embedded with its own ``client.embeddings.create`` request and
    the first returned vector is used.

    Args:
        client: Object exposing ``embeddings.create(input=..., model=...)``.
        model: Embedding model name sent with every request. Query vectors are
            only comparable to stored vectors produced by the same model, so
            presumably this must match whatever built the saved indexes --
            TODO confirm before changing the default.
        progress_every: Print the running index every this many documents in
            ``embed_documents``; a falsy value disables progress output.
    """

    def __init__(self, client, model="text-embedding-3-small", progress_every=500):
        self.client = client
        self.model = model
        self.progress_every = progress_every

    def _embed(self, text):
        """Embed one string and return its vector (shared by both public methods)."""
        response = self.client.embeddings.create(input=text, model=self.model)
        return response.data[0].embedding

    def embed_documents(self, texts):
        """Embed ``texts`` one request at a time, returning vectors in input order."""
        embeddings = []
        for index, text in enumerate(texts):
            # Lightweight progress indicator for long indexing runs.
            if self.progress_every and index % self.progress_every == 0:
                print(index)
            embeddings.append(self._embed(text))
        return embeddings

    def embed_query(self, text):
        """Embed a single query string."""
        return self._embed(text)


embedding_model = CustomOpenAIEmbeddings(client)

In [3]:
# NOTE(review): allow_dangerous_deserialization=True opts in to loading serialized
# (pickle-based) index data; only point this at index folders from a trusted source.
# The three stores are loaded in the same order as before: CPT, ICD, NDC.
cpt_vector_store, icd_vector_store, ndc_vector_store = (
    FAISS.load_local(index_dir, embedding_model, allow_dangerous_deserialization=True)
    for index_dir in ("CPT_index", "ICD_index", "NDC_index")
)

In [4]:
def _extract_json_object(text):
    """Return the outermost ``{...}`` span of ``text``.

    Chat models often wrap JSON in a Markdown code fence or add a sentence
    around it. Slicing from the first ``{`` to the last ``}`` removes any such
    wrapper; ``str.strip('```json')`` is not suitable because ``strip`` treats
    its argument as a set of characters, not as a prefix. When no braces are
    found the stripped text is returned so ``json.loads`` reports the failure.
    """
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        return text.strip()
    return text[start:end + 1]


def _parse_code_list(reply):
    """Turn a comma-separated model reply into a list of codes.

    A reply containing 'N/A' (the sentinel requested in the prompt) yields an
    empty list; blank entries such as those from a trailing comma are dropped.
    """
    if 'N/A' in reply:
        return []
    return [code.strip() for code in reply.split(",") if code.strip()]


def _medication_label(drug_entry):
    """Build the "Name, Dosage" lookup label for one medication entry.

    Missing or empty fields are left out instead of raising, so an entry with
    only a name still gets looked up. Non-dict entries are used as plain text.
    """
    if not isinstance(drug_entry, dict):
        return str(drug_entry)
    fields = (drug_entry.get("Name"), drug_entry.get("Dosage"))
    return ", ".join(str(value) for value in fields if value)


def conversation_to_ehr(conversation, client, embedding_model, cpt_vector_store, icd_vector_store, ndc_vector_store):
    """Convert a doctor-patient conversation into a SOAP note with billing codes.

    The conversation is first summarised into a JSON SOAP note by the chat
    model. Each planned test, each diagnosis and each medication is then looked
    up in the matching vector store, and the chat model is asked to pick codes
    from the retrieved documents only.

    Args:
        conversation: Transcript text inserted into the SOAP prompt.
        client: OpenAI-style client exposing ``chat.completions.create``.
        embedding_model: Not used by this function; kept so existing callers
            keep working.
        cpt_vector_store: Store queried for test (CPT) codes.
        icd_vector_store: Store queried for diagnosis (ICD) codes.
        ndc_vector_store: Store queried for medication (NDC) codes.

    Returns:
        Tuple ``(soap_note, logs, total_prompt_tokens, total_completion_tokens,
        total_tokens)``. ``soap_note`` is the parsed note with an added
        ``'Codes'`` dict holding ``'Tests'``, ``'Medications'`` and
        ``'Diagnosis'`` mappings from item text to a list of code strings (empty
        when the model answered 'N/A'). ``logs`` is a text trace of every RAG
        prompt and token count, always ending with the grand total.

    Raises:
        json.JSONDecodeError: If the model's SOAP reply contains no valid JSON.
    """

    soap_template = ChatPromptTemplate.from_template("""Write as a professional medical scribe, ensuring medical accuracy, clarity, and brevity. Go through the following doctor-patient conversation and create a SOAP note for it. A SOAP note consists of Subjective, Objective, Assessment and Plan sections. Just include these 4 sections and nothing else in the note. For each subfield in each of the four sections return a list of items in decreasing order of importance. If you do not have information for a particular field return an empty list. \
    1. ‘Subjective’ section includes items taken during the patient's verbal exam. Include 'Chief complaint', 'History of present illness', and 'Past social history' as subfields. \
    2. ’Objective’ section includes findings from the physical examinations and diagnostics taken prior to the visit, including laboratory or imaging results, broken down by exam type. It should have the following subfields: ’Vital signs’, ’Physical exam findings’, ’Laboratory data’, ’Imaging results’, and ’Other diagnostic data’. If a specific exam type is not mentioned, return an empty list for that subfield. \
    3. ’Assessment’ includes the doctor’s diagnosis as a list in the subfield ’Diagnosis’ in decreasing order of importance. \
    4. ’Plan’ section contains planned ’Tests’, ’Referrals’, ’Medications’ along with ’Instructions’ as separate subfields. ’Medications’ should contain a list of prescribed medications with a dictionary for each containing medication ’Name’, ’Dosage’, ’Route’, and ’Frequency’. If no medication is mentioned, the 'Medications' should return an empty list. \
    If you do not have data for a particular section or a sub-section, return an empty list for that particular subfield. Ensure that the medical terminology used in the conversation is accurately reflected. SOAP note should be concise, and avoid adding details not explicitly mentioned in the conversation. Format the output as JSON with the keys: ’Subjective’, ’Objective’, ’Assessment’ and ’Plan’. For each of these sections create dictionaries within for the different subfields. \
    \
    ### \
    Conversation: {conversation}""")

    # One template serves all three code systems; only the query and the code
    # system name differ between the Tests, Diagnosis and Medications lookups.
    rag_template = ChatPromptTemplate.from_template("""{query} Instruction: Only use the following returned documents to get the {code_system} codes. Ensure matching is case-insensitive. If there are multiple codes possible, return all of them. If no useful codes are found, just return 'N/A'. Output the codes as a comma-separated list in order of confidence without spaces. Here are the relevant documents: {joined_docs}""")

    log_parts = []
    token_totals = [0, 0, 0]  # prompt, completion, total

    def ask(prompt, usage_label):
        """Send one user message, record its token usage, and return the reply text."""
        response = client.chat.completions.create(
            model="gpt-4o-mini", temperature=0, seed=42,
            messages=[{"role": "user", "content": prompt}])
        counts = [
            int(response.usage.prompt_tokens),
            int(response.usage.completion_tokens),
            int(response.usage.total_tokens),
        ]
        for slot, count in enumerate(counts):
            token_totals[slot] += count
        log_parts.append("Tokens used for " + usage_label + ": " + ", ".join(map(str, counts)) + "\n\n")
        return response.choices[0].message.content or ""

    def lookup_section(banner, terms, vector_store, query_prefix, code_system, usage_label, found):
        """Fill ``found`` with ``term -> codes`` for every term in one section."""
        log_parts.append(banner)
        if not terms:
            return
        # The retriever does not depend on the term, so build it once per section.
        retriever = vector_store.as_retriever()
        for term in terms:
            query = query_prefix + term
            retrieved_docs = retriever.get_relevant_documents(query)
            joined_docs = "".join(" ### " + doc.page_content for doc in retrieved_docs)
            rag_prompt = rag_template.format(query=query, code_system=code_system, joined_docs=joined_docs)
            log_parts.append(rag_prompt + "\n\n")
            found[term] = _parse_code_list(ask(rag_prompt, usage_label))

    soap_reply = ask(soap_template.format(conversation=conversation), "SOAP generation")
    soap_note = json.loads(_extract_json_object(soap_reply))
    soap_note['Codes'] = {'Tests': {}, 'Medications': {}, 'Diagnosis': {}}

    # The model may omit a section or return null for it; treat both as "no items".
    plan = soap_note.get('Plan') or {}
    assessment = soap_note.get('Assessment') or {}

    lookup_section(
        " ############################ RAG for Tests ################################\n\n",
        plan.get('Tests') or [],
        cpt_vector_store,
        "Find the CPT codes corresponding to the following test: ",
        "CPT",
        "Tests",
        soap_note['Codes']['Tests'],
    )

    lookup_section(
        " ############################ RAG for Diagnosis ################################\n\n",
        assessment.get('Diagnosis') or [],
        icd_vector_store,
        "Find the ICD-10 code corresponding to the following diagnosis: ",
        "ICD",
        "Diagnosis",
        soap_note['Codes']['Diagnosis'],
    )

    medication_labels = [
        label
        for label in (_medication_label(entry) for entry in (plan.get('Medications') or []))
        if label
    ]
    lookup_section(
        " ############################ RAG for Medications ################################\n\n",
        medication_labels,
        ndc_vector_store,
        "Find the NDC codes corresponding to the following drug: ",
        "NDC",
        "Drugs",
        soap_note['Codes']['Medications'],
    )

    # Written exactly once, after all sections, regardless of how many
    # medications were present.
    log_parts.append(" ############################ Finished ################################\n\n")
    log_parts.append("Total Tokens used: " + ", ".join(map(str, token_totals)))

    total_prompt_tokens, total_completion_tokens, total_tokens = token_totals
    return soap_note, "".join(log_parts), total_prompt_tokens, total_completion_tokens, total_tokens

In [41]:
conversation = """[doctor] hey george how are you today i understand you're here for some numbness and tingling in your fingers and some pain in your wrist [patient] right my left wrist and hand has been bothering me probably for a few months now with pain and numbness [doctor] okay and you said that's been ongoing for several months do you know what caused this type of pain or is it just something that started slowly or [patient] it just kinda started on it's own it i notice it mostly at night [doctor] okay [patient] sometimes it will i'll wake up and my hands asleep and i got ta shake it out [doctor] shake it out and okay [patient] and then some [doctor] what kind of work do you do [patient] i do yard work [doctor] yard work [patient] landscaping landscaping [doctor] landscaping okay so a lot of raking a lot of digging so a lot of repetitive type movements [patient] yeah it's pretty heavy labor but it's yeah the same thing day in and day out [doctor] okay okay just a couple questions for you you did say that you have the pain at night in that and you have to you get that numbness into the hand is it in all the fingers [patient] yeah it seems to happen to all my fingers but i notice it more in my thumb and pointer finger [doctor] okay okay and anything into that little into your fifth finger your little finger any numbness there at times no [patient] sometimes yeah it seems like it's numb too [doctor] okay what about your right hand any problems with that hand [patient] no i do n't seem to have any problems with my right hand so far it's just mostly my left [doctor] okay okay good and just a couple you know do you how do you have many or do you drink often do you have you know many any alcohol consumption [patient] i drink usually a a beer or two on fridays and saturdays on the weekends [doctor] okay and do you have any evidence of any anybody ever said that you had some rheumatoid arthritis in your hand or wrist anything like that [patient] no nobody say anything 
like that so i mean [doctor] okay okay good so let me go ahead and do a physical exam here real quick and you know i'm gon na quickly just listen to your heart and lungs okay that's good i'd like you to squeeze i'm gon na hold your hands here and i'd like you to squeeze both hands [patient] okay [doctor] you seem a little bit weaker on that left hand is that what you've noticed [patient] yeah i i i experienced some weakness in my left hand [doctor] okay do you you find that you're dropping things when you're picking it up is it to that level or [patient] yeah i drop things mostly because i have a hard time feeling it [doctor] okay okay good and so you you do have a a grip strength is less on the left and i just wan na touch your fingers here on the on the right side you can feel me touching all the fingers on the right [patient] yeah i can i can say you touch me but it feels a little more weird on the thumb side than my pointer finger side [doctor] okay okay and i wan na turn your wrist over here and turn your hand over and i'm gon na go ahead and tap on the right wrist on the back here does that do anything when i do that [patient] i still i feel a little jolt or a zing in my finger tips [doctor] okay and then when i do that on the left side [patient] yeah same thing [doctor] same thing okay so you do have a bilateral positive tinel's sign so so here's here's where i'm at i think your your diagnosis is beginning to have some bilateral carpal tunnel syndrome usually we see that with repetitive actions such as the landscaping the heavy labor and you you know your your clinical exam and and history sound like it's a carpal tunnel syndrome i do want to order so where are we gon na go from here i would like to order a a study it's called an emg where it it measures some of that electrical impulses down into your fingers we will follow up with that but as far as your treatment so the treatment for carpal tunnel syndrome is really some activity modification now i know 
you are a landscaper is there any way that you could be work to have some lighter work during the time [patient] i suppose i could try to pass it off to some of my other employes and delegate [doctor] okay that would be good so that's i i just want you to kinda eliminate that the active repetitive motions that you're doing all the time just for a couple weeks i'm also gon na give you a wrist splint to wear and that should help and i'd like you to take ibuprofen six hundred milligrams every six hours and then i wan na see you back here in the office in two weeks and in that two week period i think we're gon na see if there's need for any other intervention if i need to do more diagnostic testing or if there is a possibly looking at a surgical intervention to release that pressure that's on the nerves in that hand does that sound like a a good plan for you [patient] yeah it sounds like a good first start [doctor] okay okay so i i just just off off the record here what kind of what do what do you specialize in landscaping is your company do [patient] mostly like yard work and maintenance flower beds not really designing just up keep [doctor] okay yeah i'm looking for a landscape designer i need somebody to put in some elaborate walkways back through the backyard so yeah we can do stuff like that i mean if you have an idea what you want i think that's easy [patient] okay [doctor] you know if you're looking for like some [patient] backyard elasis rehab remodel that's i mean i suppose we could do we have n't done things like that in a while because we're busy enough with just the up key but it's something to explore [doctor] okay yeah i may have to keep that in mind because i do wan na do some of that so let's listen i'm gon na get my my nurse in here to discharge you do you have any other questions for me before we end this [patient] no i think it's all clear i appreciate it [doctor] okay take care and i'll look forward to see you in two weeks [patient] very good 
appreciate your time"""

# Run the whole pipeline once on the sample conversation defined above.
(
    soap_note,
    logs,
    total_prompt_tokens,
    total_completion_tokens,
    total_tokens,
) = conversation_to_ehr(
    conversation=conversation,
    client=client,
    embedding_model=embedding_model,
    cpt_vector_store=cpt_vector_store,
    icd_vector_store=icd_vector_store,
    ndc_vector_store=ndc_vector_store,
)

In [42]:
# Show the parsed SOAP note, including the 'Codes' entries added by conversation_to_ehr.
print(soap_note)

{'Subjective': {'Chief complaint': ['Numbness and tingling in left fingers', 'Pain in left wrist'], 'History of present illness': ['Symptoms have been ongoing for a few months', 'Noticed mostly at night', 'Wakes up with hands asleep, needs to shake them out', 'Weakness in left hand, difficulty feeling objects, drops things'], 'Past social history': ['Works in landscaping, involving heavy labor and repetitive movements', 'Occasional alcohol consumption (1-2 beers on weekends)']}, 'Objective': {'Vital signs': [], 'Physical exam findings': ['Weaker grip strength in left hand', "Bilateral positive Tinel's sign", 'Altered sensation in left thumb and pointer finger'], 'Laboratory data': [], 'Imaging results': [], 'Other diagnostic data': []}, 'Assessment': {'Diagnosis': ['Bilateral carpal tunnel syndrome']}, 'Plan': {'Tests': ['Order EMG to measure electrical impulses in fingers'], 'Referrals': [], 'Medications': [{'Name': 'Ibuprofen', 'Dosage': '600 mg', 'Route': 'Oral', 'Frequency': 'Every

In [43]:
# Show the trace of RAG prompts and per-call token counts for the run above.
print(logs)

Tokens used for SOAP generation: 1728, 348, 2076

 ############################ RAG for Tests ################################

Human: Find the CPT codes corresponding to the following test: Order EMG to measure electrical impulses in fingers Instruction: Only use the following returned documents to get the CPT codes. Ensure matching is case-insensitive. If there are multiple codes possible, return all of them. If no useful codes are found, just return 'N/A'. Output the codes as a comma-separated list in order of confidence without spaces. Here are the relevant documents:  ### CODE: 96002
DESCRIPTION: Dynamic surface emg ### CODE: 96003
DESCRIPTION: Dynamic fine wire emg ### CODE: 97032
DESCRIPTION: Electrical stimulation ### CODE: G0283
DESCRIPTION: Elec stim other than wound

Tokens used for Tests: 149, 2, 151

 ############################ RAG for Diagnosis ################################

Human: Find the ICD-10 code corresponding to the following diagnosis: Bilateral carpal tunnel

In [44]:
# Token totals across all chat calls of the run: prompt, completion, combined.
print(total_prompt_tokens, total_completion_tokens, total_tokens)

2191 367 2558


## Testing on the first 5 examples from ACI-Bench

In [5]:
# Load the ACI examples (the file name suggests the ASR-corrected training split -- TODO confirm).
# The batch loop below reads the 'dialogue' and 'note' columns of this frame.
df = pd.read_csv("./data/train_aci_asrcorr.csv")
df.head()

Unnamed: 0,dataset,id,dialogue,note
0,aci,ACI001,[doctor] hey george how are you today i unders...,CHIEF COMPLAINT\n\nLeft wrist and hand pain.\n...
1,aci,ACI004,[doctor] hi billy how are you what's been goin...,CHIEF COMPLAINT\n\nDifficulty urinating.\n\nME...
2,aci,ACI016,[doctor] so anna good to see you today so read...,HISTORY OF PRESENT ILLNESS\n\nAnna Diaz is a p...
3,aci,ACI020,[doctor] hey mason good to see you today so le...,CHIEF COMPLAINT\n\nKidney stones.\n\nHISTORY O...
4,aci,ACI025,[doctor] so sophia i see that you you hurt you...,CHIEF COMPLAINT\n\nRight knee pain.\n\nMEDICAL...


In [7]:
# Fresh accumulators on every run so re-executing this cell does not append duplicates.
all_logs, all_ehrs, all_notes, all_conversations = [], [], [], []

for i in range(5):
    # Row i of the dataset: the raw dialogue and its reference clinical note.
    conversation, note = df['dialogue'][i], df['note'][i]

    ehr, logs, total_prompt_tokens, total_completion_tokens, total_tokens = conversation_to_ehr(
        conversation,
        client,
        embedding_model,
        cpt_vector_store,
        icd_vector_store,
        ndc_vector_store,
    )

    all_logs.append(logs)
    all_ehrs.append(ehr)
    all_notes.append(note)
    all_conversations.append(conversation)

    # Per-example token usage: prompt, completion, combined.
    print(i, total_prompt_tokens, total_completion_tokens, total_tokens)

0 2240 371 2611
1 4600 653 5253
2 2520 422 2942
3 2353 425 2778
4 2162 433 2595


In [11]:
import pandas as pd

# One row per processed example: source dialogue, reference note, generated EHR, run log.
data = dict(
    conversation=all_conversations,
    note=all_notes,
    ehr=all_ehrs,
    logs=all_logs,
)

# NOTE(review): this rebinds `df`, which previously held the loaded CSV with the
# 'dialogue' and 'note' columns; re-running the batch loop after this cell will
# fail until the CSV is loaded again.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,conversation,note,ehr,logs
0,[doctor] hey george how are you today i unders...,CHIEF COMPLAINT\n\nLeft wrist and hand pain.\n...,{'Subjective': {'Chief complaint': ['Numbness ...,"Tokens used for SOAP generation: 1777, 352, 21..."
1,[doctor] hi billy how are you what's been goin...,CHIEF COMPLAINT\n\nDifficulty urinating.\n\nME...,{'Subjective': {'Chief complaint': ['Difficult...,"Tokens used for SOAP generation: 2053, 529, 25..."
2,[doctor] so anna good to see you today so read...,HISTORY OF PRESENT ILLNESS\n\nAnna Diaz is a p...,{'Subjective': {'Chief complaint': ['Back pain...,"Tokens used for SOAP generation: 1562, 401, 19..."
3,[doctor] hey mason good to see you today so le...,CHIEF COMPLAINT\n\nKidney stones.\n\nHISTORY O...,{'Subjective': {'Chief complaint': ['Evaluatio...,"Tokens used for SOAP generation: 1400, 407, 18..."
4,[doctor] so sophia i see that you you hurt you...,CHIEF COMPLAINT\n\nRight knee pain.\n\nMEDICAL...,{'Subjective': {'Chief complaint': ['Right kne...,"Tokens used for SOAP generation: 1334, 402, 17..."


In [12]:
# Write the current `df` to disk for review; index=False omits the row-index column.
csv_path = './data/LoganReview.csv'
df.to_csv(csv_path, index=False)

In [9]:
i = 0

# Conversation, two blank lines, then the pipeline log for the same example.
print(all_conversations[i], "", "", all_logs[i], sep="\n")

[doctor] hey george how are you today i understand you're here for some numbness and tingling in your fingers and some pain in your wrist
[patient] right my left wrist and hand has been bothering me probably for a few months now with pain and numbness
[doctor] okay and you said that's been ongoing for several months do you know what caused this type of pain or is it just something that started slowly or
[patient] it just kinda started on it's own it i notice it mostly at night
[doctor] okay
[patient] sometimes it will i'll wake up and my hands asleep and i got ta shake it out
[doctor] shake it out and okay
[patient] and then some
[doctor] what kind of work do you do
[patient] i do yard work
[doctor] yard work
[patient] landscaping landscaping
[doctor] landscaping okay so a lot of raking a lot of digging so a lot of repetitive type movements
[patient] yeah it's pretty heavy labor but it's yeah the same thing day in and day out
[doctor] okay okay just a couple questions for you you did s

In [10]:
# Reference note, a blank line, then the generated EHR for example i.
print(all_notes[i], "", all_ehrs[i], sep="\n")

CHIEF COMPLAINT

Left wrist and hand pain.

HISTORY OF PRESENT ILLNESS

George Lewis is a pleasant 57-year-old male who presents to the clinic today for evaluation of left wrist and hand pain. He reports an onset of a few months ago but denies any specific injury. However, the patient notes he often engages in repetitive motions while performing his work duties. His symptoms are worse at night, and he wakes with numbness in the bilateral hands. He experiences numbness in all fingers, but states it is the most noticeable in the left thumb and index finger. He affirms intermittent numbness in the left little finger. For relief, he shakes his hands upon waking. The patient also experiences weakness in his left hand. He reports he drops objects and explains “I have a hard time feeling it.”

MEDICAL HISTORY

The patient denies a history of rheumatoid arthritis.

SOCIAL HISTORY

He works in landscaping. He reports consuming 1 to 2 beers on weekends.

REVIEW OF SYSTEMS

Musculoskeletal: Repor