In [1]:
import os
from typing import List, Sequence
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed

from pdf2image import convert_from_path
import pytesseract

from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
from langchain_core.output_parsers.string import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.messages import BaseMessage, HumanMessage
from langgraph.graph import END, MessageGraph

In [2]:
load_dotenv()

True

In [3]:
POPPLER_PATH = "C:/poppler-24.08.0/Library/bin"
NUM_THREADS = 10
DPI = 150
BATCH_SIZE = 10

In [4]:
pdf_path = "Pdf_Docs/test.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

Ignoring wrong pointing object 11 0 (offset 0)


In [5]:
docs[0].page_content

'St. Francis Medical Center\n6001 E Woodmen Rd\nColorado Springs CO 80923-2601\nGallardo, Jesus Michael Jr. "Jes"\nMRN: CEUL0641381, DOB: 5/1/1942, Sex: M\nAdm: 10/30/2023, D/C: 11/3/2023\n10/30/2023 - Admission (Discharged) in St. Francis Medical Center 6N Ortho/Surg\nFACESHEET\nPatient Demographics\nHospital Account\nName Acct ID Class Status Primary Coverage\nGallardo, Jesus Michael Jr. 2301583980 Inpatient Closed MEDICARE -\nMEDICARE PART A\nAND B\nGuarantor Account (for Hospital Account #2301583980)\nName\nRelation to\nPt Service Area Active? Acct Type\nGallardo, Jesus Michael Jr. Self CEN Yes Personal/Family\nAddress Phone\n115 W 5TH ST\nFLORENCE, CO 81226\n719-784-4443(H)\nCoverage Information (for Hospital Account #2301583980)\nPrinted by 23573 at 4/2/24  9:28 AM Page  1\nSSN Gender Identity Birth Date\nReg Status PCP Date Last Verified Next Review Date\nVerified Joseph Thomas\nMcGarry, MD719-784-\n4816\n03/22/24 05/21/24\nGallardo, Jesus Michael\nJr. "Jes"\nCEUL0641381\nReligi

In [6]:
pages_image_loader = convert_from_path(pdf_path, dpi=DPI, poppler_path=POPPLER_PATH)

In [7]:
len(pages_image_loader)

34

In [8]:
def process_page(page, image):
    if not page.page_content:
        text = pytesseract.image_to_string(image)
        page.page_content = text
    return page

In [9]:
trial_page = docs[0]
trial_image = pages_image_loader[0]

In [None]:
trial_image

In [11]:
text = pytesseract.image_to_string(trial_image)

In [12]:
text

'St. Francis Medical Center\n6001 E Woodmen Rd\nColorado Springs CO 80923-2601\n\nGallardo, Jesus Michael Jr. "Jes"\nMRN: CEUL0641381, DOB: 5/1/1942, Sex: M\n\nAdm: 10/30/2023, D/C: 11/3/2023\n\n10/30/2023 - Admission (Discharged) in St. Francis Medical Center 6N Ortho/Surg\n\nFACESHEET\nPatient Demographics\nName Patient ID SSN Gender Identity Birth Date\nGallardo, Jesus Michael CEUL0641381 XXx-Xx-6010 Male 05/01/42 (81 yrs)\nJr. "Jes"\nAddress Phone Email\n115 W 5TH ST 719-784-4443 (H) lynn45@earthlink.net\n\nFLORENCE CO 81226 719-784-4443 (M)\n\nReg Status PCP. Date Last Verified Next Review Date\nVerified Joseph Thomas 03/22/24 05/21/24\nMcGarry, MD719-784-\n4816\nReligion Language\nCatholic English\n\nEmergency Contact 1\n\nEmergency Contact 2\n\nTonilynn Gallardo (Spouse)\n115 W STH ST\nFLORENCE CO 81226\n\nUS\n\n719-784-4443 (H)\n719-650-5142 (M)\n\nHospital Account\n\nRichard Gallardo (Son)\n115 W 5TH ST\nFLORENCE CO 81226\nUS\n\n719-650-4185 (H)\n\nName Acct ID Class Status Pr

In [13]:
with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    futures = [executor.submit(process_page, page, pages_image_loader[idx]) for idx, page in enumerate(docs)]
    
    process_pages = [future.result() for future in as_completed(futures) if future.result() is not None]

In [None]:
process_pages

In [None]:
with ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    futures = []
    for idx, page in enumerate(docs):
        result = executor.submit(process_page, page, pages_image_loader[idx])
        futures.append(result)
        
    process_pages = []
    for future in as_completed(futures):
        if future.result() is not None:
            process_pages.append(future.result())

In [16]:
def format_text(processed_pages):
    formatted_text = "\n\n".join(page.page_content for page in processed_pages)
    return formatted_text

In [18]:
formatted_text = format_text(process_pages)

In [19]:
formatted_text

'St. Francis Medical Center\n6001 E Woodmen Rd\nColorado Springs CO 80923-2601\nGallardo, Jesus Michael Jr. "Jes"\nMRN: CEUL0641381, DOB: 5/1/1942, Sex: M\nAdm: 10/30/2023, D/C: 11/3/2023\n10/30/2023 - Admission (Discharged) in St. Francis Medical Center 6N Ortho/Surg (continued)\nClinical Notes (group 1 of 2) (continued)\nCan patient return to previous\nliving arrangement? No\nPCP Comment Joseph Thomas McGarry, MD\nDC Planning Contacts\nDC Plan discussed and\naccepted by designated\ncaregiver?\nSpoke with patient and spouse at bedside to confirm\nd/c plan\nPost Acute\nProposed Post Acute Disposition Acute Rehab Center\nName of accepting person at\nfacility/agency/liaison Kerri T\nCare Facility Name and Address Capron 6001 E Woodmen Rd COS CO\nBarriers to Discharge (surgery scheduled 11/2 at 1600 laminectomy)\nPatient/rep confirmed and\nreviewed goals, preferences,\nand DC plan?\nYes\nProvider confirmed and reviewed\ngoals, preferences, and DC\nplan?\nYes\nThe CM/SW considered patient\

In [20]:
llm = ChatOpenAI(model="gpt-4o", temperature=0.1)

In [21]:
llm.invoke("hi")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 9, 'prompt_tokens': 8, 'total_tokens': 17, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'system_fingerprint': 'fp_46bff0e0c8', 'id': 'chatcmpl-C5ZUJVZoD0o5cOjBgVFt72XuCdLcy', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--00e57bb4-983e-4eb3-8703-32a03c153683-0', usage_metadata={'input_tokens': 8, 'output_tokens': 9, 'total_tokens': 17, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [22]:
generation_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """You are an expert medical summary writer. Generate the best summary possible
            for user's request. If the user provides critique, respond with a revised version of your previous attempts.
            """
        ),
        MessagesPlaceholder(variable_name="messages")
    ]
)

generation_chain = generation_prompt | llm | StrOutputParser()

In [23]:
reflection_prompt = ChatPromptTemplate.from_messages(
    [
        ("system",
        """You are an expert medical summary writer critique. Generate critique and recommendations for the user's
        document summary. Always provide detailed recommendations"""),
        MessagesPlaceholder(variable_name="messages")
    ]
)

reflection_chain = reflection_prompt | llm | StrOutputParser()

In [24]:
def generation_node(state:Sequence[BaseMessage]):
    result = generation_chain.invoke({"messages": state})
    return result

In [25]:
def reflection_node(messages: Sequence[BaseMessage]):
    result = reflection_chain.invoke({"messages": messages})
    return [HumanMessage(content=result)]

In [27]:
def should_continue(state: List[BaseMessage]):
    if len(state) == 5:
        return END
    return "reflection"

In [34]:
builder = MessageGraph()
builder.add_node("generate", generation_node)
builder.add_node("reflection", reflection_node)

builder.set_entry_point("generate")
builder.add_conditional_edges("generate", should_continue)
builder.add_edge("reflection", "generate")

graph = builder.compile()

In [37]:
import math

def generation_node(state:Sequence[BaseMessage]):
    result = generation_chain.invoke({"messages": state})
    return result

def reflection_node(messages: Sequence[BaseMessage]):
    result = reflection_chain.invoke({"messages": messages})
    return [HumanMessage(content=result)]

def should_continue(state: List[BaseMessage]):
    if len(state) == 4:
        return END
    return "reflection"

builder = MessageGraph()
builder.add_node("generate", generation_node)
builder.add_node("reflection", reflection_node)

builder.set_entry_point("generate")
builder.add_conditional_edges("generate", should_continue)
builder.add_edge("reflection", "generate")

graph = builder.compile()
response_list = []

number_of_batches = math.ceil(len(docs) / BATCH_SIZE)

for batch in range(number_of_batches):
    start_index = batch * BATCH_SIZE
    end_index = min(start_index + BATCH_SIZE, len(docs))
    formatted_text = format_text(process_pages[start_index:end_index])
    
    if formatted_text:
        inputs = HumanMessage(content=formatted_text)
        response = graph.invoke(inputs)
        response_list.append(response)
        

In [40]:
response_list[0]

[HumanMessage(content='St. Francis Medical Center\n6001 E Woodmen Rd\nColorado Springs CO 80923-2601\nGallardo, Jesus Michael Jr. "Jes"\nMRN: CEUL0641381, DOB: 5/1/1942, Sex: M\nAdm: 10/30/2023, D/C: 11/3/2023\n10/30/2023 - Admission (Discharged) in St. Francis Medical Center 6N Ortho/Surg (continued)\nClinical Notes (group 1 of 2) (continued)\nCan patient return to previous\nliving arrangement? No\nPCP Comment Joseph Thomas McGarry, MD\nDC Planning Contacts\nDC Plan discussed and\naccepted by designated\ncaregiver?\nSpoke with patient and spouse at bedside to confirm\nd/c plan\nPost Acute\nProposed Post Acute Disposition Acute Rehab Center\nName of accepting person at\nfacility/agency/liaison Kerri T\nCare Facility Name and Address Capron 6001 E Woodmen Rd COS CO\nBarriers to Discharge (surgery scheduled 11/2 at 1600 laminectomy)\nPatient/rep confirmed and\nreviewed goals, preferences,\nand DC plan?\nYes\nProvider confirmed and reviewed\ngoals, preferences, and DC\nplan?\nYes\nThe CM/

In [41]:
combine_summary_content = ""

for response in response_list:
    final_summary = response[-1].content
    combine_summary_content += final_summary
    combine_summary_content += "-" * 50

In [52]:
combine_summary_content

'**Revised Medical Summary for Jesus Michael Gallardo Jr.**\n\n**Patient Information:**\n- Name: Jesus Michael Gallardo Jr. ("Jes")\n- MRN: CEUL0641381\n- DOB: 5/1/1942\n- Age: 81\n- Sex: Male\n- Admission Date: 10/30/2023\n- Discharge Date: 11/3/2023\n- Primary Care Physician: Joseph Thomas McGarry, MD\n\n**Admission Details:**\n- Admitted to St. Francis Medical Center, 6N Ortho/Surg Unit.\n- Reason for Admission: Chronic pain syndrome and neuropathy, failed nonsurgical treatments, and elected for permanent spinal cord stimulator placement after a successful trial.\n\n**Hospital Course:**\n- **Initial Surgery (10/30/2023):** A spinal cord stimulator was placed successfully. Post-surgery, the patient developed bilateral leg numbness and paralysis due to a dorsal epidural hematoma from T5-T9, likely exacerbated by elevated systolic blood pressure.\n- **Emergency Intervention:** The patient underwent an emergency evacuation of the hematoma and removal of the stimulator. Post-procedure, s

In [None]:
template = """You are an expert summary writer. You will be given the text. Summarize the text.
text: {text}
Summary:
"""

combined_prompt = ChatPromptTemplate.from_template(template)

combine_summary_chain = combined_prompt | llm | StrOutputParser()

combined_response = combine_summary_chain.invoke({"text": combine_summary_content}) 