In [10]:
import io
from glob import glob
import re

import ipywidgets as widgets
from IPython.display import display, HTML, Javascript, clear_output
from IPython.display import IFrame

from transformers import LlamaForCausalLM, LlamaTokenizer, GenerationConfig
from langchain.docstore.document import Document
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import NLTKTextSplitter
from peft import PeftModel

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 110
CUDA SETUP: Loading binary /opt/conda/envs/py310/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda110.so...


  warn(msg)


In [11]:
# Sentence-embedding model; later passed to the vector store to embed text chunks.
print("Loading transformer embeddings...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

print("Loading Vicuna tokenizer")
tokenizer = LlamaTokenizer.from_pretrained("eachadea/vicuna-13b-1.1")

# Load the base Vicuna checkpoint in 8-bit with automatic device placement,
# then wrap it with the "kmnis/medVicuna" PEFT checkpoint.
print("Loading Vicuna model")
model = PeftModel.from_pretrained(
    LlamaForCausalLM.from_pretrained(
        "eachadea/vicuna-13b-1.1",
        load_in_8bit=True,
        device_map="auto",
    ),
    "kmnis/medVicuna",
)

Loading transformer embeddings...




Loading Vicuna tokenizer
Loading Vicuna model


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [9]:
# Preview the sample report inline; the IFrame must stay the cell's last expression to render.
IFrame("Sample-Medical-Report.pdf", width=700, height=600)

In [38]:
# Extract plain text from the first 9 pages of the sample report with pdfminer.
rsrcmgr = PDFResourceManager()
retstr = io.StringIO()  # in-memory sink that the text converter writes into
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec='utf-8', laparams=laparams)
interpreter = PDFPageInterpreter(rsrcmgr, device)

# The context manager guarantees the PDF handle is closed even if parsing fails.
with open("Sample-Medical-Report.pdf", 'rb') as fp:
    try:
        for page_num, page in enumerate(PDFPage.get_pages(fp)):
            if page_num >= 9:
                # Stop early instead of iterating over pages we never process.
                break
            interpreter.process_page(page)
        # Read the accumulated text once, after the last processed page.
        # This also leaves `data` defined (as "") when the PDF has no pages.
        data = retstr.getvalue()
    finally:
        device.close()

# Split on markers of the form "- <digit> -" (presumably page-number footers —
# TODO confirm against the PDF). The slice drops the text before the first
# marker and keeps at most 8 segments.
pages = re.split("- [0-9] -", data)[1:9]

In [119]:
# Tracks, per report section, whether it has already been extracted.
# Every section starts out as not found (False).
sections_found = dict.fromkeys(
    [
        'Patient Demographic or Personal Information',
        'Chief Complaint',
        'History of Present Illness',
        'Medical or Clinical History',
        'Review of Systems (ROS)',
        'Physical Examination or Mental State Examination',
        'Diagnostic Tests',
        'Assessment and Plan',
        'Discharge Summary',
        'Medication Reconciliation',
    ],
    False,
)

def _build_section_prompt(section, context):
    """Return the extraction prompt asking the model about `section` given `context`."""
    return f"""### Below is a page from the medical report of a patient. Answer the Question truthfully and only from the page content.
### Page:
{context}

### Question:
Extract any information about {section} from this page. If it's not found, say Not Found.

### Answer:
"""


def transcribe_page(page, sections_found):
    """Extract every not-yet-found section from `page` and display the model's answers.

    The page text is split into chunks and indexed once in an in-memory Qdrant
    collection. For each pending section, the single most similar chunk is
    retrieved, placed into a prompt, and passed to the global `model`; the
    decoded answer is displayed under a bold section heading.

    Args:
        page: Text to search (a string of report content).
        sections_found: Dict mapping section name -> bool. Sections whose value
            is truthy are skipped. Mutated in place: a section is set to True
            only when the model's answer does not contain "not found", so that
            unresolved sections can still be looked up in later calls.

    Returns:
        None. Results are shown via `display` / `print`.

    Relies on the notebook globals `embeddings`, `tokenizer` and `model`.
    """
    pending = [name for name, found in sections_found.items() if not found]
    if not pending:
        # Nothing left to extract; skip the (expensive) embedding step entirely.
        return

    text_splitter = NLTKTextSplitter(chunk_size=1000)
    docs = text_splitter.split_documents([Document(page_content=page)])
    if not docs:
        # Empty page: there is nothing to index or query.
        return

    # Build the vector index once per page; the chunks do not depend on the
    # section being queried, so re-embedding them per section was wasted work.
    qdrant = Qdrant.from_documents(
        docs, embeddings,
        location=":memory:",  # Local mode with in-memory storage only
        collection_name="page",
    )

    # NOTE(review): do_sample is not set here, so temperature/top_p may have no
    # effect on decoding — confirm against the transformers version in use.
    generation_config = GenerationConfig(temperature=0.6, top_p=0.95, repetition_penalty=1.15)

    for s in pending:
        question = f"{s} of patient"
        search_results = qdrant.similarity_search_with_score(question, k=1)
        relevant_prompts = " ".join(r[0].page_content for r in search_results)
        # Collapse all runs of whitespace (including newlines) to single spaces.
        relevant_prompts = " ".join(relevant_prompts.split()).strip()

        if not relevant_prompts:
            continue

        prompt = _build_section_prompt(s, relevant_prompts)
        inputs = tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].cuda()

        generation_output = model.generate(input_ids=input_ids, generation_config=generation_config,
                                           return_dict_in_generate=True, output_scores=False, max_new_tokens=100)

        for sequence in generation_output.sequences:
            decoded = tokenizer.decode(sequence)
            # The decoded text echoes the prompt; keep only what follows the
            # answer marker, up to the end-of-sequence token.
            out = decoded.split("### Answer:")[1].split("</s>")[0].strip()
            display(HTML(f"<b>{s}:</b>"))
            print(out + "\n")
            # Only mark the section as found when the model actually produced
            # an answer; a "Not Found" reply leaves it pending for later pages.
            if "not found" not in out.lower():
                sections_found[s] = True

In [120]:
# Run the extraction over the full extracted text in `data` (not the per-page `pages` list).
transcribe_page(page=data, sections_found=sections_found)

Patient's full name: Mr Tan Ah Kow
Patient's age: 55 years old



Not Found



Not Found



The patient, Mr Tan Ah Kow, has a history of hypertension and hyperlipidemia since 1990 and suffered several strokes in 2005. He subsequently developed heart problems (cardiomyopathy), cardiac failure and chronic renal disease and was treated in ABC Hospital.



Not Found



Not Found



Not Found



Not Found



Not Found



Not Found

