In [11]:
import re
import os
import PyPDF2
from transformers import pipeline
from langchain import PromptTemplate, LLMChain, OpenAI
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

In [22]:
# Function to anonymize PII from the extracted text
def anonymize_pii(text):
    # Replace any patterns that resemble personal information
    # text = re.sub(r'\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{4}[-/]\d{2}[-/]\d{2})\b', '[REDACTED DATE]', text)  # Date of birth
    # text = re.sub(r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', '[REDACTED PHONE]', text)  # Phone numbers
    # text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[REDACTED EMAIL]', text)  # Emails
    # text = re.sub(r'\b\d{4}\b', '[REDACTED YEAR]', text)  # Years
    # text = re.sub(r'\b[A-Z][a-z]+\b', '[REDACTED NAME]', text)  # Names (very basic)
    return text

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return anonymize_pii(text)

# Extracting text from all PDFs in a directory
pdf_dir = './data/'
documents = []

In [8]:
from langchain_core.documents import Document


In [23]:
for filename in os.listdir(pdf_dir):
    if filename.endswith('.pdf'):
        text = extract_text_from_pdf(os.path.join(pdf_dir, filename))
        documents.append(Document(page_content=text))

# Step 1: Create embeddings for the documents
embedding_model = OllamaEmbeddings(model="llama3.1")
docsearch = FAISS.from_documents(documents, embedding_model)

In [18]:
from langchain_community.llms import Ollama
llm = Ollama(model="llama3.1")

In [24]:
# Step 3: Create a RetrievalQA chain using the local LLM
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    return_source_documents=True
)

# Step 4: Define a query
query = "Show me any information about the medical data that was collected."

# Step 5: Run the query
result = qa_chain({ "query": query })
print(result)

{'query': 'Show me any information about the medical data that was collected.', 'result': 'Based on the provided context, here is the medical data that was collected:\n\n**No pre-existing medical conditions were reported**\n\nHowever, it\'s worth noting that the following questions had a "N/A" (Not Applicable) response:\n\n1. Have you had a full medical clearance for any injury identified in questions 1 to 5? N/A\n\nThis suggests that John Doe has not been involved in any motor vehicle accidents or other incidents that would require a full medical clearance.\n\n**No health conditions were reported**\n\nJohn Doe answered "no" to all the health condition questions, such as:\n\n* Lower back, neck or thoracic spinal pain\n* Sciatica\n* Wrist or elbow pain or weakness\n* And many others (listed in Part C: Health Conditions)\n\nThis implies that he does not have any pre-existing medical conditions that could impact his ability to work as a Registered Nurse.', 'source_documents': [Document(pa

In [25]:
# Step 4: Define a query
query = "Show me any information related to back pain."

# Step 5: Run the query
result = qa_chain({ "query": query })
print(result)

{'query': 'Show me any information related to back pain.', 'result': 'Here is the relevant information:\n\nPART C: GENERAL HEALTH SCREENING\n\n3. Have you suffered back pain or strain injury (including back surgery)? No\n\nHealth Conditions\n- Lower back, neck or thoracic spinal pain? No', 'source_documents': [Document(page_content='Pre-Employment Health Assessment\nPART A: PERSONAL DETAILS\nSurname: Doe\nFirst Name: John\nDate of Birth: 01/15/1998\nGender: Male\nAddress: 1234 Health Street, Sydney, NSW, 2000\nPhone No (Mobile): +61 400 123 456\nEmail: john.doe@example.com\nPresent Employee of NNSWLHN: No\nPART B: POSITION DETAILS\nPosition Applied For: Registered Nurse\nRecruitment No: 56789\nHospital/Facility: Northern NSW LHD\nWard/Dept: Emergency Department\nPART C: GENERAL HEALTH SCREENING\n1. Have you been involved in any motor vehicle accidents resulting in personal injury? No\n2. Have you ever lodged a claim for workers compensation? No\n3. Have you suffered back pain or strain