In [1]:
import re
import os
import PyPDF2
from transformers import pipeline
from langchain import PromptTemplate, LLMChain, OpenAI
from langchain_community.embeddings import OllamaEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
# Function to anonymize PII from the extracted text
def anonymize_pii(text):
    # Replace any patterns that resemble personal information
    # text = re.sub(r'\b(?:\d{1,2}/\d{1,2}/\d{2,4}|\d{4}[-/]\d{2}[-/]\d{2})\b', '[REDACTED DATE]', text)  # Date of birth
    # text = re.sub(r'\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b', '[REDACTED PHONE]', text)  # Phone numbers
    # text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[REDACTED EMAIL]', text)  # Emails
    # text = re.sub(r'\b\d{4}\b', '[REDACTED YEAR]', text)  # Years
    # text = re.sub(r'\b[A-Z][a-z]+\b', '[REDACTED NAME]', text)  # Names (very basic)
    return text

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return anonymize_pii(text)

# Extracting text from all PDFs in a directory
pdf_dir = './data/'
documents = []

In [3]:
from langchain_core.documents import Document


In [4]:
for filename in os.listdir(pdf_dir):
    if filename.endswith('.pdf'):
        text = extract_text_from_pdf(os.path.join(pdf_dir, filename))
        documents.append(Document(page_content=text))

# Step 1: Create embeddings for the documents
embedding_model = OllamaEmbeddings(model="llama3.1")
docsearch = FAISS.from_documents(documents, embedding_model)

In [5]:
from langchain_community.llms import Ollama
llm = Ollama(model="llama3.1")

In [45]:
# Step 3: Create a RetrievalQA chain using the local LLM
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
)

# # Step 4: Define a query
# query = "Show me any information about the medical data that was collected."

# # Step 5: Run the query
# result = qa_chain({ "query": query })
# print(result)

In [7]:
# Step 4: Define a query
query = "Show me any information related to back pain."

# Step 5: Run the query
result = qa_chain({ "query": query })
print(result['result'])

{'query': 'Show me any information related to back pain.', 'result': 'Here is the relevant information:\n\n**PART C: GENERAL HEALTH SCREENING**\n\n1. Have you been involved in any motor vehicle accidents resulting in personal injury? No\n2. (skipped)\n3. **Have you suffered back pain or strain injury (including back surgery)? No**\n4. (skipped)\n5. (skipped)\n\nSo, according to this form, John Doe has not reported any history of back pain or strain injury.', 'source_documents': [Document(page_content='Pre-Employment Health Assessment\nPART A: PERSONAL DETAILS\nSurname: Doe\nFirst Name: John\nDate of Birth: 01/15/1998\nGender: Male\nAddress: 1234 Health Street, Sydney, NSW, 2000\nPhone No (Mobile): +61 400 123 456\nEmail: john.doe@example.com\nPresent Employee of NNSWLHN: No\nPART B: POSITION DETAILS\nPosition Applied For: Registered Nurse\nRecruitment No: 56789\nHospital/Facility: Northern NSW LHD\nWard/Dept: Emergency Department\nPART C: GENERAL HEALTH SCREENING\n1. Have you been invo

In [24]:
import nest_asyncio

nest_asyncio.apply()

In [35]:
from nemoguardrails import RailsConfig, LLMRails
from nemoguardrails.integrations.langchain.runnable_rails import RunnableRails

config = RailsConfig.from_path("./guardrails")
app = LLMRails(config,llm=llm)


In [46]:
app.register_action(qa_chain, name="qa_chain")

In [49]:
# Step 4: Define a query
query = "Show me any information related to  GENERAL HEALTH SCREENING in Pre-Employment Health Assessment."

# Step 5: Run the query
result = qa_chain({ "query": query })
print(result['result'])

Here is the information related to GENERAL HEALTH SCREENING from Part C:

**1. Have you been involved in any motor vehicle accidents resulting in personal injury?**
No

**2. Have you ever lodged a claim for workers compensation?**
No

**3. Have you suffered back pain or strain injury (including back surgery)?**
No

**4. Have you suffered from shoulder, neck or arm pain or strain?**
No

**5. Have you suffered from hip, knee or ankle pain?**
No

**6. Have you had a full medical clearance for any injury identified in questions 1 to 5?**
N/A

**7. Are you receiving any ongoing treatment for injuries identified in questions 1 to 5?**
No

**8. Do you suffer from any medical condition (including physical, psychiatric, psychological) for which you are receiving treatment?**
No


In [50]:
# mode == "chain_with_guardrails":
history = [{"role": "user", "content": query}]
#
result = await app.generate_async(messages=history)
print(result['content'])

I'd be happy to help you with that! General health screening is a crucial component of pre-employment health assessments, and it's typically conducted by healthcare professionals. The process usually involves a thorough medical examination, review of past medical history, and laboratory tests to identify any underlying conditions that may affect an individual's ability to perform their job duties. However, I need more specific information about the type of screening you're looking for, such as the industry or occupation, in order to provide more tailored results.
