In [52]:
import os
import random
import numpy as np
import re
import json
import pandas as pd
from langchain_community.document_loaders import WebBaseLoader
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import TokenTextSplitter
from langchain.embeddings import OllamaEmbeddings
from langchain.vectorstores import Chroma
from langchain_ollama import ChatOllama
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [None]:
# Embedding client backed by a local Ollama model; show_progress=True turns on
# the progress bar while texts are being embedded.
embedding_model_name = "all-minilm"
ollama_embeddings = OllamaEmbeddings(model=embedding_model_name, show_progress=True)


OllamaEmbeddings: 100%|██████████| 55/55 [01:56<00:00,  2.11s/it]


In [None]:

# Read the clinical-guidelines PDF (path is relative to the notebook's working
# directory) and parse it into LangChain documents via PyMuPDF.
pdf_path = "../data/Clinical-Guidelines-Covid.pdf"
pdf_loader = PyMuPDFLoader(file_path=pdf_path)
docs_list = pdf_loader.load()


In [None]:

# Cut the parsed documents into token windows: 200 tokens per chunk with a
# 50-token overlap between consecutive chunks.
text_splitter = TokenTextSplitter(chunk_overlap=50, chunk_size=200)
doc_splits = text_splitter.split_documents(docs_list)

# Keep the raw text of every chunk and embed all chunks in one call.
# `content[i]` and `embeddings[i]` are used together by position further down.
content = list(map(lambda split: split.page_content, doc_splits))
embeddings = ollama_embeddings.embed_documents(content)

In [None]:
# Extract JSON from response content
def extract_json_content(response):
    """Return the first JSON object embedded in a model response.

    Args:
        response: Either an object exposing a ``content`` attribute (its value
            is used) or any other object (its ``str()`` form is used).

    Returns:
        The first JSON object that decodes successfully, as a ``dict``, or
        ``None`` when the text holds no ``{`` at all or no candidate decodes.
        A diagnostic line is printed in both ``None`` cases.

    Notes:
        Decoding starts at each ``{`` in turn and stops at the end of the first
        valid object, so extra commentary, additional objects, or stray braces
        around the payload do not make the extraction fail. Because every ``{``
        is a candidate, an object nested inside an otherwise malformed outer
        object can be returned; callers should still validate the keys they need.
    """
    response_content = response.content if hasattr(response, "content") else str(response)
    if not isinstance(response_content, str):
        # Some message types carry non-string content; fall back to its text form.
        response_content = str(response_content)

    start = response_content.find("{")
    if start == -1:
        print("No JSON content found in the response.")
        return None

    decoder = json.JSONDecoder()
    while start != -1:
        try:
            # raw_decode parses one value starting at `start` and ignores the rest.
            parsed, _end = decoder.raw_decode(response_content, start)
            return parsed
        except json.JSONDecodeError:
            start = response_content.find("{", start + 1)

    print("Failed to decode JSON.")
    return None

In [None]:
# How many times a random reference chunk is picked and a Q&A pair is generated.
num_iterations = 15

# How many of the most similar contexts are retrieved for each reference chunk.
top_k = 3

In [None]:
# Chat model that writes the synthetic Q&A pairs. The sampling parameter is
# spelled `temperature`; the previous `temp=0` keyword is not a ChatOllama field,
# so the intended deterministic decoding (temperature 0 + fixed seed) was not applied.
query_generator = ChatOllama(model="llama3.2:3b", seed=42, temperature=0)

In [None]:
# Generate synthetic question-answer pairs.
# For every selected reference chunk, the prompt context is the chunk itself
# plus its `top_k` most cosine-similar *other* chunks.

all_synthetic_qa_pairs = []

# Fixed-seed generator so the same reference chunks are selected on every run.
rng = random.Random(42)

# Sample reference chunks without replacement: picking the same chunk twice only
# produces near-duplicate questions. The count is capped at the number of chunks.
n_chunks = len(embeddings)
reference_indices = rng.sample(range(n_chunks), k=max(0, min(num_iterations, n_chunks)))

# Loop-invariant work: stack the embeddings once and precompute every norm.
embedding_matrix = np.atleast_2d(np.asarray(embeddings, dtype=float))
embedding_norms = np.linalg.norm(embedding_matrix, axis=1)

for reference_index in reference_indices:
    # Cosine similarity between the reference chunk and every chunk.
    # Zero-norm vectors keep -inf instead of triggering a division by zero.
    norm_product = embedding_norms * embedding_norms[reference_index]
    dot_products = embedding_matrix @ embedding_matrix[reference_index]
    similarities = np.full(n_chunks, -np.inf)
    np.divide(dot_products, norm_product, out=similarities, where=norm_product > 0)

    # The reference chunk is trivially its own best match (similarity 1.0);
    # exclude it so it is not listed twice and does not displace a real neighbour.
    similarities[reference_index] = -np.inf

    # Stable descending sort: ties resolve to the earliest chunk.
    ranked = np.argsort(-similarities, kind="stable")
    neighbour_indices = [int(i) for i in ranked[:top_k] if np.isfinite(similarities[i])]
    contexts = [content[reference_index]] + [content[i] for i in neighbour_indices]

    # Ask the chat model for one Q&A pair grounded in the contexts. The format
    # example is valid JSON (quoted values, comma between fields) so that the
    # model is not taught a shape that json.loads would reject.
    prompt = f"""
    Based on the provided context (a list of text segments) generate a relevant 
    question that can be addressed by the context and a concise answer derived 
    solely from the provided context.
    Please respond strictly in the below JSON format without any additional commentary 
    or formatting around the JSON object and without any newline characters within the
    generated JSON as shown below:
    {{"question": "<a relevant question that can be addressed by the context>",
      "answer": "<a concise answer derived solely from the provided context>"}}

    contexts:
    {contexts}
    """

    response = query_generator.invoke(prompt)

    # Keep only well-formed pairs: later cells index both keys directly.
    parsed_json = extract_json_content(response)
    if isinstance(parsed_json, dict) and "question" in parsed_json and "answer" in parsed_json:
        all_synthetic_qa_pairs.append(parsed_json)
    elif parsed_json:
        print("Skipping response without 'question'/'answer' keys.")



In [None]:
# Keep only the two fields of interest from each parsed response. Entries that
# are not dicts or lack either key are skipped instead of raising KeyError.
processed_data = [
    {"question": qa["question"], "answer": qa["answer"]}
    for qa in all_synthetic_qa_pairs
    if isinstance(qa, dict) and "question" in qa and "answer" in qa
]

# Naming the columns explicitly keeps the frame's schema stable even when no
# pair survived, so later cells that select 'question' / 'answer' still work.
qa_df = pd.DataFrame(processed_data, columns=["question", "answer"])


In [None]:
# Show the generated question/answer DataFrame as this cell's output
# (a bare last expression is rendered by the notebook).
qa_df

Unnamed: 0,question,answer
0,Who compiled the COVID-19 guidelines?,"Sunil Dodani, Dr Amjad Mahboob, Dr Naseem Akht..."
1,What is the recommended duration for hospital ...,at least 48 hours after treatment initiation
2,What is the recommended treatment approach for...,"Supportive care only, including acetaminophen ..."
3,What is the mainstay of management for severe ...,Oxygen therapy via nasal cannula or face mask....
4,What is the mainstay of management for severe ...,Oxygen therapy via nasal cannula or face mask
5,What is the CALL score and how is it interpret...,"The CALL score (co-morbid, Age, Lymphocytes, L..."
6,What is the recommended approach for managing ...,Home oxygen therapy may be considered in those...
7,What is the recommended duration for testing a...,2 to 5 days after symptoms onset
8,What criteria must a patient fulfill to be rel...,"If possible, the patient should have no fever ..."
9,What is the recommended approach for managing ...,The management of reinfection is the same as i...


In [59]:
# Print every pair in full; the DataFrame display above truncates long text.
for question, answer in zip(qa_df['question'], qa_df['answer']):
    print('Question: ', question, '\n')
    print('Answer: ', answer, '\n', '****')

Question:  Who compiled the COVID-19 guidelines? 

Answer:  Sunil Dodani, Dr Amjad Mahboob, Dr Naseem Akhtar, Dr Asma Adil, Dr Javed Bhutta, Dr Urooj Aqeel and HSA/ HPSIU/ NIH team. 
 ****
Question:  What is the recommended duration for hospital monitoring of patients who received biologics during COVID-19 hospitalization? 

Answer:  at least 48 hours after treatment initiation 
 ****
Question:  What is the recommended treatment approach for patients with non-severe COVID-19 disease? 

Answer:  Supportive care only, including acetaminophen or NSAIDs for fever, oral hydration in case of diarrhea, and antihistamines for rhinorrhea 
 ****
Question:  What is the mainstay of management for severe and critical COVID-19? 

Answer:  Oxygen therapy via nasal cannula or face mask. If available, high flow oxygen can also be used to maintain saturations above 94%. 
 ****
Question:  What is the mainstay of management for severe and critical COVID-19? 

Answer:  Oxygen therapy via nasal cannula or f

In [None]:
# Remove rows whose question repeats an earlier one, keeping the first occurrence.
# drop=True discards the old row labels; without it reset_index() turns them into
# a spurious 'index' column that ends up in the saved data, and re-running the
# cell keeps adding leftover index columns.
qa_df = qa_df.drop_duplicates(subset=['question'], keep='first').reset_index(drop=True)

In [None]:
# Persist the Q&A pairs as CSV in the current working directory.
# pandas' default index=True applies, so the row labels are written as the first column.
qa_df.to_csv(path_or_buf='clinical-qa-pairs.csv')