# In [None]:
import csv
import os
import re

import pdfplumber
import torch
import transformers

# Initialize the LLaMA model
# Hugging Face model identifier handed to the pipeline factory below.
model_id = "meta-llama/Llama-3.3-70B-Instruct"

# Build the module-level text-generation pipeline that the rest of the script
# calls with a prompt string. NOTE: this assignment makes the bare name
# `pipeline` refer to the constructed pipeline object, not to the
# `transformers.pipeline` factory function.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    # Request bfloat16 weights when the model is instantiated.
    model_kwargs={"torch_dtype": torch.bfloat16},
    # Leave device placement to the library rather than pinning one device.
    device_map="auto",
)

# Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in the PDF at *pdf_path*.

    Each page's text is followed by a single newline, so an N-page document
    always yields N newline terminators even when some pages have no text.

    Args:
        pdf_path: Path to the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The extracted text as one string. A PDF whose pages contain no
        extractable text yields only newlines (empty after ``strip()``).
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can return None for a page without a text layer
        # (e.g. a scanned image); treat that as empty text rather than
        # crashing on `None + "\n"`.
        return "".join(
            (page.extract_text() or "") + "\n" for page in pdf.pages
        )

# Optimized function to create a prompt for Q&A generation
def generate_prompt(text):
    """Build the Q&A-generation prompt for a medical document.

    The prompt embeds *text* and asks for question/answer pairs. It also spells
    out the line format the downstream parser relies on: each question on a
    line starting with "Question:" and each answer on a line starting with
    "Answer:". The format instruction is written inline, so no line of the
    instruction itself begins with one of those labels and it cannot be
    mistaken for a Q&A pair if the prompt is echoed back in the output.

    Args:
        text: The document text to embed in the prompt.

    Returns:
        The complete prompt string.
    """
    return (
        f"The following is a detailed medical document:\n\n{text}\n\n"
        "Generate as many insightful questions as possible based on the above text. "
        "For each question, provide a detailed and comprehensive answer that is between 200 and 250 words. "
        "Ensure the answers are informative and specific to the medical context provided. "
        'Write each question on its own line starting with "Question:", '
        'followed on the next line by "Answer:" and the full answer as a single paragraph.'
    )

# Function to parse Q&A pairs from model output
# Matches a "Question"/"Answer" label at the start of a line, tolerating
# leading whitespace, markdown markers (*, _, #, -), an optional number and
# emphasis around the colon, e.g. "Question:", "  answer:", "**Question 1:**".
# Group 1 is the label, group 2 the text following it on the same line.
_QA_LABEL_RE = re.compile(
    r"[\s*_#-]*(question|answer)(?:\s*\d+)?\s*[*_]*\s*:[*_]*\s*(.*)",
    re.IGNORECASE,
)


def _store_pair(qas, question_parts, answer_parts):
    """Append the (question, answer) pair to *qas* if both parts are non-empty."""
    question = "\n".join(question_parts).strip()
    answer = "\n".join(answer_parts).strip()
    if question and answer:
        qas.append((question, answer))


def parse_qa_pairs(output_text):
    """Extract (question, answer) tuples from labelled model output.

    A line whose label is "Question" starts a new pair; a line whose label is
    "Answer" starts (or replaces) the answer of the current pair. Unlabelled
    lines are treated as continuations of whichever part was started last, so
    multi-line answers, and answers that begin on the line after the label,
    are kept in full. Text before the first label is ignored. Pairs missing
    either a question or an answer are dropped.

    Args:
        output_text: Raw text produced by the model.

    Returns:
        A list of ``(question, answer)`` string tuples in order of appearance.
        Continuation lines are joined with newlines; surrounding whitespace is
        stripped.
    """
    qas = []
    question_parts = []
    answer_parts = []
    active = None  # the list currently receiving continuation lines

    for line in output_text.split("\n"):
        match = _QA_LABEL_RE.match(line)
        if match is None:
            if active is not None:
                active.append(line)
            continue

        label = match.group(1).lower()
        remainder = match.group(2)
        if label == "question":
            # A new question closes the previous pair (if it was complete).
            _store_pair(qas, question_parts, answer_parts)
            question_parts = [remainder]
            answer_parts = []
            active = question_parts
        else:
            # A repeated "Answer:" label replaces the earlier answer.
            answer_parts = [remainder]
            active = answer_parts

    # Add the last Q&A pair if it exists
    _store_pair(qas, question_parts, answer_parts)
    return qas

# Directory containing PDFs
pdf_directory = "/home/hamza/Desktop/qa_system/pdf"
output_csv_path = "path/to/output/medical_qna.csv"

# Create the output directory up front; otherwise open() below raises
# FileNotFoundError when it does not exist yet.
output_directory = os.path.dirname(output_csv_path)
if output_directory:
    os.makedirs(output_directory, exist_ok=True)

# Open CSV file for writing
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as csv_file:
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(["File Name", "Question", "Answer"])  # Write header

    # Process each PDF in the directory. Sorted because os.listdir() returns
    # entries in arbitrary order, which would make the CSV row order vary
    # between runs.
    for filename in sorted(os.listdir(pdf_directory)):
        # Case-insensitive so files named e.g. "REPORT.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(pdf_directory, filename)
        print(f"Processing: {filename}")

        # Extract text from the PDF. One unreadable file should not abort the
        # whole batch, so report it and move on to the next one.
        try:
            extracted_text = extract_text_from_pdf(pdf_path)
        except Exception as exc:
            print(f"Failed to extract text from {filename}: {exc}")
            continue

        # Skip empty PDFs
        if not extracted_text.strip():
            print(f"Skipped empty PDF: {filename}")
            continue

        # Generate the prompt
        prompt = generate_prompt(extracted_text)

        # Generate Q&A using the pipeline. return_full_text=False keeps the
        # prompt (and therefore the source document) out of the text that is
        # parsed, so document lines that happen to start with
        # "Question:"/"Answer:" are not mistaken for generated pairs.
        outputs = pipeline(prompt, max_new_tokens=1024, return_full_text=False)

        # Parse the Q&A pairs
        qna_text = outputs[0]["generated_text"]
        qna_pairs = parse_qa_pairs(qna_text)

        # Write each Q&A pair to the CSV
        csv_writer.writerows(
            [filename, question, answer] for question, answer in qna_pairs
        )
        # Flush per file so finished results survive a later crash.
        csv_file.flush()

        print(f"Processed and saved Q&A pairs for: {filename}")

print(f"Q&A saved to CSV file: {output_csv_path}")