In [5]:
import os
import fitz  # PyMuPDF


def find_keywords_in_pdf(pdf_path, keywords, *, first_match_only=True):
    """
    Search a PDF for keywords and collect the paragraphs that mention them.

    The plain text of every page is split on blank lines (two consecutive
    newlines) and each resulting chunk is tested against the keywords using a
    case-insensitive substring match.

    Args:
        pdf_path: Path of the PDF file, opened with ``fitz.open``.
        keywords: Iterable of keyword strings, tried in the order given.
        first_match_only: When True (the default, and the original behaviour),
            a paragraph is stored only under the first keyword it contains.
            Because matching is by substring, a keyword that contains an
            earlier keyword (e.g. "anti-lgr5" listed after "lgr5") can never
            receive paragraphs in this mode. Pass False to store a paragraph
            under every keyword it contains.

    Returns:
        A dict mapping each keyword to the list of matching paragraphs in
        page order. Keywords without any match map to an empty list.
    """
    paragraphs_by_keyword = {keyword: [] for keyword in keywords}
    # Lowercase each distinct keyword once rather than once per paragraph.
    lowered_keywords = [
        (keyword, keyword.lower()) for keyword in paragraphs_by_keyword
    ]
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            for paragraph in text.split('\n\n'):
                lowered_paragraph = paragraph.lower()
                for keyword, lowered_keyword in lowered_keywords:
                    if lowered_keyword in lowered_paragraph:
                        paragraphs_by_keyword[keyword].append(paragraph)
                        if first_match_only:
                            break  # Keep the paragraph under the first hit only
    return paragraphs_by_keyword

def process_pdfs_in_directory(directory, keywords, output_file):
    """
    Scan every PDF in a directory for the keywords and write the matching paragraphs to a text file.

    Directory entries are visited in sorted order and only names ending in
    '.pdf' (case-insensitive) are processed. All PDFs are scanned before the
    output file is opened; it is then written as UTF-8, replacing any existing
    content. For each PDF/keyword pair with at least one match, a
    "PDF File:" / "Keyword:" header is written, followed by the paragraphs.
    """
    # Phase 1: gather the per-keyword matches of each PDF, in filename order.
    scanned = []
    for entry in sorted(os.listdir(directory)):
        if not entry.lower().endswith('.pdf'):
            continue
        matches = find_keywords_in_pdf(os.path.join(directory, entry), keywords)
        scanned.append((entry, matches))

    # Phase 2: emit one section per (PDF, keyword) pair that has matches.
    with open(output_file, 'w', encoding='utf-8') as report:
        for pdf_name, matches in scanned:
            for keyword, hits in matches.items():
                if not hits:
                    continue  # Nothing found for this keyword in this PDF
                report.write(f"PDF File: {pdf_name}\n")
                report.write(f"Keyword: {keyword}\n")
                report.writelines(f"{hit}\n\n" for hit in hits)
                report.write("\n")  # Blank line separating keyword sections

if __name__ == "__main__":
    # Script entry point: scan the PDFs in `directory` for `keywords` and
    # write the matching paragraphs to `output_file`.
    directory = "PDF"  # Change this to the path of your PDF directory
    keywords = ["lgr5", "anti-lgr5", "lgr5 primary antibody"]  # Replace with your list of keywords
    output_file = "output.txt"  # The output file path

    process_pdfs_in_directory(directory, keywords, output_file)
    # Report the configured path so the message stays correct when it changes.
    print(f"Processing complete. Check the {output_file} file.")

Processing complete. Check the output.txt file.
