In [None]:
!pip install pypdf2
!pip install PyMuPDF

### [1. OCR using PyPDF2]

**1. Input PDF_FILE_PATH and keyword**

In [None]:
# test code
from PyPDF2 import PdfReader

PDF_FILE_PATH = r"C:\Users\user\Desktop\sample\sample.pdf" # sample file path
reader = PdfReader(PDF_FILE_PATH)
pages = reader.pages
text = ""
keyword = "결제 서류"  # keyword

for page in pages:
    sub = page.extract_text()
    if sub:  
        text += sub

# check keyword existence
if keyword in text:
    print(f'"{keyword}"is existing in the PDF file.')
else:
    print(f'"{keyword}"is not existing in the PDF file.')

**2. Run next cell if the keyword test was correct**

In [None]:
import tkinter as tk
from tkinter import filedialog, simpledialog
from PyPDF2 import PdfReader, PdfWriter
import glob
import os

def find_keyword_end_page(pdf_path, keyword):
    pdf_reader = PdfReader(pdf_path)
    end_page = None
    for i, page in enumerate(pdf_reader.pages):
        text = page.extract_text()
        if text and keyword in text:
            end_page = i
            break
    return end_page

def extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix):
    pdf_reader = PdfReader(pdf_path)
    pdf_writer = PdfWriter()
    fname = os.path.splitext(os.path.basename(pdf_path))[0]
    output_filename = os.path.join(output_folder, f'{fname}_{output_prefix}.pdf')
    for page in range(start_page - 1, min(end_page, len(pdf_reader.pages))):
        pdf_writer.add_page(pdf_reader.pages[page])
    with open(output_filename, 'wb') as out_file:
        pdf_writer.write(out_file)
    print(f'Created: {output_filename}')

def process_pdf(pdf_path, output_folder, output_prefix, keyword):
    end_page = find_keyword_end_page(pdf_path, keyword)
    if end_page:
        extract_pages(pdf_path, 1, end_page, output_folder, output_prefix)
        print(f'Keyword "{keyword}" found in {pdf_path}')
    else:
        print(f'Keyword "{keyword}" not found in {pdf_path}')

def extract_single_pdf():
    pdf_path = filedialog.askopenfilename(title="Select PDF file", filetypes=[("PDF files", "*.pdf")])
    if pdf_path:
        output_folder = filedialog.askdirectory(title="Folder to save")
        if output_folder:
            output_prefix = simpledialog.askstring("Input", "suffix of output file name:")
            keyword = simpledialog.askstring("Input", "Keyword :")
            if output_prefix and keyword:
                process_pdf(pdf_path, output_folder, output_prefix, keyword)

def extract_all_pdfs_in_folder():
    folder_path = filedialog.askdirectory(title="Select PDF file")
    if folder_path:
        output_folder = filedialog.askdirectory(title="Folder to save")
        if output_folder:
            output_prefix = simpledialog.askstring("Input", "suffix of output file name:")
            keyword = simpledialog.askstring("Input", "Keyword to find :")
            if output_prefix and keyword:
                pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))
                for pdf_file in pdf_files:
                    process_pdf(pdf_file, output_folder, output_prefix, keyword)

def main():
    root = tk.Tk()
    root.title("PDF Keyword Page Extractor")

    extract_single_btn = tk.Button(root, text="Extract Pages from Single PDF by Keyword", command=extract_single_pdf)
    extract_single_btn.pack(pady=10)

    extract_all_btn = tk.Button(root, text="Extract Pages from All PDFs in Folder by Keyword", command=extract_all_pdfs_in_folder)
    extract_all_btn.pack(pady=10)

    root.mainloop()

if __name__ == "__main__":
    main()

### [2. OCR using PyMuPDF]

**1. Input PDF_FILE_PATH and keyword**

In [1]:
import tkinter as tk
from tkinter import filedialog, simpledialog
import fitz  # PyMuPDF
import glob
import os
 
def find_keyword_start_page(pdf_path, keyword):
    doc = fitz.open(pdf_path)
    start_page = None
    for i, page in enumerate(doc):
        text = page.get_text()
        if keyword in text:
            start_page = i+1
            break
    doc.close()
    return start_page
 
def extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix):
    doc = fitz.open(pdf_path)
    fname = os.path.splitext(os.path.basename(pdf_path))[0]
    output_filename = os.path.join(output_folder, f'{fname}_{output_prefix}.pdf')
    doc.select(range(start_page - 1, end_page))  # page range
    doc.save(output_filename)
    doc.close()
    print(f'Created: {output_filename}')
    
def process_pdf(pdf_path, output_folder, output_prefix, keyword):
    start_page = find_keyword_start_page(pdf_path, keyword)
    if start_page:
        doc = fitz.open(pdf_path)
        end_page = len(doc)  # 문서의 총 페이지 수
        extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix)
        print(f'Keyword "{keyword}" found in {pdf_path}')
        doc.close()
    else:
        print(f'Keyword "{keyword}" not found in {pdf_path}')
        
def extract_single_pdf():
    pdf_path = filedialog.askopenfilename(title="Select PDF file", filetypes=[("PDF files", "*.pdf")])
    if pdf_path:
        output_folder = filedialog.askdirectory(title="Folder to save")
        if output_folder:
            output_prefix = simpledialog.askstring("Input", "suffix of output file name:")
            keyword = simpledialog.askstring("Input", "Keyword :")
            if output_prefix and keyword:
                process_pdf(pdf_path, output_folder, output_prefix, keyword)

def extract_all_pdfs_in_folder():
    folder_path = filedialog.askdirectory(title="Select PDF file")
    if folder_path:
        output_folder = filedialog.askdirectory(title="Folder to save")
        if output_folder:
            output_prefix = simpledialog.askstring("Input", "suffix of output file name:")
            keyword = simpledialog.askstring("Input", "Keyword to find:")
            if output_prefix and keyword:
                pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))
                for pdf_file in pdf_files:
                    process_pdf(pdf_file, output_folder, output_prefix, keyword)

def main():
    root = tk.Tk()
    root.title("PDF Keyword Page Extractor with PyMuPDF")
 
    extract_single_btn = tk.Button(root, text="Extract Pages from Single PDF by Keyword", command=extract_single_pdf)
    extract_single_btn.pack(pady=10)
 
    extract_all_btn = tk.Button(root, text="Extract Pages from All PDFs in Folder by Keyword", command=extract_all_pdfs_in_folder)
    extract_all_btn.pack(pady=10)
 
    root.mainloop()
    
if __name__ == "__main__":
    main()

Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Temp\ipykernel_5260\2708184998.py", line 58, in extract_all_pdfs_in_folder
    process_pdf(pdf_file, output_folder, output_prefix, keyword)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_5260\2708184998.py", line 32, in process_pdf
    extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_5260\2708184998.py", line 22, in extract_pages
    doc.select(range(start_page - 1, end_page))  # page range
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\fitz\__init__.py", line 5418, in select
    retainpages(pdf, pyliste)
  File "C:\Users\user\anaconda3\Lib\site-packages\fitz\__init__.py", line 20962, in retainpages
    if strip_outlines(

In [13]:
# test code
import fitz  # PyMuPDF

def find_keyword_end_page(pdf_path, keyword):
    doc = fitz.open(pdf_path)
    end_page = None
    for i, page in enumerate(doc):
        text = page.get_text()
        if keyword in text:
            end_page = i
            break
    doc.close()
    return end_page

# test code
PDF_FILE_PATH = r"C:\Users\user\Desktop\sample\자격증_결제서류_연수일지.pdf"
keyword = "충원요청서"
find_keyword_end_page(PDF_FILE_PATH, keyword)

In [16]:
import tkinter as tk
from tkinter import filedialog, simpledialog
import fitz  # PyMuPDF
import glob
import os
import shutil

def find_keyword_end_page(pdf_path, keyword):
    try:
        doc = fitz.open(pdf_path)
    except Exception as e:
        print(f"Error opening PDF file: {e}")
        return None

    end_page = None
    for i, page in enumerate(doc):
        text = page.get_text()
        if keyword in text:
            end_page = i
            break
            
    if end_page is None:  # 키워드가 발견되지 않은 경우
        end_page = len(doc)  # 문서의 전체 페이지 수를 반환
    doc.close()
    return end_page

def extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix):
    doc = fitz.open(pdf_path)
    fname = os.path.splitext(os.path.basename(pdf_path))[0]
    output_filename = os.path.join(output_folder, f'{fname}_{output_prefix}.pdf')
    doc.select(range(start_page - 1, end_page))  # page range
    doc.save(output_filename)
    doc.close()
    print(f'Created: {output_filename}')

def process_pdf(pdf_path, output_folder, output_prefix, keyword, not_found_folder):
    end_page = find_keyword_end_page(pdf_path, keyword)
    if end_page == len(fitz.open(pdf_path)):  # 키워드가 발견되지 않았을 경우
        # 파일을 not_found_folder로 이동
        dest_path = os.path.join(not_found_folder, os.path.basename(pdf_path))
        shutil.move(pdf_path, dest_path)  # 파일 이동
        print(f'Keyword "{keyword}" not found in {pdf_path}, moved to {dest_path}')
    else:
        extract_pages(pdf_path, 1, end_page, output_folder, output_prefix)
        print(f'Keyword "{keyword}" found in {pdf_path}, extracted up to page {end_page}')
        
def extract_single_pdf():
    pdf_path = filedialog.askopenfilename(title="Select PDF file", filetypes=[("PDF files", "*.pdf")])
    if pdf_path:
        output_folder = filedialog.askdirectory(title="Folder to save")
        if output_folder:
            output_prefix = simpledialog.askstring("Input", "suffix of output file name:")
            keyword = simpledialog.askstring("Input", "Keyword :")
            if output_prefix and keyword:
                process_pdf(pdf_path, output_folder, output_prefix, keyword)

def extract_all_pdfs_in_folder():
    folder_path = filedialog.askdirectory(title="Select PDF file")
    if folder_path:
        output_folder = filedialog.askdirectory(title="Folder to save")
        not_found_folder = filedialog.askdirectory(title="Select folder to save PDFs where keyword is not found")
        if output_folder and not_found_folder:
            output_prefix = simpledialog.askstring("Input", "suffix of output file name:")
            keyword = simpledialog.askstring("Input", "Keyword to find:")
            if output_prefix and keyword:
                pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))
                for pdf_file in pdf_files:
                    process_pdf(pdf_file, output_folder, output_prefix, keyword, not_found_folder)

def main():
    root = tk.Tk()
    root.title("PDF Keyword Page Extractor with PyMuPDF")

    extract_single_btn = tk.Button(root, text="Extract Pages from Single PDF by Keyword", command=extract_single_pdf)
    extract_single_btn.pack(pady=10)

    extract_all_btn = tk.Button(root, text="Extract Pages from All PDFs in Folder by Keyword", command=extract_all_pdfs_in_folder)
    extract_all_btn.pack(pady=10)

    root.mainloop()

if __name__ == "__main__":
    main()

Exception in Tkinter callback
Traceback (most recent call last):
  File "C:\Users\user\anaconda3\Lib\tkinter\__init__.py", line 1948, in __call__
    return self.func(*args)
           ^^^^^^^^^^^^^^^^
  File "C:\Users\user\AppData\Local\Temp\ipykernel_10420\3159671854.py", line 67, in extract_all_pdfs_in_folder
    process_pdf(pdf_file, output_folder, output_prefix, keyword, not_found_folder)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_10420\3159671854.py", line 43, in process_pdf
    extract_pages(pdf_path, 1, end_page, output_folder, output_prefix)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_10420\3159671854.py", line 30, in extract_pages
    doc.select(range(start_page - 1, end_page))  # page range
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\user\anaconda3\Lib\site-packages\fitz\__init__.py", line 5418, in select
    retainpages(pdf, pyliste)
  File "C:\Users\user\anaconda3\Lib\site-packages\fitz\__init__.py", line 20962, in retainpages
    if str

#### 키워드포함 페이지 ~ 끝

In [None]:
import tkinter as tk
from tkinter import filedialog, simpledialog
import fitz  # PyMuPDF
import glob
import os

def find_keyword_start_page(pdf_path, keyword):
    doc = fitz.open(pdf_path)
    start_page = None
    for i, page in enumerate(doc):
        text = page.get_text()
        if keyword in text:
            start_page = i+1
            break
    doc.close()
    return start_page

def extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix):
    doc = fitz.open(pdf_path)
    fname = os.path.splitext(os.path.basename(pdf_path))[0]
    output_filename = os.path.join(output_folder, f'{fname}_{output_prefix}.pdf')
    doc.select(range(start_page - 1, end_page))  # page range
    doc.save(output_filename)
    doc.close()
    print(f'Created: {output_filename}')
    
def process_pdf(pdf_path, output_folder, output_prefix, keyword):
    start_page = find_keyword_start_page(pdf_path, keyword)
    if start_page:
        doc = fitz.open(pdf_path)
        end_page = len(doc)  # 문서의 총 페이지 수
        extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix)
        print(f'Keyword "{keyword}" found in {pdf_path}')
        doc.close()
    else:
        print(f'Keyword "{keyword}" not found in {pdf_path}')

def extract_single_pdf():
    pdf_path = filedialog.askopenfilename(title="Select PDF file", filetypes=[("PDF files", "*.pdf")])
    if pdf_path:
        output_folder = filedialog.askdirectory(title="Folder to save")
        if output_folder:
            output_prefix = simpledialog.askstring("Input", "suffix of output file name:")
            keyword = simpledialog.askstring("Input", "Keyword :")
            if output_prefix and keyword:
                process_pdf(pdf_path, output_folder, output_prefix, keyword)

def extract_all_pdfs_in_folder():
    folder_path = filedialog.askdirectory(title="Select PDF file")
    if folder_path:
        output_folder = filedialog.askdirectory(title="Folder to save")
        if output_folder:
            output_prefix = simpledialog.askstring("Input", "suffix of output file name:")
            keyword = simpledialog.askstring("Input", "Keyword to find:")
            if output_prefix and keyword:
                pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))
                for pdf_file in pdf_files:
                    process_pdf(pdf_file, output_folder, output_prefix, keyword)

def main():
    root = tk.Tk()
    root.title("PDF Keyword Page Extractor with PyMuPDF")

    extract_single_btn = tk.Button(root, text="Extract Pages from Single PDF by Keyword", command=extract_single_pdf)
    extract_single_btn.pack(pady=10)

    extract_all_btn = tk.Button(root, text="Extract Pages from All PDFs in Folder by Keyword", command=extract_all_pdfs_in_folder)
    extract_all_btn.pack(pady=10)

    root.mainloop()

if __name__ == "__main__":
    main()

### Extraction code using page num

In [None]:
# import tkinter as tk
# from tkinter import filedialog, simpledialog
# from PyPDF2 import PdfReader, PdfWriter
# import glob
# import os

# def extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix):
#     pdf_reader = PdfReader(pdf_path)
#     pdf_writer = PdfWriter()
#     fname = os.path.splitext(os.path.basename(pdf_path))[0]
#     output_filename = os.path.join(output_folder, f'{output_prefix}_{fname}.pdf')
#     for page in range(start_page - 1, end_page):
#         pdf_writer.add_page(pdf_reader.pages[page])
#     with open(output_filename, 'wb') as out_file:
#         pdf_writer.write(out_file)
#     print(f'Created: {output_filename}')

# def extract_single_pdf():
#     pdf_path = filedialog.askopenfilename(title="Select PDF file", filetypes=[("PDF files", "*.pdf")])
#     if pdf_path:
#         start_page = simpledialog.askinteger("Input", "Start page:", minvalue=1)
#         end_page = simpledialog.askinteger("Input", "End page:", minvalue=start_page)
#         output_folder = filedialog.askdirectory(title="Select Output Folder")
#         if output_folder:
#             output_prefix = simpledialog.askstring("Input", "Output file prefix:")
#             if start_page and end_page and output_prefix:
#                 extract_pages(pdf_path, start_page, end_page, output_folder, output_prefix)

# def extract_all_pdfs_in_folder():
#     folder_path = filedialog.askdirectory(title="Select Folder")
#     if folder_path:
#         start_page = simpledialog.askinteger("Input", "Start page:", minvalue=1)
#         end_page = simpledialog.askinteger("Input", "End page:", minvalue=start_page)
#         output_folder = filedialog.askdirectory(title="Select Output Folder")
#         if output_folder:
#             output_prefix = simpledialog.askstring("Input", "Output file prefix:")
#             if start_page and end_page and output_prefix is not None:
#                 extract_pages_from_all_pdfs(folder_path, start_page, end_page, output_folder, output_prefix)

# def extract_pages_from_all_pdfs(folder_path, start_page, end_page, output_folder, output_prefix):
#     pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))
#     for pdf_file in pdf_files:
#         extract_pages(pdf_file, start_page, end_page, output_folder, output_prefix)

# def main():
#     root = tk.Tk()
#     root.title("PDF Page Extractor")

#     extract_single_btn = tk.Button(root, text="Extract Pages from Single PDF", command=extract_single_pdf)
#     extract_single_btn.pack(pady=10)

#     extract_all_btn = tk.Button(root, text="Extract Pages from All PDFs in Folder", command=extract_all_pdfs_in_folder)
#     extract_all_btn.pack(pady=10)

#     root.mainloop()

# if __name__ == "__main__":
#     main()
