In [10]:
from pathlib import Path

import fitz  # PyMuPDF
import pytesseract
from PIL import Image

# Tesseract OCR 경로 설정 (필요 시)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def pdf_to_images(pdf_path, output_folder, dpi=300):
    """
    Render every page of a PDF to a PNG file and return the saved paths.

    :param pdf_path: path of the PDF file to render (str or path-like)
    :param output_folder: directory that receives the PNG files; it is
        created (including parents) when it does not exist yet
    :param dpi: rendering resolution; pages are scaled by ``dpi / 72``
    :return: list of image paths in page order, each named
        ``{output_folder}/{pdf base name}_{1-based page number}.png``
    """
    # Path.stem removes only the final extension and understands the
    # platform's path separators, so "a/b/report.v2.pdf" -> "report.v2".
    file_nm = Path(pdf_path).stem

    # Saving a pixmap fails if the target directory is missing.
    Path(output_folder).mkdir(parents=True, exist_ok=True)

    # The scale factor is the same for every page: PDF's base resolution is 72 DPI.
    zoom = dpi / 72
    matrix = fitz.Matrix(zoom, zoom)

    image_paths = []
    # The context manager closes the document even if rendering raises.
    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            pix = doc[page_num].get_pixmap(matrix=matrix)
            image_path = f"{output_folder}/{file_nm}_{page_num + 1}.png"
            pix.save(image_path)
            image_paths.append(image_path)

    return image_paths

def crop_image(image_path, coordinates, cropped_path):
    """
    Crop a rectangular region out of an image and save it to a new file.

    :param image_path: path of the source image
    :param coordinates: crop box as ``(x1, y1, x2, y2)``, passed unchanged
        to ``Image.crop``
    :param cropped_path: destination file path; its parent directory is
        created (including parents) when it does not exist yet
    :return: ``cropped_path``, unchanged
    """
    # Saving fails if the destination directory is missing.
    Path(cropped_path).parent.mkdir(parents=True, exist_ok=True)

    with Image.open(image_path) as img:
        img.crop(coordinates).save(cropped_path)
    return cropped_path

def perform_ocr(image_path, lang="kor"):
    """
    Run Tesseract OCR on an image file and return the recognised text.

    :param image_path: path of the image to read
    :param lang: Tesseract language code handed to
        ``pytesseract.image_to_string``; defaults to ``"kor"`` (Korean)
    :return: the text returned by Tesseract
    :raises pytesseract.TesseractError: when Tesseract cannot load the data
        file for ``lang`` (e.g. ``kor.traineddata`` is missing from the
        tessdata directory)
    """
    with Image.open(image_path) as img:
        text = pytesseract.image_to_string(img, lang=lang)
    return text

def get_image_size(image_path):
    """
    Report the dimensions of an image file.

    :param image_path: path of the image file
    :return: ``(width, height)`` tuple taken from the image's ``size``
    """
    with Image.open(image_path) as picture:
        dimensions = picture.size
    # Unpack after the file is closed; ``size`` is a (width, height) pair.
    w, h = dimensions
    return w, h

In [17]:
# Render the PDF to page images, then slice each page into left/right halves for OCR.
pdf_path = "quiz_pdf/2024_B.pdf"
output_folder = "quiz_image"
cropped_folder = "quiz_image_cropped"

# Base name of the PDF without directory or extension, used in the cropped file names.
doc_nm = Path(pdf_path).stem

# Both output directories must exist before any image is saved.
for folder in (output_folder, cropped_folder):
    Path(folder).mkdir(parents=True, exist_ok=True)

# 1. Convert the PDF into one image per page.
images = pdf_to_images(pdf_path, output_folder)

# 2. Crop fixed regions (slicing).
# Boxes are (x1, y1, x2, y2) on the rendered page image. These are example
# values that split the page at x=1500 and start at y=270; adjust them to
# the region of interest of the actual PDF.
coordinates_l = (0, 270, 1500, 4040)
coordinates_r = (1500, 270, 3000, 4040)

# Every cropped file in page order (left half before right half), so later
# cells can iterate over all of them instead of only the last pair.
cropped_image_paths = []

# images[1:] skips the first page; numbering restarts at 1, so the files
# named "_1_l"/"_1_r" come from the second page of the PDF.
for idx, image in enumerate(images[1:]):
    cropped_image_path_l = crop_image(image, coordinates_l,
                                      f"{cropped_folder}/{doc_nm}_{idx + 1}_l.png")

    cropped_image_path_r = crop_image(image, coordinates_r,
                                      f"{cropped_folder}/{doc_nm}_{idx + 1}_r.png")

    cropped_image_paths.extend([cropped_image_path_l, cropped_image_path_r])

In [9]:

# 3. Run OCR.
# The slicing cell defines cropped_image_path_l / cropped_image_path_r (the
# halves of the last processed page); read the left half first, then the right.
# NOTE: the default OCR language needs Tesseract's "kor" traineddata to be
# installed, otherwise pytesseract raises TesseractError.
for cropped_image_path in (cropped_image_path_l, cropped_image_path_r):
    extracted_text = perform_ocr(cropped_image_path)
    print("추출된 텍스트:", extracted_text)


TesseractError: (1, 'Error opening data file /opt/homebrew/share/tessdata/kor.traineddata Please make sure the TESSDATA_PREFIX environment variable is set to your "tessdata" directory. Failed loading language \'kor\' Tesseract couldn\'t load any languages! Could not initialize tesseract.')