In [None]:
import os

import cv2
import numpy as np
import pytesseract
from pdf2image import convert_from_bytes


class TextReport:
    """OCR helper that turns a PDF report into plain text.

    Every page of the PDF is rasterised, converted to a binarised grayscale
    image and handed to Tesseract; the recognised text is kept on the instance.
    """

    def __init__(self, file_obj, lang):
        """Store the PDF handle and OCR language, and reset the result holders.

        Args:
            file_obj (file object): File object of the PDF (its ``read()`` must
                return the PDF bytes)
            lang (str): Language of the PDF (fra for french, eng for english)
        """
        self.file_obj = file_obj
        self.lang = lang
        self.image_stack = []  # page images, filled by pdf_to_text()
        self.raw_text = ""  # OCR text of all pages joined with "\n"
        self.text_as_list = []  # raw_text split on "\n"
        # The two attributes below are only initialised here; no method
        # visible in this class fills them.
        self.header_text = []
        self.results_match_dict = {}

    def get_grayscale(self, image):
        """Convert a BGR image to grayscale.

        Args:
            image (numpy array): Image as numpy array, channels in BGR order
        Returns:
            numpy array: grayscale image
        """
        return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    def thresholding(self, image):
        """Binarise a grayscale image with Otsu's threshold.

        Args:
            image (numpy array): Grayscale image as numpy array
        Returns:
            numpy array: thresholded image (pixel values 0 or 255)
        """
        # cv2.threshold returns (threshold_used, image); only the image is kept.
        return cv2.threshold(image, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]

    def _tesseract_config(self):
        """Build the Tesseract option string for this report's language (OEM 1, PSM 1)."""
        return r"-l " + self.lang + r" --oem 1 --psm 1 "

    def pdf_to_text(self, *, show_data=False):
        """Convert the PDF file object to text using Tesseract with language settings.

        Tesseract is run with OEM 1 and PSM 1. Besides returning the text, the
        method updates ``image_stack``, ``raw_text`` and ``text_as_list``.

        Args:
            show_data (bool): When True, also print Tesseract's per-word data
                table for each page. This triggers a second OCR pass per page,
                so it is off by default.
        Returns:
            str: raw text as a string, pages separated by a newline
        """
        # The options do not depend on the page, so build them once.
        ocr_config = self._tesseract_config()
        self.image_stack = convert_from_bytes(self.file_obj.read())
        page_texts = []
        # Loop on each image (page) of the PDF file
        for page in self.image_stack:
            # Reverse the channel axis (RGB -> BGR) to match get_grayscale().
            bgr_page = np.array(page)[:, :, ::-1].copy()
            # Preprocess image
            binary_page = self.thresholding(self.get_grayscale(bgr_page))
            # Tesseract OCR
            page_texts.append(
                pytesseract.image_to_string(binary_page, config=ocr_config)
            )
            if show_data:
                print(pytesseract.image_to_data(binary_page, config=ocr_config))
        self.raw_text = "\n".join(page_texts)
        self.text_as_list = self.raw_text.split("\n")
        return self.raw_text

In [None]:
import sys

pdf_path = "GNE.pdf"
# Optional language from the third argv entry; fall back to french when it is
# absent instead of raising IndexError.
pdf_lang = sys.argv[2] if len(sys.argv) > 2 else "fra"
# Use a context manager so the PDF handle is closed once OCR is finished.
with open(pdf_path, "rb") as pdf_file:
    # NOTE(review): the language stays hard-coded to "fra" as before; pdf_lang
    # is not wired in here -- confirm whether it should be.
    pdf_object = TextReport(pdf_file, lang="fra")
    a = pdf_object.pdf_to_text()