In [5]:
from PyPDF2 import PdfReader
from PIL import Image
import pytesseract
import fitz  # PyMuPDF
import os
import re  # Import the regex module

# Run configuration: the PDF to analyse (relative path) and the Tesseract
# executable that pytesseract should invoke.
pdf_path = "Data/test.pdf"
tesseract_cmd_path = '/usr/bin/tesseract'

# Tell pytesseract which executable to run, using the explicit path above.
pytesseract.pytesseract.tesseract_cmd = tesseract_cmd_path

def extract_afm(text):
    """Return every AFM-like token found in ``text``.

    Two shapes are recognised: a standalone run of exactly nine digits, and
    a two-letter prefix followed by nine or ten digits. All nine-digit hits
    are listed first, then all prefixed hits; duplicates are kept.
    """
    patterns = (
        r'\b\d{9}\b',  # bare 9-digit number
        r'\b[A-Za-z]{2}\d{9,10}\b',  # 2 letters followed by 9 or 10 digits
    )
    found = []
    for pattern in patterns:
        found.extend(re.findall(pattern, text))
    return found

def extract_text_with_pypdf2(pdf_path):
    """Read the text layer of a PDF via PyPDF2.

    Each page that yields non-empty text contributes that text plus a
    trailing newline. Any exception is printed rather than raised, and
    whatever was gathered up to that point is still returned.
    """
    chunks = []
    try:
        for page in PdfReader(pdf_path).pages:
            content = page.extract_text()
            if content:
                chunks.append(content + '\n')
    except Exception as e:
        print(f"Error reading PDF with PyPDF2: {e}")
    return ''.join(chunks)

def extract_text_with_ocr(pdf_path):
    """Extract text from a scanned PDF using OCR.

    Each page is rendered to a temporary PNG under ``TempPics/``, passed to
    Tesseract with the ``eng+ell`` language set, and the PNG is deleted
    again afterwards.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        The OCR text of all processed pages, each followed by a newline.
        If an error occurs it is printed, and the text gathered so far
        (possibly empty) is returned.
    """
    temp_dir = 'TempPics'
    page_texts = []
    try:
        # The context manager closes the document even when a page fails.
        with fitz.open(pdf_path) as doc:
            # Saving the pixmap does not create missing directories.
            os.makedirs(temp_dir, exist_ok=True)
            for page_num, page in enumerate(doc):
                image_filename = f'{temp_dir}/page_{page_num}.png'
                try:
                    page.get_pixmap().save(image_filename)
                    # Close the image handle before the file is deleted.
                    with Image.open(image_filename) as image:
                        text = pytesseract.image_to_string(image, lang='eng+ell')
                    page_texts.append(text + '\n')
                finally:
                    # Never leave a rendered page behind, even on failure.
                    if os.path.exists(image_filename):
                        os.remove(image_filename)
    except Exception as e:
        print(f"Error processing PDF with OCR: {e}")
    return ''.join(page_texts)

# --- Pass 1: embedded text layer (PyPDF2) ---
pdf_text = extract_text_with_pypdf2(pdf_path)
print("Text extracted with PyPDF2:", pdf_text or "No text found.", sep="\n")

# AFM candidates in the PyPDF2 text
afm_from_pdf = extract_afm(pdf_text)
print("AFM(s) found in PyPDF2 text:", afm_from_pdf)

# --- Pass 2: OCR of the rendered pages (for scanned PDFs) ---
ocr_text = extract_text_with_ocr(pdf_path)
print("Text extracted with OCR:", ocr_text or "No text found.", sep="\n")

# AFM candidates in the OCR text
afm_from_ocr = extract_afm(ocr_text)
print("AFM(s) found in OCR text:", afm_from_ocr)

# Currently disabled: merge both result lists and drop duplicates.
#all_afm = afm_from_pdf + afm_from_ocr
#unique_afm = list(set(all_afm))
#print("All AFM(s) found:", all_afm )

Text extracted with PyPDF2:
Afm 151827762  
AFM 151827762  
Α.Φ.Μ 151827762  
ΑΦΜ 151827762  
Αφμ 15182776200  
GL000234567  
NL1234567892  
Us00000000  

AFM(s) found in PyPDF2 text: ['151827762', '151827762', '151827762', '151827762', 'GL000234567', 'NL1234567892']
Error processing PDF with OCR: name 'fimage_filename' is not defined
Text extracted with OCR:
No text found.
AFM(s) found in OCR text: []
