In [None]:
# Load required libraries

import zipfile
import numpy as np
import pandas as pd
# import pdfplumber
from dotenv import load_dotenv
import os
# import pytesseract as pyt
import fitz
import pymupdf
# from pdf2image import convert_from_path
# from PIL import Image, ImageEnhance, ImageFilter
# import cv2
# from tesserocr import PyTessBaseAPI, RIL
# import tesserocr
import json
import pprint
import re
from dotenv import find_dotenv
from dotenv import load_dotenv

# marker functions
from marker.converters.ocr import OCRConverter
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.config.parser import ConfigParser

In [None]:
import torch
# Last expression of the cell: displays whether PyTorch can see a CUDA device.
torch.cuda.is_available()

In [None]:
# Single-file run: push one PDF through marker's OCRConverter and dump the rendered result as JSON.
testfile = "page1-cli-ocr-force-strip-chars"
# NOTE: despite its name, `directory` holds the full path of the input PDF, not a folder.
directory = "/content/testing/" + testfile + ".pdf"

# Options handed to marker's ConfigParser. All flag values are passed as the string "True" here.
config = {
    "output_format": "json",
    "OCRJSONRenderer_extract_images": "True",
    "disable_image_extraction": "True",
    "DEBUG":"True",
    "force_ocr":"True",
    "strip_existing_ocr": "True"
}
config_parser = ConfigParser(config)
converter = OCRConverter(
    config=config_parser.generate_config_dict(),
    artifact_dict=create_model_dict(),
)
rendered = converter(directory)

# The JSON is written next to the input PDF as "marker-<testfile>.json".
output_filename = "/content/testing/" + "marker-" + testfile + ".json"
with open(output_filename, "w", encoding="utf-8") as f:
    f.write(rendered.model_dump_json())

In [None]:
# Test if individual file is repaired (before uploading to marker-pdf)
# quickcheckname = "textract_test"
# doc = pymupdf.open("./docs_to_write_on/"+ quickcheckname + ".pdf")
# print(doc.can_save_incrementally())
# doc.close()
# page=None

## Marker testing

In [None]:
# Read file-list to process

pdfolder = "./docs_to_write_on/"
jsonfolder = "./jsons_to_read/"
json_prefix = "marker-"

# Base names (no extension) of every PDF in the input folder.
# splitext + an extension filter replaces the fixed-width slice, so stray entries
# (checkpoint folders, hidden files, other formats) are ignored instead of being
# turned into bogus names.
nameslist = set(
    os.path.splitext(filename)[0]
    for filename in os.listdir(pdfolder)
    if filename.lower().endswith(".pdf")
)
print(nameslist)

# use this portion to opt in/out files to be run later

opt_out = {'page1test'}
# Sorted so the processing order is reproducible between kernel restarts.
nameslist = sorted(name for name in nameslist if name not in opt_out)
print(nameslist)

# Base names of the marker JSON outputs: drop the ".json" extension and the "marker-"
# prefix (only when it is actually present, so unexpected files show up as mismatches).
json_names = set()
for filename in os.listdir(jsonfolder):
    stem, extension = os.path.splitext(filename)
    if extension.lower() != ".json":
        continue
    json_names.add(stem[len(json_prefix):] if stem.startswith(json_prefix) else stem)

# Names present on only one side (PDF without JSON, or JSON without PDF).
# An empty set means every selected PDF has a matching marker JSON.
verificationset = json_names.symmetric_difference(nameslist)
if verificationset == opt_out:
    # The only leftovers are JSONs of opted-out files: not a problem.
    verificationset = set()
print(verificationset)

In [None]:
# Verify files before uploading to marker or opening with PyMuPDF

def check_ok_for_marker(filename, filedir):
    """Rewrite a PDF in place when PyMuPDF reports it cannot be saved incrementally.

    Args:
        filename: PDF base name without the ".pdf" extension.
        filedir: Folder containing the PDF; concatenated as-is, so it must end
            with a path separator.

    Side effects:
        When the document cannot be saved incrementally, a cleaned copy
        (``garbage=4, deflate=True``) is written next to it as
        "<filename>-fixed.pdf" and then moved over the original file.
        Prints a status line in either case.
    """
    init_file = filedir + filename + ".pdf"
    new_file = filedir + filename + "-fixed" + ".pdf"

    doc = pymupdf.open(init_file)
    try:
        needs_repair = not doc.can_save_incrementally()
        if needs_repair:
            print(f'File: "{filename}" unwritable. Fixing...')
            doc.save(new_file, garbage=4,deflate=True)
    finally:
        # Always release the handle (the original leaked it when nothing needed fixing),
        # and do so before the file is replaced on disk.
        doc.close()

    if needs_repair:
        # Single-step replacement of the original by the repaired copy.
        os.replace(new_file,init_file)
    else:
        print('Nothing to fix!')

# Only repair files when the PDF/JSON cross-check found no mismatches.
if len(verificationset)==0 :   # TODO: change this to check only the files that are not listed in verificationset
    for pdf_file in nameslist:
        check_ok_for_marker(pdf_file,pdfolder)

In [None]:
# Load the top-level 'children' list of every "marker-<name>.json", in the same order as nameslist.
renderedlist = []
for json_file in nameslist:
    with open(jsonfolder + "marker-" + json_file + ".json", "r", encoding="utf-8") as f:
        renderedlist.append(json.load(f)['children'])

# Pretty-print each loaded tree for inspection (note: `json_file` is reused here for the parsed content).
for json_file in renderedlist:
  pprint.pprint(json_file, indent=2)
# print(renderedlist)

In [None]:
# Retrieve specific defined coordinates from JSON, "flatten" data

# Marker block types (leaves only) to extract from the marker output JSON.
# Inspect the output JSON to find new leaf categories and add them to the matching list.
# Leaves whose polygon AND 'html' text are collected:
leaf_categories_with_text = ['Line','Text','TextInlineMath','Caption','ListItem','SectionHeader','TableCell', 'PageFooter','PageHeader']
# Leaves for which only the polygon is collected:
leaf_categories_to_retrieve = ['Picture','TableGroup','Equation', 'Figure', 'Handwriting']

# For data retrieved from parsing with the full marker PDF model. Recursive function to retrieve all coordinates and text of identified objects
def retrieve_specific_boxes_and_flatten(treelist,pull_polygons,pull_text):
  """Walk a marker block tree and collect polygons and HTML text of selected leaves.

  Args:
    treelist: List of block dicts; every block is read through its 'children',
      'block_type', 'id', 'polygon' and (for text leaves) 'html' keys.
    pull_polygons: Block types whose polygon is collected.
    pull_text: Block types whose polygon and 'html' are both collected.

  Returns:
    Tuple ``(coordinates, htmtext)``. Leaves are keyed by their 'id'; polygons
    are lists of point tuples. Everything found under a 'Page' block is nested
    under the integer page number parsed from that block's id, while other
    container blocks are flattened into their parent's level.
  """
  coordinates = {}
  htmtext = {}

  for node in treelist:
    if node['children']:
      # Container block: recurse first, then decide where the results go.
      child_coords,child_text = retrieve_specific_boxes_and_flatten(node['children'],pull_polygons,pull_text)
      if node['block_type']=='Page':
        page_number = int(re.search(r"/page/(\d+)/", node['id']).group(1))
        coordinates[page_number] = child_coords
        htmtext[page_number] = child_text
      else:
        coordinates.update(child_coords)
        htmtext.update(child_text)
      continue

    # Leaf block: keep it only when its type was requested.
    kind = node['block_type']
    if not (kind in pull_polygons or kind in pull_text):
      continue
    coordinates[node['id']] = [tuple(point) for point in node['polygon']]
    if kind in pull_text:
      htmtext[node['id']] = node['html']

  return (coordinates,htmtext)

# For data retrieved from parsing with the OCR-ONLY marker PDF model with the --keep-chars flag active. Recursive function to retrieve all coordinates and 
# text of identified objects. The difference from the full model function are the 'Char' objects. They do not have a 'children' attribute and the retrieval throws an error
# (could technically only use this function but I like to have them differentiated)
def ocr_retrieve_specific_boxes_and_flatten(treelist,pull_polygons,pull_text):
  """Walk an OCR-only marker block tree, including 'Char' blocks, and flatten it.

  Behaves like ``retrieve_specific_boxes_and_flatten`` with one addition:
  'Char' blocks are handled before 'children' is looked up (they have no such
  key) and are always collected, using their 'text' value instead of 'html'.

  Args:
    treelist: List of block dicts.
    pull_polygons: Non-Char leaf types whose polygon is collected.
    pull_text: Non-Char leaf types whose polygon and 'html' are collected.

  Returns:
    Tuple ``(coordinates, htmtext)`` keyed by block id, with the content of
    each 'Page' block nested under its integer page number.
  """
  coordinates = {}
  htmtext = {}

  for node in treelist:
    kind = node['block_type']

    if kind == 'Char':
      # Characters are always kept; their text lives under 'text'.
      coordinates[node['id']] = [tuple(point) for point in node['polygon']]
      htmtext[node['id']] = node['text']
    elif node['children']:
      child_coords,child_text = ocr_retrieve_specific_boxes_and_flatten(node['children'],pull_polygons,pull_text)
      if kind == 'Page':
        page_number = int(re.search(r"/page/(\d+)/", node['id']).group(1))
        coordinates[page_number] = child_coords
        htmtext[page_number] = child_text
      else:
        # Non-page containers are flattened into the current level.
        coordinates.update(child_coords)
        htmtext.update(child_text)
    elif kind in pull_polygons or kind in pull_text:
      coordinates[node['id']] = [tuple(point) for point in node['polygon']]
      if kind in pull_text:
        htmtext[node['id']] = node['html']

  return (coordinates,htmtext)



# Flatten every loaded marker tree; results stay index-aligned with nameslist / renderedlist.
renderedboxes = []
renderedtext = []

for name,red_lines in zip(nameslist,renderedlist):
  # '(chars)' is a regex group, so this matches any file name containing "chars";
  # those files are handled by the Char-aware variant.
  if re.search('(chars)', name):
    all_boxes,all_text = ocr_retrieve_specific_boxes_and_flatten(red_lines,leaf_categories_to_retrieve,leaf_categories_with_text)
  else:
    all_boxes,all_text = retrieve_specific_boxes_and_flatten(red_lines,leaf_categories_to_retrieve,leaf_categories_with_text)


  renderedboxes.append(all_boxes)
  renderedtext.append(all_text) 

# Rich-display the collected polygons and text of each document.
for boxes,texts in zip(renderedboxes,renderedtext):
   display(boxes)
   display(texts)



###############################################################################################################

# Old JSON processing function: worked for the simple OCR model, fails for the full model

# def retrieve_all_boxes(treelist):
#   # recursive function to pick up all the coordinates from any configuration of nested items
#   newdict = {}
#   # location = location + " " + str(nitem)
#   # all_boxes.update({location:item['polygon']})
#   for nitem,item in enumerate(treelist):
#     if (item['children']):
#       newpoly = {'size':[tuple(point) for point in item['polygon']]}
#       newchildren = item['children']
#       newpoly.update(retrieve_all_boxes(newchildren))
#       if item['block_type']=='Page':
#         newid = int(re.search(r"/page/(\d+)/", item['id']).group(1))
#       else: newid = item['id']
#     else:
#       newpoly = item['polygon']
#       newpoly = [tuple(point) for point in newpoly]
#       newid = item['id']
   
#     newdict.update({newid:newpoly})
#   return newdict

# all_boxes = retrieve_all_boxes(red_lines)
# display(all_boxes)

In [None]:
# write coordinate boxes to PDF


def add_boxes(filename, filedir, annotations):
  """Draw one polygon annotation per collected box onto a PDF, saving in place.

  Args:
    filename: PDF base name without the ".pdf" extension.
    filedir: Folder containing the PDF; concatenated as-is, so it must end
      with a path separator.
    annotations: Per-page boxes, indexed by 0-based page number. Each entry
      is a mapping whose values are polygons (sequences of points). Pages
      without an entry are skipped.

  Side effects:
    Repairs the file first when PyMuPDF cannot save it incrementally (fixes
    marker-pdf PIL errors and PyMuPDF 'code=4' errors), then appends the
    annotations with a single incremental save. Prints progress.
  """
  init_file = filedir + filename + ".pdf"
  doc = pymupdf.open(init_file)

  try:
    # check file health and rewrite the file if it cannot take incremental saves
    if not doc.can_save_incrementally():
      print(f'File: "{filename}" unwritable. Fixing...')
      new_file = filedir + filename + "-fixed" + ".pdf"
      doc.save(new_file, garbage=4,deflate=True)
      doc.close()
      os.replace(new_file,init_file)
      doc = pymupdf.open(init_file)

    added = 0
    for i,page in enumerate(doc):
      try:
        lines = annotations[i]
      except (KeyError, IndexError):
        # Nothing was collected for this page.
        continue
      nannots = len(lines)
      for ncoord,polygon in enumerate(lines.values()):
        print(f"Page {i}: Adding annotation {ncoord} / {nannots}",end='\r')
        annot = page.add_polygon_annot(polygon)
        annot.set_colors(stroke=(0.416, 0.416, 1))
        annot.update()
        added += 1

    # One incremental save for the whole document instead of one per annotation:
    # each incremental save appends another update section to the file.
    if added:
      doc.saveIncr()
  finally:
    if not doc.is_closed:
      doc.close()
  print(f'"{filename}" annotated.')


def print_retrieved_text(texttoprint):
  """Print the text of every collected block with its HTML tags stripped.

  Args:
    texttoprint: Two-level mapping (outer value -> mapping of id -> HTML
      string), as produced per document by the flatten functions.

  The patterns are raw strings so that ``\\w`` / ``\\s`` reach the regex
  engine without relying on invalid string escapes, which emit warnings on
  current Python versions.
  """
  closing_tag = r"<(/\w+)([\s\w=':/\.]+)?>"
  opening_tag = r"<(\w+)([\s\-\w='\":/\.]+)?>"
  for item in texttoprint.values():
    for textblock in item.values():
      # Closing tags are removed first, then opening tags with their attributes.
      print(re.sub(opening_tag, "", re.sub(closing_tag, "", textblock)))

# Annotate every selected PDF with its collected boxes, then print the tag-stripped text.
# Relies on nameslist, renderedboxes and renderedtext being index-aligned.
for name,boxes,texts in zip(nameslist,renderedboxes,renderedtext):
  add_boxes(name, pdfolder, boxes)
  print_retrieved_text(texts)

-----

## Kofax: Pulling signatures from processed PDF 
(using Kofax, transform to Word and then back to PDF)

In [None]:
import re, fitz, unicodedata

# ---- helpers ---- 
def norm(s):
    """Return ``str(s)`` in Unicode NFKC form with middle dots turned into periods."""
    folded = unicodedata.normalize('NFKC', str(s))
    # U+00B7 (middle dot) is mapped to a plain '.'
    return folded.replace('\u00B7','.')

# Titles that activate the wipe band: find_heading_band() looks for these as
# substrings of each normalized, upper-cased text line.
HEAD_TRIGGERS = [
  'ΟΙ ΣΥΜΒΑΛΛΟΜΕΝΟΙ', 'Ο ΣΥΜΒΑΛΛΟΜΕΝΟΣ', 'Η ΣΥΜΒΑΛΛΟΜΕΝΗ',
  'Ο ΣΥΝΕΤΑΙΡΟΣ ΠΙΣΤΟΥΧΟΣ', 'ΟΙ ΣΥΝΟΦΕΙΛΕΤΗΣ', 'Ο/ΟΙ ΣΥΝΟΦΕΙΛΕΤΗΣ/ΕΣ',
  'Ο/ΟΙ ΕΓΓΥΗΤΗΣ/ΕΣ', 'Ο ΕΓΓΥΗΤΗΣ', 'Ο ΟΦΕΙΛΕΤΗΣ', 'Ο/ΟΙ ΟΦΕΙΛΕΤΗΣ/ΕΣ'
]

def text_blocks(page):
    """Return the 'blocks' list of the page's 'rawdict' extraction (text and image blocks with bbox).

    Yields an empty list when the extraction is not a dict or has no 'blocks' key.
    """
    raw = page.get_text('rawdict')
    if not isinstance(raw, dict):
        return []
    return raw.get('blocks', [])

def find_heading_band(page, side_margin=10, top_offset=6):
    """Locate the band below the topmost trigger heading on a page.

    Every text line is normalized, upper-cased and checked for any entry of
    HEAD_TRIGGERS. Among the matching lines, the smallest bottom edge is kept.

    Args:
        page: Page object exposing ``rect`` and ``get_text``.
        side_margin: Inset applied to the left and right page edges.
        top_offset: Gap between the heading's bottom edge and the band's top.

    Returns:
        ``(x0, y0, x1, y1)`` running from below the heading down to 8 units
        above the page bottom, or ``None`` when no trigger heading is found or
        the band would be empty.
    """
    heading_bottom = None
    for block in text_blocks(page):
        if block.get('type') != 0:
            continue  # text blocks only
        for line in block.get('lines', []):
            joined = ' '.join([span.get('text','') for span in line.get('spans',[])])
            heading = norm(joined).upper().strip()
            if not any(key in heading for key in HEAD_TRIGGERS):
                continue
            # Collect the vertical edges of every span that carries a bbox.
            edges = []
            for span in line.get('spans', []):
                (x0,y0,x1,y1) = span.get('bbox', (None,None,None,None))
                if x0 is None:
                    continue
                edges += [y0,y1]
            if edges:
                line_bottom = max(edges)  # bottom edge of the line
                if heading_bottom is None or line_bottom < heading_bottom:
                    heading_bottom = line_bottom

    if heading_bottom is None:
        return None

    left = page.rect.x0 + side_margin
    top = heading_bottom + top_offset
    right = page.rect.x1 - side_margin
    bottom = page.rect.y1 - 8
    if top < bottom:
        return (left,top,right,bottom)
    return None

def image_boxes_in_band(page, band=None, bottom_ratio=0.35):
    """Collect bounding boxes of image blocks that fall in the signature area.

    An image block is kept when it (i) intersects ``band`` or (ii) reaches
    into the bottom ``bottom_ratio`` fraction of the page height.

    Args:
        page: Page object exposing ``rect`` and ``get_text``.
        band: Optional ``(x0, y0, x1, y1)`` rectangle to test against.
        bottom_ratio: Fraction of the page height treated as the bottom zone.

    Returns:
        List of ``(x0, y0, x1, y1)`` tuples, in block order.
    """
    threshold_y = page.rect.height*(1.0-bottom_ratio)
    selected = []
    for block in text_blocks(page):
        if block.get('type') != 1:  # image blocks only
            continue
        (x0,y0,x1,y1) = block.get('bbox', (None,None,None,None))
        if x0 is None:
            continue

        reaches_bottom = (y0 >= threshold_y) or (y1 >= threshold_y)

        overlaps_band = False
        if band is not None:
            bx0,by0,bx1,by1 = band
            separated = x1<bx0 or x0>bx1 or y1<by0 or y0>by1
            overlaps_band = not separated

        if overlaps_band or reaches_bottom:
            selected.append((x0,y0,x1,y1))
    return selected

def redact_pdf(input_pdf, output_pdf, do_wipe=True, do_images=True, side_margin=10, top_offset=6, bottom_ratio=0.35, fill='white', draw_labels=False):
    """Redact signature zones on every page of a PDF and save the result.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path of the redacted PDF to write.
        do_wipe: Redact the band found by ``find_heading_band``.
        do_images: Redact image blocks returned by ``image_boxes_in_band``.
        side_margin: Passed to ``find_heading_band``.
        top_offset: Passed to ``find_heading_band``.
        bottom_ratio: Passed to ``image_boxes_in_band``.
        fill: 'white' (case-insensitive) fills with white; anything else fills with black.
        draw_labels: When true, write '[SIGN-BAND]' / '[IMG]' into the redacted areas.
    """
    if str(fill).lower()=='white':
        rgb = (1,1,1)
    else:
        rgb = (0,0,0)
    band_label = '[SIGN-BAND]' if draw_labels else None
    image_label = '[IMG]' if draw_labels else None

    doc = fitz.open(input_pdf)
    for page in doc:
        band = None
        if do_wipe:
            band = find_heading_band(page, side_margin=side_margin, top_offset=top_offset)
        if band:
            page.add_redact_annot(fitz.Rect(band), text=band_label, fill=rgb)
        if do_images:
            for image_box in image_boxes_in_band(page, band=band, bottom_ratio=bottom_ratio):
                page.add_redact_annot(fitz.Rect(*image_box), text=image_label, fill=rgb)
        # Apply the redaction annotations queued for this page.
        page.apply_redactions()
    doc.save(output_pdf, garbage=4, deflate=True)
    doc.close()


# Example run with default options.
# NOTE: the Utilities section defines another `redact_pdf` with a different signature;
# whichever cell ran last decides which function this name refers to.
redact_pdf('test_text_and_signaturesdoc.pdf',"out_signzones.pdf")

-----

## Azure: DocumentIntelligence for OCR

and integrating an NER model on the text

In [None]:
# Read the Azure credentials from ./config/.env so they are never hard-coded in the notebook.
load_dotenv(dotenv_path='./config/.env')

# Both values are None when the corresponding variable is not set.
AZURE_ENDPOINT = os.getenv('AZURE_ENDPOINT')
AZURE_KEY = os.getenv('AZURE_KEY')

In [None]:
# Bundle the endpoint and key loaded in the previous cell.
# Avoid displaying this dict: the cell output would contain the key.
AZURE_CONFIG = {
    "endpoint" : AZURE_ENDPOINT,
    "key" : AZURE_KEY
}

-----

## TesserOCR: Testing library to get coordinates

After pre-OCR-ing with Tungsten/Kofax PowerPDF

In [None]:
# Standard Tesseract run, also asking Tesseract to write out its processed images (unverified)
# NOTE: `pyt` comes from `import pytesseract as pyt`, which is commented out in the imports cell;
# re-enable it there before running this section.

# Define Tesseract exe location (Windows install path)
pyt.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
pyt.get_languages()

# PDF used by the OCR cells below and by the "Quickly turn PDF to images" utility cell.
current_file = "testtest.pdf"



def ocr_scanned_pdf(pdf_path):
    """OCR every page of a scanned PDF with Tesseract and return the combined text.

    Args:
        pdf_path: Path of the PDF; each page is rasterized with ``convert_from_path``.

    Returns:
        One string holding every page's text, each page introduced by a
        "--- Page N ---" header. (Previously the function always returned an
        empty string because the accumulation step was commented out.)

    Side effects:
        Prints the OCR result of each page as it is produced.
    """
    page_texts = []
    images = convert_from_path(pdf_path)
    for i, image in enumerate(images):
        page_text = pyt.image_to_string(image, lang="ell",config="--psm 4 --oem 3 -c tessedit_write_images=true")
        print(f"OCR result for page {i+1}: {page_text}")
        page_texts.append(f"--- Page {i+1} ---\n{page_text}\n\n")
    # Joined once at the end instead of growing a string inside the loop.
    return "".join(page_texts)


# Last expression of the cell: the function's return value becomes the cell output.
ocr_scanned_pdf(current_file)

In [None]:
# Rewrite initial file to contain all coordinates of identified words
# NOTE: `convert_from_path` (pdf2image) is commented out in the imports cell; re-enable it before running.


# One dict per page is appended here by the annotation loop further down.
insertions = []

# open document (stays open until the loop below has finished and closes it)
input_pdf = "testtest.pdf"
doc = fitz.open(input_pdf)

# render the PDF pages to images for Tesseract
images = convert_from_path(input_pdf)
print(images)


def transform_coords(box, scale_x=None, scale_y=None, pad=2):
    """Turn a pixel box into a padded four-point polygon in PDF coordinates.

    Args:
        box: Dict of the form ``{'x': x, 'y': y, 'w': w, 'h': h}`` in image pixels.
        scale_x: Horizontal image-to-PDF scale factor. When omitted, the
            module-level ``scale_x`` is used, which keeps existing
            ``transform_coords(box)`` calls working.
        scale_y: Vertical image-to-PDF scale factor, with the same fallback.
        pad: Margin added on every side of the scaled box.

    Returns:
        List of four ``(x, y)`` tuples: top-left, top-right, bottom-right,
        bottom-left, suitable for a polygon annotation.

    Raises:
        NameError: If a scale is neither passed nor defined at module level.
    """
    if scale_x is None or scale_y is None:
        module_scope = globals()
        try:
            if scale_x is None:
                scale_x = module_scope['scale_x']
            if scale_y is None:
                scale_y = module_scope['scale_y']
        except KeyError as missing:
            raise NameError(f"name {missing} is not defined; pass scale_x and scale_y explicitly") from None

    x1 = box['x']
    y1 = box['y']
    x2 = x1 + box['w']
    y2 = y1 + box['h']

    left = x1*scale_x - pad
    top = y1*scale_y - pad
    right = x2*scale_x + pad
    bottom = y2*scale_y + pad

    # with polygon annotation:
    coordinates_pdf = [(left,top),(right,top),(right,bottom),(left,bottom)]
    return coordinates_pdf


# For every rendered page image: detect text lines with Tesseract, map each line box
# to PDF coordinates and draw it on the matching PDF page as a polygon annotation.
for j,image in enumerate(images):

    # Appended to `insertions` after the page is processed; nothing fills it at the moment.
    insertions_temp_dict = {}

    # NOTE(review): a new PyTessBaseAPI is created for every page; it could probably be created once outside the loop.
    with PyTessBaseAPI(path=r'C:\Program Files\Tesseract-OCR\tessdata', lang='ell') as api:


        api.SetImage(image)

        # get all sizes
        currentpage = j
        imgsize = image.size
        page = doc[currentpage]
        # image-pixel -> PDF scale factors; transform_coords() reads them as module-level globals
        scale_x = page.rect.width / imgsize[0]
        scale_y = page.rect.height / imgsize[1]

        boxes = api.GetComponentImages(RIL.TEXTLINE, True)
        total_items = len(boxes)
        print('Found {} textline image components.'.format(total_items))
        for i, (im, box, _, _) in enumerate(boxes):

            # restrict recognition to the current line box
            api.SetRectangle(box['x'], box['y'], box['w'], box['h'])
            # recognized text and confidence are computed but not used further down
            ocrResult = api.GetUTF8Text()
            conf = api.MeanTextConf()

            # create box visually on PDF and add embellishments
            pdf_transformed_coordinates = transform_coords(box)
            # print(pdf_transformed_coordinates)
            print(pdf_transformed_coordinates)
            annot = page.add_polygon_annot(pdf_transformed_coordinates)
            annot.set_colors(stroke=(0.416, 0.416, 1))
            annot.update()

            # # add text for visual representation
            # text_rect = fitz.Rect(pdf_transformed_coordinates[0][0],pdf_transformed_coordinates[0][1],pdf_transformed_coordinates[2][0],pdf_transformed_coordinates[2][1])
            # # text_rect = fitz.Rect(100,100,100,100)
            # page.insert_textbox(
            #     text_rect,
            #     ocrResult,
            #     fontsize=3,
            #     fontname="helv",
            #     color=(0, 0, 0),   # black text
            #     align=1            # center align
            # )

            # insertions_temp_dict.update({i:inserted})
            # NOTE(review): an incremental save runs after every single annotation; one save per page or per document would likely be enough.
            doc.saveIncr()
            print(f"\rProgress: {i+1}/{total_items}","\r",end="")
    insertions.append(insertions_temp_dict)




# release the document opened in the setup part of this cell and drop the last page reference
doc.close()
page = None

-----

## Utilities

In [None]:
# Redact specific coordinates

def redact_pdf(filename,filedir, coords, output_path="output1.pdf"):
    """Black out the same rectangle on every page of a PDF and save a copy.

    NOTE: this shares its name with the ``redact_pdf(input_pdf, output_pdf, ...)``
    function of the Kofax section; whichever definition ran last wins.

    Args:
        filename: PDF base name without the ".pdf" extension.
        filedir: Folder containing the PDF; concatenated as-is, so it must end
            with a path separator.
        coords: Rectangle to redact, in a form accepted by ``Page.add_redact_annot``.
        output_path: Where the redacted copy is written (defaults to the
            previously hard-coded "output1.pdf").

    Side effects:
        Prints each page's width and height and writes ``output_path``.
    """
    init_file = filedir + filename + ".pdf"
    doc = fitz.open(init_file)
    try:
        for page in doc:
            print(page.rect.width,page.rect.height)
            page.add_redact_annot(coords, fill=(0, 0, 0))
            # Without this call only a redaction *annotation* is stored and the
            # underlying content stays in the file.
            page.apply_redactions()
        doc.save(output_path)
    finally:
        doc.close()


def annotate_specific_coords(filename,filedir,coords):
    """Draw a single polygon annotation on the first page of a PDF, saving in place.

    Args:
        filename: PDF base name without the ".pdf" extension.
        filedir: Folder containing the PDF; concatenated as-is.
        coords: Polygon points handed to ``Page.add_polygon_annot``.
    """
    pdf_path = "".join((filedir, filename, ".pdf"))
    document = pymupdf.open(pdf_path)
    # Keep the page in a variable while the annotation is being edited.
    first_page = document[0]
    outline = first_page.add_polygon_annot(coords)
    outline.set_colors(stroke=(0.416, 0.416, 1))
    outline.update()
    document.saveIncr()
    document.close()
    first_page = None



# Four corner points of one rectangle (top-left, top-right, bottom-right, bottom-left).
specific_annot = [ (59.981864717805415, 287.13748905389093),
                                 (278.1658976288226, 287.13748905389093),
                                 (278.1658976288226, 295.38425766901577),
                                 (59.981864717805415, 295.38425766901577)]

# Draws the rectangle on the first page of ./docs_to_write_on/page1test.pdf (the file is modified in place).
annotate_specific_coords("page1test","./docs_to_write_on/",specific_annot)

In [None]:
# Quickly turn PDF to images
# NOTE: depends on `current_file` from the Tesseract section and on `convert_from_path`,
# whose import is commented out in the imports cell.

pages = convert_from_path(current_file)
# Writes out0.jpg, out1.jpg, ... into the working directory (`page` is an image here, not a PDF page).
for count, page in enumerate(pages):
    page.save(f'out{count}.jpg', 'JPEG')

In [None]:
# PDF/Image pre-processing to improve Tesseract results





# img = Image.open("./testactual_page-0001.jpg")
# print(img)
# osd = pyt.image_to_osd(img,output_type="dict")
# print(osd)


# NOTE: `cv2` and `pyt` are commented out in the imports cell; re-enable them before running.
# "./out0.jpg" matches the naming used by the "Quickly turn PDF to images" cell.
img = cv2.imread("./out0.jpg")
# upscale 7x in both directions
img = cv2.resize(img,(0,0),fx=7,fy=7)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# light 3x3 Gaussian blur before thresholding
blur = cv2.GaussianBlur(gray, (3,3), 0)
# inverted binary threshold, threshold value chosen by Otsu's method
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# morphological opening with a 3x3 rectangular kernel
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
# undo the inversion introduced by THRESH_BINARY_INV
invert = 255 - opening

# im = img.filter(ImageFilter.MedianFilter())
# enhancer = ImageEnhance.Contrast(im)
# im = enhancer.enhance(2)
# im = im.convert('1')
# im.save('temp2.jpg')

# orientation / script detection on the cleaned image
osd = pyt.image_to_osd(invert,output_type="dict")
print(osd)

# cv2.imshow('thresh', thresh)
# cv2.imshow('opening', opening)
# cv2.imshow('invert', invert)
# cv2.waitKey()

# NOTE(review): this call uses lang="Greek" while the other Tesseract calls in this notebook use "ell" - confirm which one is intended.
page_text = pyt.image_to_string(invert, lang="Greek",config="--psm 1 -c tessedit_ocr_engine_mode=1")
print(f"OCR result for page 1: {page_text}")

In [None]:
# Break PDF into one file per page (for troubleshooting marker)

import fitz  # PyMuPDF
import os


# Input PDF to split and the folder that receives the single-page files.
goesin = "base_test.pdf"
comesout = "split_pdf"

def split_pdf(input_pdf, output_dir):
    """Write every page of a PDF to its own single-page file.

    Args:
        input_pdf: Path of the PDF to split.
        output_dir: Folder receiving "page_<n>.pdf" files (1-indexed); created
            when it does not exist.

    Side effects:
        Prints the path of each file after saving it.
    """
    source = fitz.open(input_pdf)
    os.makedirs(output_dir, exist_ok=True)

    for page_number in range(1, len(source) + 1):
        page_index = page_number - 1

        # Copy exactly one page into a fresh, empty document.
        single_page = fitz.open()
        single_page.insert_pdf(source, from_page=page_index, to_page=page_index)

        output_path = os.path.join(output_dir, f"page_{page_number}.pdf")
        single_page.save(output_path)
        single_page.close()

        print(f"Saved {output_path}")

    source.close()


# Example usage: writes split_pdf/page_1.pdf, split_pdf/page_2.pdf, ... from base_test.pdf
split_pdf(goesin, comesout)
