In [18]:
import difflib
import pymupdf
from multiprocessing import Pool, cpu_count
#import pymupdf4llm #THIS LIBRARY MIGHT BE USEFUL FOR TALKING WITH THE DATA OPTION


In [32]:
class PdfDoc():
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path
        self.doc = pymupdf.open(self.pdf_path)
        self.pages = [page for page in self.doc]
        
    def get_text_in_page(self, page_num):
        return self.pages[page_num].get_textpage()
    
    def get_blocks_in_page(self, page_num, block_num = None):
        if block_num is None:
            return self.pages[page_num].get_text("blocks", sort=True)
        else:
            return self.pages[page_num].get_text("blocks", sort=True)[block_num]
        
    def get_page(self, page_num):
        return self.text_pages[page_num]
    
    def get_words_in_page(self, page_num):
        words = self.pages[page_num].get_text("words", sort=True)
        return self.correct_line_numbers(words)
        
    def correct_line_numbers(self, words):
        previous_block_no = -1
        previous_y0 = -1
        previous_y1 = -1
        word_no = 0
        for index, word in enumerate(words):
            current_block_no, current_y0, current_y1 = word[5], word[1], word[3]
            if current_block_no == previous_block_no and current_y0 == previous_y0 and current_y1 == previous_y1:
                line_no = 0
                word_no += 1
            else:
                line_no = word[6]
                word_no = word[7]
            previous_block_no, previous_y0, previous_y1 = current_block_no, current_y0, current_y1
            words[index] = (*word[:6], line_no, word_no)
        return words
    
class PdfOperations:
    def __init__(self, file_names):
        if not isinstance(file_names, list):
            raise ValueError("file_names must be a list of strings representing file paths")
        self.pdf_docs = [PdfDoc(file_name) for file_name in file_names]

    @staticmethod
    def combine_coordinates(coords_list):
        if not coords_list:
            return None
        x0 = min(coord[0] for coord in coords_list)
        y0 = min(coord[1] for coord in coords_list)
        x1 = max(coord[2] for coord in coords_list)
        y1 = max(coord[3] for coord in coords_list)
        return (x0, y0, x1, y1)
    
    @staticmethod
    def get_coordinates(words_info, block_num, start_index, end_index):
        return [word[:4] for word in words_info if word[5] == block_num and start_index <= word[7] < end_index]

    def find_replacements(self, page_dict, block_num):
        
        text_A = page_dict["blocks_A"][block_num][4].strip()
        text_B = page_dict["blocks_B"][block_num][4].strip()
        
        s = difflib.SequenceMatcher(None, text_A.split(), text_B.split())
        opcodes = s.get_opcodes()
        
        replacements = []
        for tag, i1, i2, j1, j2 in opcodes:
            if tag == 'equal':
                continue
            replacement = {'tag': tag}
            
            words_info_A = page_dict["words_A"]
            words_info_B = page_dict["words_B"]
            
            str_start, str_end, words_info = (i1, i2, words_info_A) if tag != 'insert' else (j1, j2, words_info_B)
            
            if tag == 'replace':
                replacement['old_text'] = ' '.join(text_A.split()[i1:i2])
                replacement['new_text'] = ' '.join(text_B.split()[j1:j2])
            elif tag == 'delete':
                replacement['removed_text'] = ' '.join(text_A.split()[i1:i2])
            elif tag == 'insert':
                replacement['inserted_text'] = ' '.join(text_B.split()[j1:j2])
            coordinates = self.get_coordinates(words_info, block_num, str_start, str_end)
            if coordinates:
                if tag == 'replace':
                    replacement['coordinates_A'] = self.combine_coordinates(coordinates)
                    coordinates_B = self.get_coordinates(words_info_B, block_num, j1, j2)
                    if coordinates_B:
                        replacement['coordinates_B'] = self.combine_coordinates(coordinates_B)
                else:
                    replacement['coordinates'] = self.combine_coordinates(coordinates)
            replacements.append(replacement)
        return replacements

    def compare_files(self, id_pdfA, id_pdfB):
        # Method for comparing two pdf documents
        # Input:    Takes in two pdfs ids to be compared, these ids are relative to order in file_names list
        #           The two pdf files will be adressed as A and B, such that differences are reflected from
        #               what differs from B relative to A.
        # Returns:  changes_by_page, dictionary
        changes_by_page = {}
        
        # Max of pages between both documents
        max_pages = max(len(self.pdf_docs[id_pdfA].pages), len(self.pdf_docs[id_pdfB].pages))
        
        for page_num in range(max_pages):

            page_dict = {'blocks_A' : self.pdf_docs[id_pdfA].get_blocks_in_page(page_num),
                         'blocks_B' : self.pdf_docs[id_pdfB].get_blocks_in_page(page_num),
                         'words_A' : self.pdf_docs[id_pdfA].get_words_in_page(page_num),
                         'words_B' : self.pdf_docs[id_pdfB].get_words_in_page(page_num),
                         }
            
            max_blocks = max(len(page_dict['blocks_A']), len(page_dict['blocks_B']))
            for block_num in range(max_blocks):
                
                replacements = self.find_replacements(page_dict, block_num)
                
                if replacements:
                    if page_num not in changes_by_page:
                        changes_by_page[page_num] = {}
                    changes_by_page[page_num][block_num] = replacements
        return changes_by_page

    def apply_changes(self, id_pdfA, id_pdfB):
        for page_num, changes_in_page in changes.items():
            for block_num, changes_in_block in changes_in_page.items():
                for change in changes_in_block:
                    if change['tag'] == 'insert':
                        block_bbox = self.pdf_docs[id_pdfB].get_blocks_in_page(page_num)[block_num][:4]
                        annot = self.pdf_docs[id_pdfB].pages[page_num].add_highlight_annot(block_bbox)
                        annot.set_colors(stroke=(0, 1, 0))  # change the color to green
                        annot.update()
                    elif change['tag'] == 'delete':
                        block_bbox = self.pdf_docs[id_pdfA].get_blocks_in_page(page_num)[block_num][:4]
                        annot = self.pdf_docs[id_pdfA].pages[page_num].add_highlight_annot(block_bbox)
                        annot.set_colors(stroke=(1, 0, 0))  # change the color to red
                        annot.update()
                    else: # 'replace'
                        block_bbox_A = self.pdf_docs[id_pdfA].get_blocks_in_page(page_num)[block_num][:4]
                        block_bbox_B = self.pdf_docs[id_pdfB].get_blocks_in_page(page_num)[block_num][:4]
                        annot_A = self.pdf_docs[id_pdfA].pages[page_num].add_highlight_annot(block_bbox_A)
                        annot_B = self.pdf_docs[id_pdfB].pages[page_num].add_highlight_annot(block_bbox_B)
                        annot_A.set_colors(stroke=(1, 0.749, 0))  # change the color to ambar
                        annot_B.set_colors(stroke=(1, 0.749, 0))  # change the color to ambar
                        annot.update()

    def save_altered_files(self, output_path_A, output_path_B):
        self.pdf_docs[0].doc.save(output_path_A, garbage=4, deflate=True, clean=True)
        self.pdf_docs[1].doc.save(output_path_B, garbage=4, deflate=True, clean=True)
       
    @staticmethod
    def compare_block_with_others(args):
        block_A, threshold_similarity = args
        match_val = 0
        match_block_B = None
        for (page_num_B, block_B) in flat_blocks_B:
            similarity = difflib.SequenceMatcher(None, block_A[1][4], block_B[4]).ratio()
            if similarity > match_val and similarity >= threshold_similarity:
                match_val = similarity
                match_block_B = (page_num_B, block_B)
        if match_block_B is not None:
            return ((block_A[0], block_A[1][-2]), (match_block_B[0], match_block_B[1][-2]))

    def match_blocks(self, id_pdfA, id_pdfB, threshold_similarity=0.75):
        global flat_blocks_B
        flat_blocks_A = [(page_num, block) for page_num in range(len(self.pdf_docs[id_pdfA].pages)) for block in self.pdf_docs[id_pdfA].get_blocks_in_page(page_num)]
        flat_blocks_B = [(page_num, block) for page_num in range(len(self.pdf_docs[id_pdfB].pages)) for block in self.pdf_docs[id_pdfB].get_blocks_in_page(page_num)]

        with Pool(cpu_count()) as p:
            match_list = p.map(pdf_operations.compare_block_with_others, [(block_A, threshold_similarity) for block_A in flat_blocks_A])

        return [match for match in match_list if match is not None]
            
# Example usage
file_names = ['../data/v1.pdf', '../data/v2.pdf', '../data/story_v3.pdf', '../data/story_v4.pdf',]
output_paths = [item.replace(".pdf", "_altered.pdf") for item in file_names]

pdf_operations = PdfOperations(file_names)
#matches = pdf_operations.match_blocks(3, 4)


changes = pdf_operations.compare_files(3,4)
#pdf_operations.apply_changes(0,1)
#pdf_operations.save_altered_files(output_paths[0],output_paths[1])

#changes

IndexError: list index out of range

In [33]:
changes

{0: {0: [{'tag': 'delete',
    'removed_text': 'RISK MANAGEMENT',
    'coordinates': (127.50199890136719,
     40.74897003173828,
     306.2960205078125,
     65.53497314453125)}],
  1: [{'tag': 'insert',
    'inserted_text': 'HERE',
    'coordinates': (180.3986053466797,
     103.24065399169922,
     213.67364501953125,
     119.81221008300781)}],
  8: [{'tag': 'replace',
    'old_text': 'OVERVIEW',
    'new_text': 'UNTS',
    'coordinates_A': (89.33897399902344,
     249.2246551513672,
     154.0903778076172,
     265.7962341308594),
    'coordinates_B': (89.33897399902344,
     249.2246551513672,
     121.90652465820312,
     265.7962341308594)}],
  25: [{'tag': 'replace',
    'old_text': '$291,125,587',
    'new_text': '$27',
    'coordinates_A': (258.0090026855469,
     598.238525390625,
     318.7754211425781,
     611.98681640625),
    'coordinates_B': (258.0090026855469,
     598.238525390625,
     274.57891845703125,
     611.98681640625)}]},
 1: {1: [{'tag': 'delete',
    're

In [None]:
page = pdf_operations.pdf_docs[0].pages[1] # let's take the first page
pix = page.get_pixmap()
pix.save("page_image1.png")  #

IDEAS

In [None]:
def find_match_blocks(self, id_pdfA, id_pdfB):
        # Flatten the blocks and pair them by page number
        flat_blocks_A = [(page_num, block) for page_num, page_blocks in enumerate(self.pdf_docs[id_pdfA].blocks_per_page) for block in page_blocks]
        flat_blocks_B = [(page_num, block) for page_num, page_blocks in enumerate(self.pdf_docs[id_pdfB].blocks_per_page) for block in page_blocks]

        threshold_similarity = 0.75
        match_list = []

        # First pass: compare blocks side by side and filter out highly similar blocks
        prefiltered_blocks_A = []
        prefiltered_blocks_B = []
        for (block_info_A, block_info_B) in zip(flat_blocks_A, flat_blocks_B):
            page_num_A, block_A = block_info_A
            page_num_B, block_B = block_info_B
            similarity = difflib.SequenceMatcher(None, block_A[4], block_B[4]).ratio()
            if similarity < threshold_similarity:
                prefiltered_blocks_A.append(block_info_A)
                prefiltered_blocks_B.append(block_info_B)

        # Second pass: compare each remaining block from pdf_doc1 with each block from pdf_doc2
        for (page_num_A, block_A) in prefiltered_blocks_A:
            match_val = 0
            match_block_B = None
            for (page_num_B, block_B) in prefiltered_blocks_B:
                similarity = difflib.SequenceMatcher(None, block_A[4], block_B[4]).ratio()
                if similarity > match_val:
                    match_val = similarity
                    match_block_B = (page_num_B, block_B)

            # If a match is found, add it to the match_list
            if match_block_B is not None:
                match_list.append(((page_num_A, block_A[-2]), (match_block_B[0], match_block_B[1][-2])))

        return match_list   


In [None]:
page_number = 0 # First page of the document
page = pdf_doc2.doc[page_number]

# Define the rectangle coordinates
rectangle_coords = (258.0090026855469,
     598.238525390625,
     274.57891845703125,
     611.98681640625)

clip_rect = pymupdf.Rect(rectangle_coords)
    
    # Clip the page to the rectangle and extract text
text = page.get_text(clip=clip_rect)

text

In [None]:
import difflib

# Assuming pdf_doc1.blocks_per_page and pdf_doc2.blocks_per_page are lists of pages
flat_blocks_A = [(page_num, block) for page_num, page_blocks in enumerate(pdf_doc1.blocks_per_page) for block in page_blocks]
flat_blocks_B = [(page_num, block) for page_num, page_blocks in enumerate(pdf_doc2.blocks_per_page) for block in page_blocks]

threshold_similarity = 0.75
match_list = []

# Compare each block from pdf_doc1 with each block from pdf_doc2
for (page_num_A, block_A) in flat_blocks_A:
    match_val = 0
    match_block_B = None
    for (page_num_B, block_B) in flat_blocks_B:
        similarity = difflib.SequenceMatcher(None, block_A[4], block_B[4]).ratio()
        if similarity > match_val and similarity >= threshold_similarity:
            match_val = similarity
            match_block_B = (page_num_B, block_B)

    # If a match is found, add it to the match_list
    if match_block_B is not None:
        match_list.append(((page_num_A, block_A[-2]), (match_block_B[0], match_block_B[1][-2])))


####

for index, ((page_A, block_index_A), (page_B, block_index_B)) in enumerate(match_list):
    # Find the block in flat_blocks_A that matches the page number and block index
    block_content_A = next((block for page_num, block in flat_blocks_A if page_num == page_A and block[-2] == block_index_A), None)
    # Find the block in flat_blocks_B that matches the page number and block index
    block_content_B = next((block for page_num, block in flat_blocks_B if page_num == page_B and block[-2] == block_index_B), None)
    
    # Assuming block_content_A and block_content_B are not None and contain the text content at index 4
    if block_content_A is not None and block_content_B is not None:
        print(f"Match {index}: Page {page_A} Block {block_index_A} Content: {block_content_A[4]} with Page {page_B} Block {block_index_B} Content: {block_content_B[4]}")
similarity_matrix = np.zeros((len(flat_blocks_A), len(flat_blocks_B)))

# Compute similarities and store them in the matrix
for i, (_, block_A) in enumerate(flat_blocks_A):
    for j, (_, block_B) in enumerate(flat_blocks_B):
        similarity_matrix[i, j] = difflib.SequenceMatcher(None, block_A[4], block_B[4]).ratio()

# Find the index of the maximum similarity for each block in pdf_doc1
max_similarity_indices = np.argmax(similarity_matrix, axis=1)

# Filter out similarities below the threshold
matches = [(i, j) for i, j in enumerate(max_similarity_indices) if similarity_matrix[i, j] >= threshold_similarity]