In [1]:
# Install the third-party packages this notebook relies on.
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.
%pip install -U pypdfium2
%pip install matplotlib
%pip install opencv-python
%pip install pytesseract
%pip install tqdm
%pip install spacy
%pip install nltk
%pip install paddleocr
%pip install paddlepaddle




In [3]:
import pypdfium2 as pdfium
import matplotlib.pyplot as plt
from PIL import Image
from io import BytesIO
from pytesseract import image_to_string
import re

In [4]:
def convert_pdf_to_images(file_path, scale=300/72):
    """Render every page of a PDF and return the pages as JPEG-encoded bytes.

    Args:
        file_path: Location of the PDF; handed unchanged to ``pdfium.PdfDocument``.
        scale: Render scale factor forwarded to pypdfium2. The default is
            300/72 (presumably chosen to get roughly 300 DPI output — confirm).

    Returns:
        A list with one single-entry dict per page, ``{page_index: jpeg_bytes}``,
        in page order with 0-based page indices.
    """
    pdf_file = pdfium.PdfDocument(file_path)
    list_final_images = []

    try:
        for page_index in range(len(pdf_file)):
            # Render page by page: pypdfium2 documents PdfDocument.render() as
            # deprecated and recommends PdfPage.render() instead.
            page = pdf_file[page_index]
            try:
                # Keep the bitmap referenced until the image has been encoded,
                # since the PIL image may be a view onto the bitmap's buffer.
                bitmap = page.render(scale=scale)
                image = bitmap.to_pil()

                image_byte_array = BytesIO()
                image.save(image_byte_array, format='jpeg', optimize=True)
            finally:
                page.close()

            list_final_images.append({page_index: image_byte_array.getvalue()})
    finally:
        # Release the document handle even when rendering or encoding fails.
        pdf_file.close()

    return list_final_images

In [5]:
def display_images(list_dict_final_images):
    """Show each page image in its own matplotlib figure.

    Args:
        list_dict_final_images: List of single-entry dicts whose (first) value
            is the encoded image bytes of one page, as produced by
            ``convert_pdf_to_images``.

    Pages are titled by their 1-based position in the list. Each figure is
    sized to the image's pixel dimensions divided by 100.
    """
    all_images = [list(data.values())[0] for data in list_dict_final_images]

    for page_number, image_bytes in enumerate(all_images, start=1):

        image = Image.open(BytesIO(image_bytes))
        fig, ax = plt.subplots(figsize=(image.width / 100, image.height / 100))

        ax.set_title(f"----- Page Number {page_number} -----")
        ax.imshow(image)
        ax.axis("off")
        plt.show()
        # Explicitly release the figure so long documents do not pile up
        # large open figures in memory.
        plt.close(fig)

In [6]:
# display_images(list_of_images)

In [7]:
def extract_text_with_pytesseract(list_dict_final_images):
    """Run Tesseract OCR over every page image and return the combined text.

    Args:
        list_dict_final_images: List of single-entry dicts whose first value
            is the encoded image bytes of one page.

    Returns:
        The OCR text of all pages, in list order, joined with newlines.
    """
    # Pull the image bytes out of every page dict before any OCR work starts.
    page_payloads = []
    for page_entry in list_dict_final_images:
        page_payloads.append([*page_entry.values()][0])

    page_texts = [
        str(image_to_string(Image.open(BytesIO(payload))))
        for payload in page_payloads
    ]

    return "\n".join(page_texts)

In [9]:
def ocr(pdf_path):
    """Return the OCR text of a PDF.

    The PDF at ``pdf_path`` is rendered to page images with
    ``convert_pdf_to_images`` and those images are read with
    ``extract_text_with_pytesseract``.
    """
    return extract_text_with_pytesseract(convert_pdf_to_images(pdf_path))

In [51]:
# OCR both versions of the document to obtain the two texts to compare.
# NOTE(review): text_org / text_mod are reassigned to hard-coded sample text
# further down in this notebook, which replaces these OCR results when the
# cells run top to bottom — confirm which pair is meant to be compared.
text_org = ocr('MoU_AI_Cardamom.pdf')
text_mod = ocr('MoU_AI_Cardamomv2.pdf')

In [38]:
# Hard-coded sample "original" text for trying out the sentence comparison.
# NOTE(review): this reassigns text_org, replacing the OCR result assigned
# earlier in the notebook.
text_org = '''The Wonders of Nature

Nature has always been a source of inspiration for humankind. The vast forests, the mighty oceans, and the towering mountains have captivated the imagination of poets, artists, and explorers alike. Every element in nature serves a purpose and contributes to the delicate balance of our ecosystem.

The rustling of leaves in the wind is like a soothing lullaby that can calm the most troubled mind. Many people find solace in nature, and it has been proven to have healing effects on the human spirit.

In the heart of the forest, where sunlight barely reaches the ground, life thrives in a world of its own. The forest floor, covered in moss and fallen leaves, is a testament to the cycle of life and death. Nature teaches us the importance of resilience and adaptation.

However, the rapid pace of human development poses a threat to these natural wonders. Deforestation, pollution, and climate change are some of the challenges that our planet faces. It is crucial for us to take immediate action to preserve the beauty and vitality of our natural world for future generations.

'''

In [48]:
# Hard-coded sample "modified" text: an edited variant of the sample original
# (one sentence dropped, one added, several reworded).
# NOTE(review): this reassigns text_mod, replacing the OCR result assigned
# earlier in the notebook.
text_mod = '''
The Wonders of Nature

Nature has always been a source of inspiration for humankind. The vast forests, the mighty oceans, and the towering mountains have captivated the imagination of poets, artists, and explorers alike. Every element in nature serves a purpose and contributes to the delicate balance of our ecosystem.

Many people find solace in nature, and it has been proven to have healing effects on the human spirit. I love spending time with nature.

In the dense heart of the forest, where sunlight barely penetrates, life thrives in a world of its own. The forest floor, covered in moss and fallen leaves, is a testament to the cycle of life and death. Nature teaches us the importance of resilience and adaptation.

Yet, the rapid pace of human development poses a serious threat to these natural wonders. Deforestation, pollution, and climate change are just a few of the challenges that our planet faces. It is crucial for us to take immediate and effective action to preserve the beauty and vitality of our natural world for future generations.
'''

In [42]:
import difflib
from nltk import sent_tokenize
def split_into_sentences(text):
    """Strip enumeration markers, collapse whitespace and split into sentences.

    Tokens that start at the beginning of the text or after whitespace and
    look like dotted numbering ("1.", "2.3.") or an "I"-initial Roman numeral
    followed by a dot ("I.", "II.", "IV.", "IX.") are deleted together with
    any whitespace after them. All remaining whitespace runs (including
    newlines) are collapsed to single spaces before NLTK's ``sent_tokenize``
    is applied.

    NOTE(review): the pattern is not restricted to list positions, so a number
    or "I" that simply ends a sentence (e.g. "... in 2020. ") is removed as
    well — confirm this is acceptable for the documents being compared.

    Returns:
        The list of sentences produced by ``sent_tokenize``.
    """
    enumerator = re.compile(r'(?<!\S)(\d+(\.\d+)*\.\s*|I[VX]?[I]{0,3}\.\s*)')
    without_markers = enumerator.sub('', text)

    # split()/join normalises every whitespace run and trims both ends.
    collapsed = ' '.join(without_markers.split())

    return sent_tokenize(collapsed)
    
def compare_sentences(text1, text2, lower_threshold=0.5, upper_threshold=0.98):
    """Classify the sentences of two texts as matched, modified, added or deleted.

    Both texts are split with ``split_into_sentences``. Each sentence of
    ``text2`` is paired greedily, in order, with the most similar sentence of
    ``text1`` that has not been paired yet (``difflib.SequenceMatcher`` ratio;
    the earliest sentence wins ties).

    Args:
        text1: Original text.
        text2: Modified text.
        lower_threshold: Minimum similarity for two sentences to be paired.
        upper_threshold: Minimum similarity for a pair to count as "Match"
            rather than "Modified".

    Returns:
        A list of tuples ``(index, status, similarity, sent1, sent2)``:
        * "Match" / "Modified": ``index`` is the 0-based position in ``text2``.
        * "Added": no partner reached ``lower_threshold``; ``similarity`` is
          reported as 0.0 and ``sent1`` is ``None``.
        * "Deleted": appended after all ``text2`` entries for every unpaired
          ``text1`` sentence; ``index`` and ``sent2`` are ``None``.
    """
    sentences1 = split_into_sentences(text1)
    sentences2 = split_into_sentences(text2)

    results = []
    matched_indices1 = set()

    # SequenceMatcher caches its analysis of the second sequence, so one
    # matcher is reused: seq2 is set once per text2 sentence and only seq1
    # changes in the inner loop. Argument order (sent1, sent2) is preserved,
    # which matters because ratio() can depend on it.
    matcher = difflib.SequenceMatcher(None)

    # Compare each sentence in text2 against all still-unpaired sentences in text1
    for idx2, sent2 in enumerate(sentences2):
        matcher.set_seq2(sent2)
        best_similarity = 0.0
        best_idx1 = None

        for idx1, sent1 in enumerate(sentences1):
            if idx1 in matched_indices1:
                continue

            matcher.set_seq1(sent1)
            similarity = matcher.ratio()

            if similarity > best_similarity:
                best_similarity = similarity
                best_idx1 = idx1

        # best_idx1 stays None when no candidate scored above 0; without this
        # guard a non-positive lower_threshold would record an empty result.
        if best_idx1 is not None and best_similarity >= lower_threshold:
            status = "Match" if best_similarity >= upper_threshold else "Modified"
            results.append((idx2, status, best_similarity, sentences1[best_idx1], sent2))
            matched_indices1.add(best_idx1)
        else:
            results.append((idx2, "Added", 0.0, None, sent2))

    # Any text1 sentence that was never paired has been removed from text2
    for idx1, sent1 in enumerate(sentences1):
        if idx1 not in matched_indices1:
            results.append((None, "Deleted", 0.0, sent1, None))

    return results

# Compare the two texts and print one entry per sentence-level result.
results = compare_sentences(text_org, text_mod)

for index, result, similarity, sent1, sent2 in results:
    # "Deleted" entries carry index None, so they are labelled N/A.
    print(f"Sentence {index + 1 if index is not None else 'N/A'}: {result} (Similarity: {similarity:.2f})")
    print(f"Original: {sent1}")
    print(f"Modified: {sent2}")
    print("-" * 50)

Sentence 1: Match (Similarity: 1.00)
Original: The Wonders of Nature Nature has always been a source of inspiration for humankind.
Modified: The Wonders of Nature Nature has always been a source of inspiration for humankind.
--------------------------------------------------
Sentence 2: Match (Similarity: 1.00)
Original: The vast forests, the mighty oceans, and the towering mountains have captivated the imagination of poets, artists, and explorers alike.
Modified: The vast forests, the mighty oceans, and the towering mountains have captivated the imagination of poets, artists, and explorers alike.
--------------------------------------------------
Sentence 3: Match (Similarity: 1.00)
Original: Every element in nature serves a purpose and contributes to the delicate balance of our ecosystem.
Modified: Every element in nature serves a purpose and contributes to the delicate balance of our ecosystem.
--------------------------------------------------
Sentence 4: Match (Similarity: 1.00)
O

In [15]:
# %pip installs into the environment of the running kernel.
%pip install reportlab

Collecting reportlab
  Downloading reportlab-4.2.2-py3-none-any.whl.metadata (1.4 kB)
Downloading reportlab-4.2.2-py3-none-any.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hInstalling collected packages: reportlab
Successfully installed reportlab-4.2.2


In [22]:
from reportlab.lib.pagesizes import letter
from reportlab.lib import colors
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer

def create_pdf(results, filename="comparison_report.pdf"):
    """Write a PDF report listing every modified, deleted and added sentence.

    Args:
        results: Iterable of ``(index, result, similarity, para1, para2)``
            tuples as returned by ``compare_sentences``. Entries whose
            ``result`` is not "Modified", "Deleted" or "Added" are skipped.
        filename: Path of the PDF to create.

    Each reported entry gets a bold heading, the original and/or modified
    sentence, and a blank gap. If nothing is reported, the PDF contains a
    single "No differences found." line.
    """
    # Standard-library helper; imported here so the function is self-contained.
    from xml.sax.saxutils import escape

    doc = SimpleDocTemplate(filename, pagesize=letter)
    normal_style = getSampleStyleSheet()["Normal"]
    story = []

    for index, result, similarity, para1, para2 in results:
        if result not in ("Modified", "Deleted", "Added"):
            continue

        # "Deleted" entries have no position in the modified text (index None).
        heading = f"Sentence {index + 1 if index is not None else 'N/A'}: {result}"
        if result == "Modified":
            heading += f" (Similarity: {similarity:.2f})"
        story.append(Paragraph(f"<b>{heading}</b>", normal_style))

        # Paragraph text is parsed as XML-like markup, so sentence text must be
        # escaped; otherwise '<' or '&' in the source text can break the build
        # or be interpreted as tags.
        if result in ("Modified", "Deleted"):
            story.append(Paragraph(f"<b>Original:</b> {escape(str(para1))}", normal_style))
        if result in ("Modified", "Added"):
            story.append(Paragraph(f"<b>Modified:</b> {escape(str(para2))}", normal_style))

        story.append(Spacer(1, 12))
        story.append(Spacer(1, 12))

    if not story:
        # Make an all-matching comparison explicit instead of building an empty story.
        story.append(Paragraph("No differences found.", normal_style))

    doc.build(story)


In [50]:
create_pdf(results, "comparison_report.pdf")