In [1]:
import os
from pathlib import Path
import re
import string
import unicodedata
import math
from nltk import ngrams
from statistics import mode
import pandas as pd
import pytesseract
from PIL import Image
import pypdfium2

  machar = _get_machar(dtype)


In [2]:
# --- 1. Normalization Functions ---
def remove_html_tags(text: str) -> str:
    """
    Removes HTML markup (e.g., <p>, <a> tags) that might be in the text.

    A tag is anything from a '<' up to the next '>', matched non-greedily.
    Tags whose attributes wrap onto following lines are removed as well.

    Args:
        text: The raw text, possibly containing HTML tags.

    Returns:
        The text with every '<...>' span removed.
    """
    # re.DOTALL lets '.' match newlines; without it a tag that is broken
    # across lines would be left in the output untouched.
    return re.sub(r'<.*?>', '', text, flags=re.DOTALL)

def remove_whitespace(text: str) -> str:
    """
    Collapses every run of whitespace (spaces, tabs, newlines, ...) into a
    single space and trims whatever space is left at either end.
    """
    whitespace_run = re.compile(r'\s+')
    collapsed = whitespace_run.sub(' ', text)
    return collapsed.strip()

def normalize_accents(text: str) -> str:
    """
    Folds accented characters down to their plain ASCII base letters.

    The text is decomposed with Unicode NFKD, which splits an accented
    character into its base character plus separate combining marks. The
    result is then squeezed through ASCII with errors ignored, which drops
    the marks along with any other character that has no ASCII form.

    Example: 'résumé' -> 'resume'
             'François' -> 'Francois'
             'â' -> 'a'
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', errors='ignore')
    return ascii_only.decode('utf-8')

# --- 2. Removal Functions ---

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text: str) -> str:
    """
    Removes all standard English punctuation marks.

    Punctuation characters are deleted, not replaced by a space, so
    "anti-theft" becomes "antitheft". Line breaks ('\\n' and '\\r') are each
    turned into a space, and the result is stripped at both ends.

    Args:
        text: The text to clean.

    Returns:
        The text without any character from string.punctuation.
    """
    without_punctuation = text.translate(_PUNCTUATION_TABLE)
    # Apostrophes are already gone after the translate step, so no further
    # quote handling is needed; only line breaks remain to be flattened.
    return without_punctuation.replace("\n", " ").replace("\r", " ").strip()

def remove_digits(text: str) -> str:
    """Strips every run of digit characters (regex \\d) out of the text."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Simple, small list of very common stop words (all lowercase).
# Built once here instead of on every call.
_STOP_WORDS = frozenset({
    "a", "an", "the", "is", "are", "and", "or", "to", "of", "in",
    "for", "on", "with", "it", "that", "this", "but", "by"
})


def remove_stopwords(text: str) -> str:
    """
    Removes common English stop words, ignoring letter case.

    The text is split on whitespace and every word whose lowercase form is
    in the stop-word list is dropped. The remaining words keep their
    original casing and are re-joined with single spaces.

    NOTE: For production NLP, use a library like NLTK or spaCy for a more
    comprehensive and efficient stop word list.

    Args:
        text: The text to filter.

    Returns:
        The text without stop words, words separated by single spaces.
    """
    # Compare on the lowercase form so "The" and "AND" are removed as well.
    kept_words = [word for word in text.split() if word.lower() not in _STOP_WORDS]
    return " ".join(kept_words)


# Clean OCR Function
def clean_ocr_text(text: str) -> str:
    """
    Cleans raw OCR output for keyword matching.

    Applies, in order: accent folding, punctuation removal, and whitespace
    collapsing.
    """
    return remove_whitespace(remove_punctuation(normalize_accents(text)))


In [3]:
import numpy as np
from collections import Counter
from typing import Dict, List

def get_char_vector(word: str) -> Dict[str, int]:
    """
    Builds the Bag-of-Characters for a word: a mapping from each lowercase
    character to the number of times it occurs.
    Example: 'apple' -> {'a': 1, 'p': 2, 'l': 1, 'e': 1}
    """
    frequencies = Counter()
    frequencies.update(word.lower())
    return frequencies

def cosine_similarity(word1: str, word2: str) -> float:
    """
    Calculates the cosine similarity between two strings based on their
    Bag-of-Characters (case-insensitive character frequency vectors).

    Character order is ignored, so anagrams ("listen" / "silent") score 1.0
    just like identical strings. Every character counts, including spaces
    inside multi-word phrases.

    Args:
        word1: The first word or phrase (string).
        word2: The second word or phrase (string).

    Returns:
        A float between 0.0 (no characters in common, or an empty input)
        and 1.0 (identical character frequencies).
    """
    # 1. Handle edge cases (empty strings)
    if not word1 or not word2:
        return 0.0

    # 2. Create frequency vectors (Bag-of-Characters)
    vec1 = Counter(word1.lower())
    vec2 = Counter(word2.lower())

    # 3. Dot product: only characters present in both vectors contribute.
    dot_product = sum(count * vec2[char] for char, count in vec1.items() if char in vec2)

    # 4. Squared magnitudes |A|^2 and |B|^2 (exact integers)
    sum_sq1 = sum(count * count for count in vec1.values())
    sum_sq2 = sum(count * count for count in vec2.values())

    # 5. Take one square root of the integer product instead of multiplying
    #    two separately rounded roots. sqrt(2) * sqrt(2) is 2.0000000000000004,
    #    which made identical words score just under 1.0.
    denominator = math.sqrt(sum_sq1 * sum_sq2)
    if denominator == 0:
        return 0.0

    # 6. Clamp so rounding can never push the result above 1.0.
    return min(1.0, dot_product / denominator)


In [30]:
############################################
# AUTO KEYWORDS
############################################
# Keyword lists that quote_classify passes to semantic_search to decide
# whether a form belongs to the "auto" category. Each list is compared
# against OCR text of the matching n-gram size: single tokens, two-word
# phrases and three-word phrases respectively.
auto_words = ["car", "vehicle", "automobile",
              "vin","mvr",
              "collision","comprehensive",
              "commute","odometer",
              "garaging","make",
              "model","uninsured"]

# NOTE(review): "anti-theft device" contains a hyphen, but clean_ocr_text
# strips punctuation from the OCR text before matching, so the OCR side can
# only ever read "antitheft device". A match relies on the fuzzy threshold.
auto_bigrams = ["vehicle identification",
                "anti-theft device",
                "uninsured motorist",
                "accident forgiveness",
                "driver safety",
                "good student",
                "driving record",
                "annual mileage",
                "assigned risk",
                "stated amount"]

auto_trigrams = ["vehicle identification number",
                 "uninsured motorist coverage",
                 "safe driver discount",
                 "financial responsibility proof",
                 "defensive driving course"]

############################################
# PROPERTY KEYWORDS
############################################
# Same structure as the auto lists above; a match on any of these makes
# quote_classify add "prop" to its result.
prop_words = ["dwelling",
              "coinsurance","sqft",
              "masonry","frame",
              "occupancy","hvac",
              "sprinkler","hydrant",
              "flood"]

prop_bigrams = ["replacement cost",
                "square footage",
                "fire district",
                "protection class",
                "roof age",
                "water backup",
                "plumbing updates",
                "foundation type",
                "vandalism coverage",
                "business income",
                "personal property"]

prop_trigrams = ["distance to fire",
                 "business personal property",
                 "masonry veneer construction",
                 "causes of loss",
                 "loss of use"]




########################################################################
# PERSONAL vs. COMMERCIAL KEYWORDS
# In the visible notebook these lists are referenced only from the
# commented-out claim_type logic inside quote_classify, so they currently
# have no effect on the classification result.
#
# NOTE(review): entries are mixed-case while the OCR text is lower-cased;
# this only works because the character vectors are built from lowercase
# text. Hyphenated entries ("Multi-Car", "Anti-Theft Device",
# "Non-Owned Hired Auto") cannot appear verbatim in the OCR text, because
# punctuation is stripped before matching.
########################################################################
# BIGRAMS
personal_auto_bigrams = ["personal auto", "private auto", 
                        "personal car", "private car", 
                        "Primary Driver", "Household Members",
                        "Commute Distance", "Teen Driver",
                        "Personal Use","Multi-Car",
                        "Pleasure Use","Family Vehicle",
                        "Occasional Driver","Good Student",
                        "Student Discount","Uninsured Motorist",
                        "Home Address"]

commercial_auto_bigrams = ["commercial auto", "commercial car",
                          "Federal Tax","Motor Carrier",
                          "Cargo Coverage","Company Name",
                          "DOT Number","Gross Vehicle",
                          "For Hire","Business Operations",
                          "Employee Driver","Fleet Size",
                          "Hazardous Materials","Terminal Address"]



# NOTE(review): "Fire Protection Class" has three words and "Maisonry" has
# one, so neither is a bigram; "Maisonry" also looks like a misspelling of
# "Masonry" — confirm before enabling the claim_type logic.
personal_property_bigrams = ["personal property", "private property",
                             "Dwelling Location", "Replacement Cost", "Roof Age", "Fire Protection Class", "Swimming Pool", "Maisonry"
                             ]
commercial_property_bigrams = ["commercial property"]


# TRIGRAMS
# NOTE(review): "Anti-Theft Device" is two words once the hyphen is removed,
# so it would never line up with a three-word phrase from the OCR text.
personal_auto_trigrams = ["Number of Drivers","High School Diploma",
"Safe Driver Discount","Annual Mileage Driven",
"Driving Record History","Coverage for Rental",
"Primary Garaging Location","Anti-Theft Device",
"Resident of Household"]


commercial_auto_trigrams = ["commercial fleet insurance","Gross Vehicle Weight",
"Interstate Commerce Commission","Business Use Only",
"Operating Radius Limit","Combined Single Limit",
"Certificate of Insurance","Number of Employees",
"Non-Owned Hired Auto","Unified Carrier Registration",
"Products Completed Operations"]

# NOTE(review): the property "trigram" lists below hold two-word phrases
# copied from the bigram lists — presumably placeholders; confirm.
personal_property_trigrams = ["personal property", "private property"]
commercial_property_trigrams = ["commercial property"]


In [31]:
def load_pdf(file_path: str) -> bytes:
    """
    Reads a PDF file from disk and returns its raw content.

    Args:
        file_path: The path to the PDF file.

    Returns:
        The content of the PDF file as bytes.

    Raises:
        FileNotFoundError: If file_path does not point to an existing file.
    """
    pdf_path = Path(file_path)
    if pdf_path.is_file():
        # read_bytes() opens the file in binary mode, reads it and closes it.
        return pdf_path.read_bytes()

    raise FileNotFoundError(f"Error: The file was not found at {file_path}")


def semantic_search(target_list: list, text_list: list, threshold: float = 0.95) -> bool:
    """
    Reports whether any text entry closely matches any target keyword.

    Despite the name, the comparison is purely character based: two strings
    "match" when the cosine similarity of their character-frequency vectors
    is strictly greater than the threshold.

    Args:
        target_list: Keywords or phrases to look for.
        text_list: Candidate tokens or phrases extracted from the document.
        threshold: Minimum similarity (exclusive) that counts as a match.
            Defaults to 0.95.

    Returns:
        True as soon as one (target, text) pair exceeds the threshold,
        False if none does or if either list is empty.
    """
    # any() stops at the first hit, so the rest of the text is not scanned
    # once a match has been found.
    return any(
        cosine_similarity(target_label, txt) > threshold
        for target_label in target_list
        for txt in text_list
    )

In [32]:
def quote_classify(pdf_bytes: bytes):
    """
    Classifies a form as auto and/or property by OCR-ing its first page.

    The first page is rendered at several scales. Each rendering is OCR'd
    with Tesseract, cleaned with clean_ocr_text and split into tokens,
    bigrams and trigrams, which are matched against the auto_* and prop_*
    keyword lists through semantic_search. Rendering stops as soon as both
    categories have been detected.

    Args:
        pdf_bytes: The PDF content as bytes.

    Returns:
        A set containing "auto" and/or "prop" (empty when nothing matched),
        or None when the PDF cannot be opened or has no pages.
    """
    # 1. Load the PDF document directly from the bytes object
    try:
        pdf_document = pypdfium2.PdfDocument(pdf_bytes)
    except Exception as e:
        print(f"Failed to open PDF from bytes. Ensure the file is a valid PDF. Error: {e}")
        return

    # (category label, single words, bigrams, trigrams)
    keyword_sets = (
        ("auto", auto_words, auto_bigrams, auto_trigrams),
        ("prop", prop_words, prop_bigrams, prop_trigrams),
    )

    try:
        if len(pdf_document) == 0:
            print("Document is empty.")
            return

        # 2. Access the first page
        page = pdf_document.get_page(0)
        try:
            claim_cat = set()

            # 3. OCR the page at several render scales (300/i, largest
            #    first); different scales can yield different OCR text.
            for i in [70, 80, 90, 100, 150, 200]:
                bitmap = page.render(scale=300/i)
                image = bitmap.to_pil()

                # Extract Text using Tesseract
                raw_text = pytesseract.image_to_string(image)

                # Clean Text
                clean_text = clean_ocr_text(raw_text.lower()).split()

                # Create bi/tri-grams
                two_word_phrases = [' '.join(pair) for pair in ngrams(clean_text, 2)]
                three_word_phrases = [' '.join(trio) for trio in ngrams(clean_text, 3)]

                # Classify auto vs prop; categories already found are skipped.
                for label, words, bigrams, trigrams in keyword_sets:
                    if label in claim_cat:
                        continue
                    if (semantic_search(words, clean_text)
                            or semantic_search(bigrams, two_word_phrases)
                            or semantic_search(trigrams, three_word_phrases)):
                        claim_cat.add(label)

                # Further renders cannot add anything once every category
                # is present, so skip the remaining (expensive) OCR passes.
                if len(claim_cat) == len(keyword_sets):
                    break

            # TODO: personal vs. commercial sub-classification (the
            # personal_* / commercial_* keyword lists) is not implemented.
            return claim_cat
        finally:
            # Release the page before the document that owns it.
            page.close()
    finally:
        pdf_document.close()


        # text_page = page.get_textpage()
        
        # # Extract all text from the page
        # text2 = text_page.get_text_range()

        # # 4. Print the extracted text
        # print("-" * 50)
        # print(f"Extracted Text from Page {page_index + 1}:")
        # print(text2.strip()[:500] + ('...' if len(text2) > 500 else '')) # Print first 500 chars
        # print("-" * 50)
    # print(f"Claim Type is: {claim_type}")

In [34]:
# --- Main Execution ---
# Classifies every PDF found under root_path and collects the results in the
# file_classify DataFrame (one row per file).
root_path = Path("../sample_forms/")

if not root_path.is_dir():
    # Raise instead of calling exit(): inside a notebook exit() shuts the
    # kernel down, whereas an exception only stops this cell.
    raise FileNotFoundError(f"Root directory not found: {root_path}")

print(f"Starting recursive search in: {root_path.resolve()}")

# Use glob with '**/*.pdf' for recursive search for all files ending in .pdf.
# Sorted so that row order (and positional lookups into pdf_paths) is stable.
pdf_files = sorted(root_path.glob('**/*.pdf'))
pdf_paths = [str(pdf_file) for pdf_file in pdf_files]

if pdf_paths:
    print(f"Found {len(pdf_paths)} PDF file(s).")
else:
    # Nothing to do; the loop below is skipped and file_classify stays empty.
    print("No PDF files found.")

file_classify = pd.DataFrame(pdf_paths, columns=["File_Path"])
file_classify["Claim_Type"] = None
file_classify["Form_Type"] = None

# One entry per file: comma-separated categories, or None when nothing was
# detected or the file could not be processed.
claim_categories = []

for file_number, pdf_path in enumerate(pdf_paths, start=1):
    print(f"\n--- File {file_number}/{len(pdf_paths)}: {pdf_path} ---")

    categories = None
    try:
        # Load the entire file content into a bytes object
        pdf_bytes = load_pdf(pdf_path)

        # Process the loaded bytes
        categories = quote_classify(pdf_bytes)
        print(categories)

    except PermissionError:
        print(f"    [SKIP] Permission denied when accessing {pdf_path}")
    except Exception as e:
        print(f"    [ERROR] An unexpected error occurred while reading {pdf_path}: {e}")

    claim_categories.append(", ".join(sorted(categories)) if categories else None)

# Store the results so the table is more than a list of paths.
file_classify["Claim_Category"] = claim_categories


Starting recursive search in: /home/aamir79/projects/quote-management-system/sample_forms
Found 32 PDF file(s).

--- File 1/32: ../sample_forms/auto/Aviva-private-car-insurance-proposal-form.pdf ---
{'auto'}

--- File 2/32: ../sample_forms/auto/CSIO-Commercial-Fleet-application-form.pdf ---
set()

--- File 3/32: ../sample_forms/auto/Maritime-Motor-Insurance-Quotation-Form.pdf ---
{'auto'}

--- File 4/32: ../sample_forms/auto_commercial/ACORD-Business-Auto-Section-127.pdf ---
{'auto'}

--- File 5/32: ../sample_forms/auto_commercial/Acord125 Commercial App.pdf ---
{'auto'}

--- File 6/32: ../sample_forms/auto_commercial/Commonwealth-commercial-quote_info_sheet.pdf ---
set()

--- File 7/32: ../sample_forms/auto_commercial/Truckers-Quick-Quote-Sheet.pdf ---
{'auto'}

--- File 8/32: ../sample_forms/auto_personal/Acord-71.pdf ---
{'auto'}

--- File 9/32: ../sample_forms/auto_personal/Acord-83-Personal-Umbrella.pdf ---
{'auto', 'prop'}

--- File 10/32: ../sample_forms/auto_personal/Acord-90-C

In [None]:
# Display the per-file table created in the main execution cell.
file_classify

In [None]:
# Spot-check: run the classifier on a single file from the list.
# quote_classify is the classifier defined in this notebook; no function
# named process_pdf exists here, so calling it raised a NameError.
sample_path = pdf_paths[14]
print(sample_path)
sample_bytes = load_pdf(sample_path)
sample_result = quote_classify(sample_bytes)
print(sample_result)

In [None]:


# --- Configuration ---
# IMPORTANT: You must set the path to the Tesseract executable 
# if it is not automatically found by pytesseract.
# 
# Example for Windows:
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

def ocr_scanned_pdf(pdf_path: str) -> str:
    """
    Performs OCR on a scanned PDF file and returns the extracted text.

    Every page is rendered to an image with pypdfium2 and passed to
    Tesseract; the per-page texts are joined with newlines.

    Args:
        pdf_path: The file path to the scanned PDF.

    Returns:
        A string containing all text extracted from the PDF,
        or an error message if the file cannot be processed. Errors are
        reported through the return value, not raised.
    """
    if not os.path.exists(pdf_path):
        return f"Error: PDF file not found at {pdf_path}"

    try:
        # Load the PDF file using pypdfium2. The notebook imports the module
        # as `pypdfium2`; no `pdfium` alias exists.
        pdf_document = pypdfium2.PdfDocument(pdf_path)
        try:
            num_pages = len(pdf_document)
            full_text = []

            print(f"Starting OCR on {num_pages} pages...")

            for i in range(num_pages):
                page = pdf_document.get_page(i)
                try:
                    # Render the page to a bitmap (image)
                    # Scale factor 2 provides good resolution for OCR
                    bitmap = page.render(scale=2)

                    # Convert the bitmap to a PIL Image object
                    image = bitmap.to_pil()

                    # Use pytesseract to extract text from the image
                    text = pytesseract.image_to_string(image)
                finally:
                    # Release the page even if rendering or OCR fails.
                    page.close()

                print(f"--- Page {i+1} OCR Completed ---")
                full_text.append(text)
        finally:
            pdf_document.close()

        return "\n".join(full_text)

    except pytesseract.TesseractNotFoundError:
        return "Error: Tesseract is not installed or not in your PATH. Please install it or set 'pytesseract.pytesseract.tesseract_cmd'."
    except Exception as e:
        return f"An error occurred during processing: {e}"

# --- Example Usage ---
if __name__ == "__main__":
    # Placeholder path; replace it with the location of a real scanned PDF.
    pdf_file = 'path/to/your/scanned_document.pdf'

    # File the demo actually looks for. To try it out, create a simple PDF
    # that contains a picture of text and save it under this name.
    dummy_pdf_file = "sample_scanned_document.pdf"

    if not os.path.exists(dummy_pdf_file):
        print(f"Please replace '{dummy_pdf_file}' with the path to an existing scanned PDF to run the example.")
    else:
        extracted_text = ocr_scanned_pdf(dummy_pdf_file)

        # Banner first, then the OCR result itself.
        for banner_line in (
            "\n====================================",
            "         EXTRACTED TEXT",
            "====================================\n",
        ):
            print(banner_line)
        print(extracted_text)