In [0]:
!python -m spacy download en_core_web_sm

In [0]:
from PIL import Image
import pytesseract

# Extract Text from PDF

In [0]:
# text = pytesseract.image_to_string(Image.open('/Volumes/workspace/default/kiran_data/unstructured-data/Principal-Sample-Life-Insurance-Policy.pdf'))
# print(text)

import pdfplumber

pdf_path = "/Volumes/workspace/default/kiran_data/unstructured-data/Principal-Sample-Life-Insurance-Policy.pdf"

# Gather the text of every page, then build `full_text` with one trailing
# newline per page. extract_text() can come back empty (None or "") for a
# page with no extractable text; such pages are skipped so that the
# concatenation cannot fail with `None + "\n"`.
with pdfplumber.open(pdf_path) as pdf:
    page_texts = [page.extract_text() for page in pdf.pages]

full_text = "".join(f"{page_text}\n" for page_text in page_texts if page_text)

print(full_text[:1000])  # Print first 1000 characters to verify


# Extract Key Information

In [0]:
import re

# Example regex patterns (adjust to actual text)
# Each pattern looks for a field label followed by ':' and/or whitespace and
# captures the value that comes after it.
policy_number = re.search(r'Policy Number[:\s]+(\w+)', full_text)
# The date capture allows only spaces/tabs as whitespace (not the generic \s):
# \s also matches newlines, which would let the greedy match continue onto
# the lines that follow the date.
effective_date = re.search(r'Effective Date[:\s]+([\w \t,]+)', full_text)
coverage_amount = re.search(r'Coverage Amount[:\s]+\$([\d,]+)', full_text)

print("Policy Number:", policy_number.group(1) if policy_number else "Not found")
# strip() drops trailing blanks the capture may pick up before the line end.
print("Effective Date:", effective_date.group(1).strip() if effective_date else "Not found")
print("Coverage Amount:", coverage_amount.group(1) if coverage_amount else "Not found")


In [0]:
import spacy

# Load the small English pipeline (a fine-tuned model could be substituted)
# and run it over the full extracted text.
nlp = spacy.load("en_core_web_sm")
doc = nlp(full_text)

# Print every recognised entity as "<text> <label>", one per line.
for entity in doc.ents:
    print(entity.text, entity.label_)


In [0]:
# Bare expression as the cell's last statement: the notebook displays `doc`,
# the result of nlp(full_text) from the previous cell.
doc

What This PoC Does

Text Extraction: Reads both PDFs.

NLP Extraction: Detects dates, money amounts, numbers, and organizations.

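Field Mapping: Maps the first detected entity of each type to a structured field (first number → Policy Number, first date → Effective Date, first money amount → Coverage Amount, first organization → Organization).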

Comparison: Highlights differences between the two policies.

In [0]:
# Step 1: Install dependencies if not already
# pip install pdfplumber spacy pandas
# python -m spacy download en_core_web_sm

import pdfplumber
import spacy
import pandas as pd

# Step 2: Load spaCy NLP model
# Loaded once at module level; extract_entities() below calls this global
# `nlp` object on the text it is given.
nlp = spacy.load("en_core_web_sm")

# Step 3: Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        The page texts in page order, each followed by a newline. Pages
        whose ``extract_text()`` result is empty (``None`` or ``""``) are
        left out, so the result is ``""`` when no page yields text.
    """
    with pdfplumber.open(pdf_path) as document:
        # Extract while the file is still open; filter afterwards.
        extracted = [page.extract_text() for page in document.pages]
    return "".join(f"{chunk}\n" for chunk in extracted if chunk)

# Step 4: Function to extract key entities using NLP
def extract_entities(text):
    """Map the first entity of selected spaCy labels to policy fields.

    Runs the module-level ``nlp`` pipeline on ``text`` and keeps, for each
    entity label, the text of the first entity carrying that label. This is
    a simple heuristic: nothing checks that the first number really is a
    policy number, and so on for the other fields.

    Args:
        text: Raw text to analyse.

    Returns:
        A dict with the keys "Policy Number" (first CARDINAL),
        "Effective Date" (first DATE), "Coverage Amount" (first MONEY) and
        "Organization" (first ORG), in that order. A field is ``None`` when
        no entity with the corresponding label was found.
    """
    # (output field, entity label) pairs, in the order the fields are returned.
    field_sources = (
        ("Policy Number", "CARDINAL"),
        ("Effective Date", "DATE"),
        ("Coverage Amount", "MONEY"),
        ("Organization", "ORG"),
    )

    # setdefault only stores a label's first occurrence; later ones are ignored.
    first_text_by_label = {}
    for entity in nlp(text).ents:
        first_text_by_label.setdefault(entity.label_, entity.text)

    return {field: first_text_by_label.get(label) for field, label in field_sources}

# Step 5: Load your two policies
pdf_1 = "/Volumes/workspace/default/kiran_data/unstructured-data/Principal-Sample-Life-Insurance-Policy.pdf"
pdf_2 = "/Volumes/workspace/default/kiran_data/unstructured-data/public-liability-policy-wording.pdf"

# Both documents go through the same text extraction, policy 1 first.
text_1, text_2 = [extract_text_from_pdf(path) for path in (pdf_1, pdf_2)]

# Step 6: Extract entities from both PDFs
data_1, data_2 = [extract_entities(policy_text) for policy_text in (text_1, text_2)]

# Step 7: Combine into a DataFrame for comparison
# One row per policy, one column per extracted field.
policy_records = [data_1, data_2]
policy_labels = ["Policy 1", "Policy 2"]
df = pd.DataFrame(policy_records, index=policy_labels)
print("Extracted Policy Data:\n")
print(df)

# Step 8: Detect differences between the two policies
# Walk the columns and report each field whose two values are not equal.
print("\nChanges Detected:")
for col, values in df.items():
    before, after = values["Policy 1"], values["Policy 2"]
    if before != after:
        print(f"{col} changed: {before} -> {after}")


In [0]:
import matplotlib.pyplot as plt

# --- Step 1: Create a comparison DataFrame ---
# Work on a copy: a 'Coverage Amount Num' column is added to `df_compare`
# further down in this cell, and `df` itself should stay as extracted.
df_compare = df.copy()

# Convert coverage amounts to numeric for plotting
def clean_coverage(value):
    """Convert an extracted coverage-amount value to a number for plotting.

    The plain parse (remove thousands separators, call ``float``) is tried
    first. Entity text that carries a currency symbol or a magnitude word,
    such as "$100,000" or "$1.5 million", would make that parse raise
    ``ValueError``; for those the first number in the text is parsed
    instead and scaled by a trailing "thousand" / "million" / "billion".

    Args:
        value: The extracted amount, normally a string; may be ``None`` or
            empty when nothing was extracted.

    Returns:
        The amount as a float, or ``0`` when ``value`` is missing/empty or
        contains no number.
    """
    import re  # local import so the function does not depend on other cells

    if not value:
        return 0

    text = str(value)

    # Fast path: plain numeric text such as "100,000" or "2500.75".
    try:
        return float(text.replace(',', ''))
    except ValueError:
        pass

    # Fallback: first number in the text plus an optional magnitude word.
    match = re.search(
        r'(\d[\d,]*(?:\.\d+)?)(?:\s*(thousand|million|billion)\b)?',
        text,
        re.IGNORECASE,
    )
    if match is None:
        # No digits at all: treat like a missing amount.
        return 0

    multipliers = {'thousand': 1_000, 'million': 1_000_000, 'billion': 1_000_000_000}
    amount = float(match.group(1).replace(',', ''))
    magnitude = (match.group(2) or '').lower()
    return amount * multipliers.get(magnitude, 1)

# Numeric version of the coverage column, used as the bar heights below.
coverage_text = df_compare['Coverage Amount']
df_compare['Coverage Amount Num'] = coverage_text.apply(clean_coverage)

# --- Step 2: Bar chart for Coverage Amount ---
# One bar per policy (the DataFrame index), drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(df_compare.index, df_compare['Coverage Amount Num'], color=['skyblue', 'orange'])
ax.set_title('Coverage Amount Comparison')
ax.set_ylabel('Coverage Amount ($)')
ax.set_xlabel('Policy')
plt.show()

# --- Step 3: Highlight changes in a simple table ---
# Print the extracted fields of each policy as an indented, aligned listing.
print("\n--- Policy Comparison Table ---")
for index in df_compare.index:
    row = df_compare.loc[index]
    print(f"\n{index}:")
    print(f"  Organization    : {row['Organization']}")
    print(f"  Effective Date  : {row['Effective Date']}")
    print(f"  Policy Number   : {row['Policy Number']}")
    print(f"  Coverage Amount : {row['Coverage Amount']}")
