# In [None]:
import re
import spacy

# Load the spaCy "en_core_web_lg" pipeline once at module level.
# detect_pii_positions() runs it on the input text and reads the entities
# labelled PERSON to find names.
nlp = spacy.load("en_core_web_lg")

def detect_pii_positions(text):
    """Locate person names and numeric identifiers (Aadhar, PAN, phone) in *text*.

    Names come from the module-level spaCy pipeline ``nlp`` (entities labelled
    ``PERSON``); numeric identifiers come from regular expressions.

    Returns a 2-tuple ``(pii_entities, detected_names)``:

    * ``pii_entities`` -- list of ``(start, end, value, label)`` tuples, where
      ``label`` is ``"AADHAR"``, ``"PAN"`` or ``"PHONE"`` and ``start``/``end``
      are character offsets of the match in *text*.
    * ``detected_names`` -- set of ``(name, start_char, end_char)`` tuples.
    """
    # Label words that must never be reported as a person's name, compared
    # against the lower-cased entity text as a whole.
    never_names = ["aadhar", "pan", "phone", "number", "name", "is", "my"]

    # One capturing group per pattern; the word boundaries keep a match from
    # starting or ending in the middle of a longer run of word characters.
    numeric_patterns = {
        "AADHAR": r"\b(\d{4}[-\s]?\d{4}[-\s]?\d{4})\b",  # 12 digits, optional "-"/space between groups of 4
        "PAN": r"\b([A-Z]{5}[0-9]{4}[A-Z])\b",  # e.g. ABCDE1234F
        "PHONE": r"\b(\d{10})\b",  # exactly 10 digits
    }

    # Names: every PERSON entity whose text is not one of the label words.
    detected_names = {
        (entity.text, entity.start_char, entity.end_char)
        for entity in nlp(text).ents
        if entity.label_ == "PERSON" and entity.text.lower() not in never_names
    }

    # Numeric identifiers: prefer the captured group, fall back to the whole
    # match if a pattern has no group.
    pii_entities = []
    for kind, regex in numeric_patterns.items():
        pii_entities.extend(
            (
                hit.start(),
                hit.end(),
                hit.group(1) if hit.groups() else hit.group(0),
                kind,
            )
            for hit in re.finditer(regex, text)
        )

    return pii_entities, detected_names

def extract_first_names(text):
    """Collect capitalized words in *text* as candidate names.

    A candidate is a whole word made of one upper-case ASCII letter followed
    by one or more lower-case ASCII letters. The words "my", "aadhar" and
    "pan" (compared case-insensitively) are skipped.

    Returns a set of ``(word, start, end)`` tuples, where ``start``/``end``
    are the character offsets of the word in *text*.
    """
    skipped_words = ["my", "aadhar", "pan"]
    return {
        (hit.group(1), hit.start(), hit.end())
        for hit in re.finditer(r'\b([A-Z][a-z]+)\b', text)
        if hit.group(1).lower() not in skipped_words
    }

def _build_mask(kind, original, masked, start, end):
    """Return one masking record in the shape mask_pii reports to callers."""
    return {
        "type": kind,
        "original": original,
        "masked": masked,
        "position": (start, end),
    }


def _partially_mask(label, entity):
    """Return the partially hidden form of a numeric identifier, or None for an unknown label."""
    if label == "AADHAR":
        # First two and last two characters stay visible.
        return entity[:2] + "XXXXXXXXXX" + entity[-2:]
    if label == "PAN":
        # Characters 1-3, 5 and the last stay visible.
        # NOTE(review): this yields 9 characters for a 10-character PAN
        # (three X's stand in for four digits); kept as-is so the output
        # format does not change -- confirm whether "XXXX" was intended.
        return entity[:3] + "X" + entity[4] + "XXX" + entity[-1]
    if label == "PHONE":
        # First two and last two digits stay visible.
        return entity[:2] + "XXXXXX" + entity[-2:]
    return None


def _resolve_overlaps(text, candidates):
    """Reduce mask candidates to disjoint spans, ordered left to right.

    A candidate lying inside an already kept span is dropped. A candidate
    that only partially overlaps the kept span is merged with it into one
    fully masked span covering both, so no detected character is left exposed.
    """
    # Leftmost first; for equal starts the widest first. sorted() is stable,
    # so candidates with identical spans keep their detection order and the
    # earliest-detected one wins.
    ordered = sorted(
        candidates,
        key=lambda mask: (mask["position"][0], -mask["position"][1]),
    )

    resolved = []
    for candidate in ordered:
        start, end = candidate["position"]
        if not resolved or start >= resolved[-1]["position"][1]:
            resolved.append(candidate)
            continue

        kept = resolved[-1]
        kept_start, kept_end = kept["position"]
        if end > kept_end:
            resolved[-1] = _build_mask(
                f"{kept['type']} + {candidate['type']}",
                text[kept_start:end],
                "[MASKED]",
                kept_start,
                end,
            )
        # Otherwise the candidate is fully covered by the kept span: drop it.
    return resolved


def mask_pii(text):
    """Mask the PII detected in *text*.

    Names are replaced entirely with ``"[MASKED]"``; Aadhar, PAN and phone
    numbers are partially masked so a few characters stay visible.

    Name candidates come from three sources: the PERSON entities returned by
    ``detect_pii_positions``, the capitalized words returned by
    ``extract_first_names``, and the hard-coded term "Gay" (matched
    case-insensitively as a whole word).

    The same characters are often reported by more than one source (for
    example a name found both by spaCy and by the capitalized-word scan).
    Overlapping candidates are therefore resolved into disjoint spans before
    any replacement is made; applying overlapping replacements one after
    another would slice into previously inserted mask text.

    Returns a 2-tuple ``(masked_text, masking_info)``:

    * ``masked_text`` -- the masked string with leading/trailing whitespace
      stripped.
    * ``masking_info`` -- list of dicts with keys ``"type"``, ``"original"``,
      ``"masked"`` and ``"position"`` (``(start, end)`` offsets into the
      original *text*), one per applied mask, ordered from the rightmost mask
      to the leftmost.
    """
    pii_entities, detected_names = detect_pii_positions(text)
    first_names = extract_first_names(text)

    # Label words that are never masked as names. Includes "name" so this
    # list agrees with the protected terms used by detect_pii_positions.
    not_names = {"aadhar", "pan", "phone", "number", "name", "is", "my"}

    candidates = []

    # Names: full masking. spaCy results are added first so they win over an
    # identical span found by the capitalized-word scan.
    name_sources = (
        ("NAME (SpaCy)", detected_names),
        ("NAME (Capitalized)", first_names),
    )
    for kind, names in name_sources:
        for name, start, end in names:
            if name.lower() not in not_names:
                candidates.append(_build_mask(kind, name, "[MASKED]", start, end))

    # The hard-coded term "Gay" is always masked, in any letter case.
    for match in re.finditer(r'\b(Gay)\b', text, flags=re.IGNORECASE):
        candidates.append(
            _build_mask(
                "NAME (Specified)",
                match.group(0),
                "[MASKED]",
                match.start(),
                match.end(),
            )
        )

    # Numeric identifiers: partial masking. Unknown labels are ignored.
    for start, end, entity, label in pii_entities:
        masked_value = _partially_mask(label, entity)
        if masked_value is not None:
            candidates.append(_build_mask(label, entity, masked_value, start, end))

    resolved = _resolve_overlaps(text, candidates)

    # Build the result in one left-to-right pass over the original text, so
    # every offset refers to the unmodified string.
    pieces = []
    cursor = 0
    for mask in resolved:
        start, end = mask["position"]
        pieces.append(text[cursor:start])
        pieces.append(mask["masked"])
        cursor = end
    pieces.append(text[cursor:])
    masked_text = "".join(pieces)

    # Report masks right to left, the order callers have always received.
    masking_info = resolved[::-1]

    return masked_text.strip(), masking_info

# Demo inputs: each sentence contains a name, an Aadhar number, a PAN and a
# phone number.
test_cases = [
    "My name is Kiara. My Aadhar is 7349-6507-0013, PAN is EPGTL1234F, and phone number is 9786543201.",
    "My name is Samarth Shinde. My Aadhar is 1234-5678-9012, PAN is ABCDE1234F, and phone number is\n    9876543210.",
    "My name is Swapnil Jadhav. My Aadhar is 5678-1234-9012, PAN is XYZAB5678L, and phone number is 9012345678.",
]

# Report sections, in print order: (heading, whether the section lists the
# masks whose replacement text is exactly "[MASKED]").
report_sections = (
    ("Full Masks (Names):", True),
    ("\nPartial Masks (PII):", False),
)

# Mask every sample and print the original, the masked text and a per-mask
# breakdown.
for i, user_text in enumerate(test_cases):
    masked_result, masking_info = mask_pii(user_text)

    print(f"\n🔹 Test Case #{i+1}:")
    print(f"Original: {user_text}")
    print(f"Masked  : {masked_result}")

    print("\n🔍 Masking Details:")
    for heading, wants_full_mask in report_sections:
        print(heading)
        for info in masking_info:
            is_full_mask = info["masked"] == "[MASKED]"
            if is_full_mask == wants_full_mask:
                print(f"  - {info['type']}: '{info['original']}' at positions {info['position']} -> '{info['masked']}'")