# Resume Anonymization Script

## Overview
This script is designed to **anonymize Personally Identifiable Information (PII)** from resumes. It reads in a CSV file containing resumes, detects sensitive information, and replaces it with anonymized placeholders to protect candidates' privacy.

## Features
- **Reads resume data from a CSV file**.
- **Identifies PII elements** such as names, emails, phone numbers, and addresses.
- **Replaces PII with generic placeholders** to ensure confidentiality.
- **Exports the anonymized data** into a new CSV file for further processing or sharing.

## Why Anonymization?
Protecting PII is crucial in resume processing, especially when handling large datasets for **machine learning models, recruitment analysis, or compliance with data privacy regulations (e.g., GDPR, CCPA)**.

## Installation & Setup
1. **Clone the Repository**:
   ```bash
   git clone https://github.com/hantayc/mirra_matcher.git
   cd mirra_matcher
   ```

2. **Set up a virtual environment (optional)**:
   ```bash
   python -m venv mirra_env
   source mirra_env/bin/activate  # Windows: mirra_env\Scripts\Activate
   ```


In [None]:
# Colab-specific setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install presidio-analyzer presidio-anonymizer

In [None]:
import presidio_analyzer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

In [9]:
!pip show presidio-anonymizer

Name: presidio_anonymizer
Version: 2.2.357
Summary: Presidio Anonymizer package - replaces analyzed text with desired values.
Home-page: https://github.com/Microsoft/presidio
Author: Presidio
Author-email: presidio@microsoft.com
License: MIT
Location: /usr/local/lib/python3.11/dist-packages
Requires: azure-core, pycryptodome
Required-by: 


In [None]:
import re
import pandas as pd
import nltk
from nltk.corpus import words
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

# download word list
# (must run before words.words() is read on the next line)
nltk.download('words')
# stored as a set because the functions below only do membership checks on it
english_words = set(words.words())

# initialize Presidio components
# module-level instances shared by every anonymize_resume() call
analyzer = AnalyzerEngine()
anonymizer = AnonymizerEngine()

def normalize_spaced_text(text):
    """
    Collapse letter-by-letter spacing such as "L o w a s d" into "Lowasd".

    Two substitutions run in sequence: the first deletes whitespace sitting
    between two single-character words, the second deletes whitespace that
    follows a still-isolated single character and precedes any word character.

    Returns a ``(normalized, original)`` tuple so callers can compare both
    versions; missing values (per ``pd.isna``) yield ``("", "")``.
    """
    if pd.isna(text):
        return "", ""

    collapse_passes = (
        # whitespace between two isolated single characters
        r'(?<=\b\w)\s+(?=\w\b)',
        # whitespace after an isolated single character, before a longer word
        r'(?<=\b\w)\s+(?=\w)(?=.*?\b)',
    )

    collapsed = text
    for pass_pattern in collapse_passes:
        collapsed = re.sub(pass_pattern, '', collapsed)

    return collapsed, text

def clean_text_spacing(text):
    """
    Normalize spacing in resume text and redact PII that the spacing fixes
    would otherwise break apart.

    Steps, in order:
      1. Collapse letter-by-letter spacing via ``normalize_spaced_text``.
      2. Replace e-mail addresses with ``[REDACTED EMAIL]``.
      3. Replace phone numbers with ``[REDACTED PHONE]``.
      4. Split camelCase, pad punctuation with spaces, and separate digits
         from adjacent non-digits.
      5. Collapse whitespace runs to a single space.
      6. Replace ``@handle`` tokens with ``[REDACTED HANDLE]``.

    Args:
        text: A resume string, or a missing value.

    Returns:
        The cleaned string. Missing values (per ``pd.isna``) are returned
        unchanged.
    """
    if pd.isna(text):
        return text

    # first normalize any spaced out text
    normalized, original = normalize_spaced_text(text)
    text = normalized if normalized else original

    # basic cleaning
    text = text.strip()

    # Redact e-mail addresses while they are still intact. The punctuation
    # padding below turns "a.b@c.com" into "a . b@c . com" and the handle
    # rule then only removes "@c", leaving the local part in the output.
    text = re.sub(r'[\w.+-]+@[\w-]+(?:\.[\w-]+)+', '[REDACTED EMAIL]', text)

    # Phone formats, most specific first. If a partial pattern ran first,
    # "555-123-4567" would become "[REDACTED PHONE]-4567" and the remaining
    # digits would no longer match any pattern.
    phone_patterns = [
        r'\(\s*\d{3}\s*\)\s*\d{3}\s*-?\s*\d{4}\b',  # (123) 456-7890 format
        r'\b\d{3}[\s-]+\d{3}[\s-]+\d{4}\b',  # standard format with flexible spacing
        r'\b\d{3}\s*-\s*\d{3,4}\b',  # partial numbers like "443 - 4532"
        r'\b\d{3}\s*\d{3,4}\b',      # numbers without separator
    ]

    for pattern in phone_patterns:
        text = re.sub(pattern, '[REDACTED PHONE]', text)

    # handle names with unusual spacing or punctuation
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # Fix camelCase
    text = re.sub(r'[\,\.\;\:\|\/\\](?=\S)', r'\g<0> ', text)  # Add space after punctuation
    text = re.sub(r'(?<=\S)[\,\.\;\:\|\/\\]', r' \g<0>', text)  # Add space before punctuation

    # handle numbers stuck to text
    text = re.sub(r'(\D)(\d)', r'\1 \2', text)
    text = re.sub(r'(\d)(\D)', r'\1 \2', text)

    # normalize multiple spaces
    text = re.sub(r'\s+', ' ', text).strip()

    # handle social media handles with various formats
    text = re.sub(r'@[\w\.\-_]+', '[REDACTED HANDLE]', text)

    return text

def anonymize_first_words(text, num_words=3):
    """Redact leading words that are not found in the English word list.

    The text is split on whitespace and each of the first ``num_words``
    tokens is passed through ``normalize_spaced_text``, lower-cased and
    looked up in ``english_words``. Tokens that are not found are replaced
    with ``[REDACTED NAME]``. Tokens are re-joined with single spaces, so the
    original whitespace layout is not preserved.

    Args:
        text: A resume string, or a missing value.
        num_words: How many leading tokens to inspect.

    Returns:
        The text with unknown leading tokens redacted. Missing values (per
        ``pd.isna``) are returned unchanged.
    """
    if pd.isna(text):
        return text

    words_list = text.split()

    # max(..., 0) keeps a negative num_words a no-op instead of letting the
    # slice count from the end of the list.
    for index, word in enumerate(words_list[:max(num_words, 0)]):
        normalized_word, _ = normalize_spaced_text(word)

        if normalized_word and normalized_word.lower() not in english_words:
            words_list[index] = "[REDACTED NAME]"

    return " ".join(words_list)

def preprocess_text(text):
    """Strip byte-order marks and squeeze every whitespace run to one space.

    Missing values (per ``pd.isna``) pass through untouched.
    """
    if not pd.isna(text):
        without_bom = text.replace("\ufeff", "")
        # split() with no argument drops leading/trailing whitespace and
        # treats any run of whitespace as a single separator
        text = " ".join(without_bom.split())

    return text

def anonymize_resume(text):
    """
    Redact PII from one resume string.

    Pipeline:
      1. ``clean_text_spacing`` (spacing fixes plus its own redactions).
      2. Replace the first three tokens with ``[REDACTED NAME]`` when their
         joined, normalized form is not in ``english_words``.
      3. Regex redaction of phone numbers, street addresses and e-mails.
      4. Presidio analysis for PERSON, PHONE_NUMBER, EMAIL_ADDRESS, URL and
         LOCATION, followed by Presidio replacement.

    Args:
        text: A resume string, or a missing value.

    Returns:
        The anonymized string. Missing values (per ``pd.isna``) are returned
        unchanged. If any step raises, the error is printed and the text as
        processed so far is returned, so the result may still contain PII.
    """
    if pd.isna(text):
        return text

    try:
        # initial cleaning with enhanced spacing fixes
        text = clean_text_spacing(text)

        # Named `tokens` rather than `words` so the module-level nltk `words`
        # corpus import is not shadowed inside this function.
        tokens = text.split()
        potential_spaced_name = ' '.join(tokens[:3])  # Look at first three words
        normalized_name, _ = normalize_spaced_text(potential_spaced_name)

        # NOTE(review): the whole three-token string is looked up as a single
        # entry; presumably english_words holds single words, so this is
        # nearly always true for multi-word openings - confirm that redacting
        # the first three tokens almost unconditionally is intended.
        if normalized_name and normalized_name.lower() not in english_words:
            text = text.replace(potential_spaced_name, '[REDACTED NAME]', 1)

        # enhanced regex patterns for PII detection
        patterns = {
            # Most specific first: a partial match applied early would
            # consume part of a full number and leave the rest unredacted.
            'phone': [
                r'\(\s*\d{3}\s*\)[\s-]*\d{3}[\s-]*\d{4}\b',  # (123) 456-7890 format
                r'\b\d{3}[\s-]*\d{3}[\s-]*\d{4}\b',  # full numbers
                r'\b\d{3}[\s-]*\d{3,4}\b',  # partial numbers
            ],
            'address': [
                r'\b\d{1,5}\s+[A-Za-z\s]+(?:street|st|avenue|ave|road|rd|boulevard|blvd|lane|ln|drive|dr)\b',
                r'\b\d{1,5}\s[A-Za-z]+\s[A-Za-z]+\b'
            ],
            'email': [
                r'\b[\w\.-]+@[\w\.-]+\.\w+\b',
                r'\b[\w\.-]+\s*@\s*[\w\.-]+\s*\.\s*\w+\b'  # handles spaced email addresses
            ]
        }

        # apply all patterns
        for category, pattern_list in patterns.items():
            replacement = f'[REDACTED {category.upper()}]'
            for pattern in pattern_list:
                text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        # proceed with Presidio analysis
        analyzer_results = analyzer.analyze(
            text=text,
            entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "URL", "LOCATION"],
            language="en"
        )

        # Keep only spans that lie inside the text and are non-empty.
        text_length = len(text)
        valid_results = [
            result for result in analyzer_results
            if 0 <= result.start < text_length and 0 < result.end <= text_length
            and result.start < result.end
        ]

        # apply anonymization
        if valid_results:
            operators = {
                "DEFAULT": OperatorConfig("replace", {"new_value": "[REDACTED]"}),
                "PHONE_NUMBER": OperatorConfig("replace", {"new_value": "[REDACTED PHONE]"}),
                "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "[REDACTED EMAIL]"}),
                "PERSON": OperatorConfig("replace", {"new_value": "[REDACTED NAME]"}),
                "URL": OperatorConfig("replace", {"new_value": "[REDACTED LINK]"}),
                "LOCATION": OperatorConfig("replace", {"new_value": "[REDACTED LOCATION]"})
            }

            anonymized_result = anonymizer.anonymize(
                text=text,
                analyzer_results=valid_results,
                operators=operators
            )
            return anonymized_result.text

        return text

    except Exception as e:
        # str(text) keeps the handler itself from raising when the input was
        # not a string (e.g. a number read from the CSV).
        print(f"Error anonymizing text (length {len(str(text))}): {str(e)}")
        return text

def test_anonymization(text):
    """Print one sample as-is, after spacing cleanup, and after anonymization."""
    stages = (
        ("Original:", lambda sample: sample),
        ("Cleaned:", clean_text_spacing),
        ("Anonymized:", anonymize_resume),
    )
    # Each stage is evaluated just before its own line is printed, so any
    # output produced by a stage appears directly above that line.
    for label, stage in stages:
        print(label, stage(text))
    print()

def main(input_path='1200 Resumes 2024.csv',
         output_path='1200 Resumes 2024 Anonymized.csv'):
    """Run the sample edge cases, then anonymize a CSV of resumes.

    The input CSV must contain a ``resume_clean`` column. That column is
    overwritten with its preprocessed form, and two columns are added:
    ``cleaned_resume`` and ``anonymized_resume``.

    Args:
        input_path: CSV file to read. Defaults to the original dataset name.
        output_path: CSV file to write (without the index column).

    Any exception is caught and reported on stdout; nothing is re-raised.
    """
    try:
        # test edge cases first
        print("Testing edge cases...")
        test_cases = [
            "L o w a s d R i l e m, 443 - 4532",
            "j.o.h.n.d.o.e@e.m.a.i.l.com",
            "J o h n D o e, Software Engineer"
        ]

        for case in test_cases:
            test_anonymization(case)

        # load Resume Data
        print("\nLoading resume data...")
        df = pd.read_csv(input_path)

        # 1. apply initial preprocessing
        print("Applying initial preprocessing...")
        df["resume_clean"] = df["resume_clean"].apply(preprocess_text)

        # 2. apply Cleaning Space Function
        print("Cleaning text spacing...")
        df["cleaned_resume"] = df["resume_clean"].apply(clean_text_spacing)

        # 3. apply Anonymization Function
        print("Anonymizing resumes...")
        df["anonymized_resume"] = df["cleaned_resume"].apply(anonymize_resume)

        # 4. save Anonymized Data
        print("Saving anonymized data...")
        df.to_csv(output_path, index=False)

        print("Processing completed successfully!")

    except Exception as e:
        # Top-level boundary: report the failure instead of a raw traceback.
        print(f"An error occurred during processing: {str(e)}")

if __name__ == "__main__":
    main()