In [0]:
%pip install presidio-analyzer presidio-anonymizer
# Installs the two Presidio packages that the cells below import
# (presidio_analyzer and presidio_anonymizer).
# NOTE(review): no versions are pinned, so a re-run may pick up a different
# release — consider pinning for reproducibility.

In [0]:
%restart_python
# NOTE(review): presumably restarts the Python process so the packages
# installed by the preceding cell become importable. This magic does not look
# like a stock IPython magic — confirm it exists in the target runtime.

In [0]:
import logging

# Raise the threshold of the "presidio-analyzer" logger to ERROR so that
# lower-severity messages do not clutter the notebook output.
_presidio_logger = logging.getLogger("presidio-analyzer")
_presidio_logger.setLevel(logging.ERROR)

In [0]:
from presidio_analyzer import AnalyzerEngine

# English smoke test with a default engine: the sentence contains one
# phone-like number and one SSN-like number.
sample_text = "My number is 0723456789 and my SSN is 123-45-6789"

analyzer = AnalyzerEngine()

# The empty entity list is passed through exactly as before; presumably
# Presidio treats it as "no entity filter" — confirm against the API docs.
results = analyzer.analyze(text=sample_text, entities=[], language="en")

# Print each detection on its own line.
for finding in results:
    print(finding)

In [0]:

from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer

def load_eu_recognizers(analyzer, languages=("es", "fr", "de", "it")):
    """Register regex-based recognizers for EU national identifiers.

    For every requested language that has definitions below, one
    ``PatternRecognizer`` per entity is created and added to
    ``analyzer.registry``. Languages without definitions are reported with a
    warning line and skipped.

    Args:
        analyzer: Object exposing ``registry.add_recognizer(recognizer)``;
            in this notebook it is a Presidio ``AnalyzerEngine``.
        languages: Iterable of language codes to load. A single string is
            treated as one language code. Defaults to every language defined
            here: "es", "fr", "de", "it".

    Returns:
        None. Progress is reported through ``print``.
    """
    # Each entry: (entity, pattern name, regex, score, context words).
    definitions = {
        "es": [
            (
                "SPANISH_DNI",
                "SPANISH_DNI_PATTERN",
                r"\b\d{8}[A-Z]\b",
                0.85,
                ["dni", "documento", "identidad", "ciudadano", "persona"],
            ),
            (
                "SPANISH_NIE",
                "SPANISH_NIE_PATTERN",
                r"\b[XYZ]\d{7}[A-Z]\b",
                0.85,
                ["nie", "extranjero", "residencia", "documento"],
            ),
            (
                "SPANISH_IBAN",
                "SPANISH_IBAN_PATTERN",
                r"\bES\d{2}\s?\d{4}\s?\d{4}\s?\d{2}\s?\d{10}\b",
                0.9,
                ["cuenta", "IBAN", "banco", "transferencia", "pago", "domiciliación"],
            ),
            (
                "SPANISH_VAT_ID",
                "SPANISH_VAT_PATTERN",
                # The alternatives are grouped so that BOTH word boundaries
                # apply to BOTH forms. Without the group, `|` binds loosest
                # and each `\b` guards only one side of one alternative,
                # letting the pattern match inside longer tokens.
                # The second form is identical to the SPANISH_DNI pattern, so
                # such values are reported under both entities.
                r"\b(?:[A-Z]\d{8}|\d{8}[A-Z])\b",
                0.85,
                ["nif", "cif", "IVA", "factura", "empresa"],
            ),
        ],
        "fr": [
            (
                "FRENCH_INSEE",
                "FRENCH_INSEE_PATTERN",
                # NOTE(review): this matches exactly 13 digits; if the numbers
                # in your data carry a trailing 2-digit control key they will
                # not match — confirm the intended format.
                r"\b[12]\d{2}(0[1-9]|1[0-2])\d{2}\d{3}\d{3}\b",
                0.85,
                ["insee", "numéro", "sécurité", "sociale", "nss"],
            ),
        ],
        "de": [
            (
                "GERMAN_TAX_ID",
                "GERMAN_TAX_ID_PATTERN",
                r"\b\d{11}\b",
                0.85,
                ["steuer", "id", "identifikationsnummer", "finanzamt", "steuerliche"],
            ),
        ],
        "it": [
            (
                "ITALIAN_TAX_ID",
                "ITALIAN_TAX_ID_PATTERN",
                r"\b[A-Z]{6}\d{2}[A-Z]\d{2}[A-Z]\d{3}[A-Z]\b",
                0.9,
                ["codice", "fiscale", "identificativo", "contribuente"],
            ),
        ],
    }

    # A bare string would otherwise be iterated character by character.
    if isinstance(languages, str):
        languages = [languages]

    loaded = []
    for lang in languages:
        specs = definitions.get(lang)
        if specs is None:
            print(f"⚠️ No recognizers defined for language: {lang}")
            continue

        for entity, pattern_name, regex, score, context in specs:
            recognizer = PatternRecognizer(
                supported_entity=entity,
                supported_language=lang,
                patterns=[Pattern(pattern_name, regex, score)],
                context=context,
            )
            analyzer.registry.add_recognizer(recognizer)
        loaded.append(lang)

    # Report only the languages that were actually registered, so skipped
    # languages are not listed as loaded.
    if loaded:
        print(f"✅ EU recognizers loaded for languages: {', '.join(loaded)}")


In [0]:
# Install the Spanish language model
!python -m spacy download es_core_news_sm


In [0]:
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import SpacyNlpEngine
import spacy

# Load the Spanish spaCy pipeline directly.
# NOTE(review): `nlp` is not referenced again in the visible cells, and the
# engine below is configured by model name rather than with this object, so
# the model is presumably loaded a second time — confirm whether this line is
# still needed.
nlp = spacy.load("es_core_news_sm")

# spaCy-backed NLP engine for Presidio, configured with the Spanish model only.
spanish_model = {"lang_code": "es", "model_name": "es_core_news_sm"}
nlp_engine = SpacyNlpEngine(models=[spanish_model])

# Languages the analyzer is declared to support.
# NOTE(review): only an "es" model is configured above; analysing text in
# "fr", "de" or "it" presumably needs matching models — confirm before use.
EU_LANGUAGES = ("es", "fr", "de", "it")

analyzer = AnalyzerEngine(
    nlp_engine=nlp_engine,
    supported_languages=list(EU_LANGUAGES),
)

# Register the custom EU pattern recognizers on the new analyzer.
load_eu_recognizers(analyzer, languages=list(EU_LANGUAGES))


In [0]:
# Spanish sample containing a DNI, a NIE, an IBAN and a NIF.
text = """
El DNI del cliente es 12345678Z y su NIE es X1234567L.
También puedes hacer la transferencia a ES91 2100 0418 4502 0005 1332.
El NIF es B12345678.
"""

# Restrict the analysis to the custom Spanish entities.
spanish_entities = ["SPANISH_DNI", "SPANISH_NIE", "SPANISH_IBAN", "SPANISH_VAT_ID"]
results = analyzer.analyze(text=text, entities=spanish_entities, language="es")

# One line per detection: entity type, character span, matched text, score.
for result in results:
    print(f"Detected {result.entity_type} at position {result.start}-{result.end}: {text[result.start:result.end]} with score {result.score}")


In [0]:
from presidio_anonymizer import AnonymizerEngine
# Replace the spans found by the analyzer in `text`, using an anonymizer
# engine created with its default settings.
anonymizer = AnonymizerEngine()
anonymized_result = anonymizer.anonymize(text=text, analyzer_results=results)

# Show the anonymized version of the sample text.
print(anonymized_result.text)