Set up the environment: move to the project root and install the package in editable mode

In [1]:
# Move one directory up so the editable install below runs from the project root.
# NOTE: `%cd ..` is relative, so re-running this cell moves up one more level each time.
%cd ..
# %ls

# !poetry env use python3.11
# Install the project in editable mode (`%pip` targets the kernel's own environment).
%pip install -e .

c:\Users\l.iorio\development\POC_GOVERNANCE
Obtaining file:///C:/Users/l.iorio/development/POC_GOVERNANCE
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Checking if build backend supports build_editable: started
  Checking if build backend supports build_editable: finished with status 'done'
  Getting requirements to build editable: started
  Getting requirements to build editable: finished with status 'done'
  Preparing editable metadata (pyproject.toml): started
  Preparing editable metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: poc-gdpr
  Building editable for poc-gdpr (pyproject.toml): started
  Building editable for poc-gdpr (pyproject.toml): finished with status 'done'
  Created wheel for poc-gdpr: filename=poc_gdpr-0.1.0-py3-none-any.whl size=1244 sha256=401d924fd4bf4590d36baad360d3ebd068e63f2a2451baf1f9d8d7776ceec0dc
  Stored in directory: C:\Users\l.iorio\AppData\Local\Temp


[notice] A new release of pip is available: 23.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Import modules

In [2]:
from poc_gdpr.src.text_protector import (
    TextProtector, 
    InstanceCounterAnonymizer, 
    InstanceCounterDeanonymizer
)

from presidio_anonymizer.entities import OperatorConfig
from pprint import pprint

The first step is to **anonymize** the sensitive content

In [3]:
# Sample Italian text containing a name, a city, a phone number, an email
# address and a fiscal code; it is the input for the whole pipeline below.
text = """
    Ciao, mi chiamo Mario Rossi e vivo a Roma e ho 38 anni. Il mio numero di telefono è 1234567890 e la mia email è
    mariorossi@gmail.com, mentre il mio codice fiscale è RSSMRA85M01H501Z.
"""

# Build the protector and register the custom counter-based operators on it.
text_protector = TextProtector()
text_protector.anonymizer.add_anonymizer(InstanceCounterAnonymizer)
text_protector.deanonymizer.add_deanonymizer(InstanceCounterDeanonymizer)

# Detect the sensitive entities in the raw text.
results = text_protector.analyze_text(text)

# Mapping entity type -> {original value: placeholder}. The same dict object
# is handed to both operators so that the substitution can be reversed.
entity_mapping = {}

# Anonymize: every detected entity falls back to the "entity_counter" operator.
anonymize_operators = {
    "DEFAULT": OperatorConfig(
        "entity_counter",
        params={"entity_mapping": entity_mapping},
    ),
}
anonymized_result = text_protector.anonymizer.anonymize(
    text,
    results,
    anonymize_operators,
)

# Round trip: restore the original values from the anonymized text.
deanonymize_operators = {
    "DEFAULT": OperatorConfig(
        "entity_counter_deanonymizer",
        params={"entity_mapping": entity_mapping},
    ),
}
deanonymized = text_protector.deanonymizer.deanonymize(
    anonymized_result.text,
    anonymized_result.items,
    deanonymize_operators,
)

# Show which original values were replaced by which placeholders.
print("anonymized entities:")
pprint(entity_mapping, indent=2)


[32m2025-05-18 17:29:58[0m [35mDAT32140[0m [34mpoc_gdpr.config.logger[32240][0m [1;30mINFO[0m languages-config.yml already exists and is up to date.
[32m2025-05-18 17:30:00[0m [35mDAT32140[0m [34mpoc_gdpr.config.logger[32240][0m [1;30mINFO[0m Italian spaCy model is already downloaded.
[32m2025-05-18 17:30:02[0m [35mDAT32140[0m [34mpresidio-analyzer[32240][0m [1;30mINFO[0m Created NLP engine: spacy. Loaded models: ['it']
[32m2025-05-18 17:30:02[0m [35mDAT32140[0m [34mpresidio-analyzer[32240][0m [1;30mINFO[0m registry not provided, creating default.
[32m2025-05-18 17:30:02[0m [35mDAT32140[0m [34mpresidio-analyzer[32240][0m [1;30mINFO[0m Loaded recognizer: CreditCardRecognizer
[32m2025-05-18 17:30:02[0m [35mDAT32140[0m [34mpresidio-analyzer[32240][0m [1;30mINFO[0m Loaded recognizer: CreditCardRecognizer
[32m2025-05-18 17:30:02[0m [35mDAT32140[0m [34mpresidio-analyzer[32240][0m [1;30mINFO[0m Loaded recognizer: CreditCardRecognizer
[3

anonymized entities:
{ 'EMAIL_ADDRESS': {'mariorossi@gmail.com': '<EMAIL_ADDRESS_0>'},
  'IT_FISCAL_CODE': {'RSSMRA85M01H501Z': '<IT_FISCAL_CODE_0>'},
  'LOCATION': {'Roma': '<LOCATION_0>'},
  'PERSON': {'Mario Rossi': '<PERSON_0>'},
  'PHONE_NUMBER': {'1234567890': '<PHONE_NUMBER_0>'}}


Print the anonymized results

In [4]:
# Keep the anonymized text under its own name: it is the input later passed to the agent.
presidio_anonymizer_res = anonymized_result.text
pprint(presidio_anonymizer_res, indent=2)

('\n'
 '    Ciao, mi chiamo <PERSON_0> e vivo a <LOCATION_0> e ho 38 anni. Il mio '
 'numero di telefono è <PHONE_NUMBER_0> e la mia email è\n'
 '    <EMAIL_ADDRESS_0>, mentre il mio codice fiscale è <IT_FISCAL_CODE_0>.\n')


Set up validation classes in pydantic, for data quality checks.

The first class is the base Pydantic model, which only declares the fields.

The second class is a stricter Pydantic model that inherits from the first one and adds field validators to check the format of the name, phone number and email.

In [5]:
from pydantic import BaseModel, Field, field_validator
import re

class UserInfo(BaseModel):
    """Base schema for the personal data extracted from the text.

    Every field is a required string; no format checks are applied here.
    The field descriptions are kept in Italian because they are part of the
    model's runtime schema.
    """

    # Full name of the person.
    name: str = Field(description="Nome completo della persona")
    # City of residence.
    city: str = Field(description="Città di residenza")
    # Phone number.
    phone_number: str = Field(description="Numero di telefono")
    # Email address.
    email: str = Field(description="Indirizzo email")
    # Italian fiscal code.
    fiscal_code: str = Field(description="Codice fiscale")

class UserInfoValidator(UserInfo):
    """Stricter variant of ``UserInfo`` that checks the format of some fields.

    ``name``, ``phone_number`` and ``email`` are validated with regular
    expressions; ``city`` and ``fiscal_code`` are inherited without extra
    checks. Each validator returns the value unchanged when it is valid and
    raises ``ValueError`` (with an Italian message) otherwise.

    ``re.fullmatch`` is used instead of ``re.match`` with a trailing ``$``
    because ``$`` also matches just before a final newline, which would let
    values such as ``"1234567890\\n"`` through.
    """

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Require exactly two capitalised words separated by one space."""
        # NOTE(review): the pattern is ASCII-only, so accented or multi-part
        # names (e.g. "Nicolò", "De Luca") are rejected — confirm this is intended.
        if not re.fullmatch(r"[A-Z][a-z]+ [A-Z][a-z]+", v):
            raise ValueError("Il nome deve essere in formato 'Nome Cognome'")
        return v

    @field_validator("phone_number")
    @classmethod
    def validate_phone_number(cls, v: str) -> str:
        """Require exactly ten ASCII digits."""
        # [0-9] rather than \d: \d also matches non-ASCII Unicode digits.
        if not re.fullmatch(r"[0-9]{10}", v):
            raise ValueError("Il numero di telefono deve essere lungo 10 cifre")
        return v

    @field_validator("email")
    @classmethod
    def validate_email(cls, v: str) -> str:
        """Require a basic ``local@domain.tld`` shape."""
        if not re.fullmatch(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9.-]+", v):
            raise ValueError("L'email non è valida")
        return v

Call the agents with the anonymized PII

In [None]:
from poc_gdpr.src.core import triage_agent
from agents import Runner

# The agent receives the anonymized text, i.e. placeholders instead of the detected PII.
# Top-level `await` relies on the notebook kernel's running event loop.
result = await Runner.run(triage_agent, presidio_anonymizer_res)
# `final_output` is the agent's raw answer; it is reused by the validation step below.
print(result.final_output)

[32m2025-05-18 17:30:21[0m [35mDAT32140[0m [34mhttpx[32240][0m [1;30mINFO[0m HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
[32m2025-05-18 17:30:22[0m [35mDAT32140[0m [34mhttpx[32240][0m [1;30mINFO[0m HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"


```json
{
    "name": "<PERSON_0>",
    "city": "<LOCATION_0>",
    "phone_number": "<PHONE_NUMBER_0>",
    "email": "<EMAIL_ADDRESS_0>",
    "fiscal_code": "<IT_FISCAL_CODE_0>"
}
```


[32m2025-05-18 17:30:23[0m [35mDAT32140[0m [34mhttpx[32240][0m [1;30mINFO[0m HTTP Request: POST https://api.openai.com/v1/traces/ingest "HTTP/1.1 204 No Content"
[32m2025-05-18 17:30:24[0m [35mDAT32140[0m [34mhttpx[32240][0m [1;30mINFO[0m HTTP Request: POST https://api.openai.com/v1/traces/ingest "HTTP/1.1 204 No Content"


Validate the output and de-anonymize the PII

In [7]:
from poc_gdpr.src.text_protector import JsonValidator

# Build the validator from the agent's raw output, the two Pydantic models and
# the placeholder mapping produced during anonymization.
json_validator = JsonValidator(
    pydantic_model_not_validated=UserInfo,
    pydantic_model_validated=UserInfoValidator,
    raw_json=result.final_output,
    entity_mapping=entity_mapping,
)

# Last expression of the cell, so its return value is shown as the cell output.
# NOTE(review): judging by the displayed output, validate() returns the fields
# with the original values restored — confirm against JsonValidator.
json_validator.validate()

{'name': 'Mario Rossi',
 'city': 'Roma',
 'phone_number': '1234567890',
 'email': 'mariorossi@gmail.com',
 'fiscal_code': 'RSSMRA85M01H501Z'}