In [7]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 279.3 kB/s eta 0:00:46
     --------------------------------------- 0.1/12.8 MB 774.0 kB/s eta 0:00:17
      -------------------------------------- 0.2/12.8 MB 888.4 kB/s eta 0:00:15
     - -------------------------------------- 0.3/12.8 MB 1.2 MB/s eta 0:00:11
     - -------------------------------------- 0.4/12.8 MB 1.3 MB/s eta 0:00:10
     - -------------------------------------- 0.5/12.8 MB 1.4 MB/s eta 0:00:09
     - -------------------------------------- 0.5/12.8 MB 1.4 MB/s eta 0:00:09
     -- ------------------------------------- 0.6/12

In [8]:
import json
import copy
from faker import Faker
import spacy
from collections import defaultdict

# Shared Faker instance used to generate every replacement value below.
fake = Faker()
# Small English spaCy pipeline; its named-entity recognizer drives anonymize_text().
nlp = spacy.load("en_core_web_sm")

# Sample input: JSON-style records taken from the synthetic data shared in the
# RAG channel. Each record holds a police record, an information sheet and a
# medical record for one case.
data_json = [
    {
        "seriousness": 5,
        "police_record": {
            "report_number": 40995708,
            "date_filed": "2024-07-21",
            "reporting_officer": "Lori Johns",
            "incident_type": "Vehicle Collision",
            "location": "36913 Atkinson Parks",
            "description": "Two-vehicle collision involving Nicole Bush. Moderate-speed collision (approximately 27 mph). Significant damage to vehicles.",
            "witness_statements": [
                "Kenneth Green: 'Time health to get face dream accept.'",
                "Sean Mccarthy: 'Miss rather woman until leg produce road.'"
            ]
        },
        "information_sheet": {
            "full_name": "Nicole Bush",
            "date_of_birth": "1960-09-21",
            "address": "00410 James Falls Apt. 816\nJacobton, FM 68526",
            "phone_number": "382.916.1238x3973",
            "email": "hthompson@example.com",
            "occupation": "Video editor",
            "employer": "Williams LLC",
            "insurance_provider": "Matthews-Kelley",
            "policy_number": 6204211294,
            "accident_date": "2024-07-21 00:00",
            "accident_location": "65525 Gentry Ports Suite 328"
        },
        "medical_record": {
            "patient_name": "Nicole Bush",
            "patient_id": 69636128,
            "date_of_birth": "1960-09-21",
            "gender": "Female",
            "accident_date": "2024-07-21 00:00",
            "injuries": [
                "Mild concussion",
                "Moderate concussion",
                "Sprained ankle"
            ]
        }
    }
]

# PII fields that are anonymized directly: maps a JSON key to the replacement
# type that anonymize_field() should generate for its value.
# NOTE(review): keys such as "policy_number", "report_number", "date_filed",
# "employer" and "insurance_provider" are not listed here and therefore pass
# through unchanged -- confirm that this is intended.
PII_FIELDS = {
    "full_name": "name",
    "date_of_birth": "date_of_birth",
    "address": "address",
    "phone_number": "phone_number",
    "email": "email",
    "patient_name": "name",
    "patient_id": "numeric_id",
    "reporting_officer": "name",
    "location": "address",
    "accident_location": "address",
}

# Free-text fields that require NER-based anonymization, written as
# (section, field) pairs.
TEXT_FIELDS = [
    ("police_record", "description"),
    ("police_record", "witness_statements")
]

# Lookup tables (field type -> original value -> fake value) so that a repeated
# original value always receives the same replacement.
mappings = defaultdict(dict)

def anonymize_field(original_value, field_type):
    """Return a fake stand-in for ``original_value``, stable per field type.

    The first time a given (field_type, original_value) pair is seen, a fake
    value is generated with Faker and stored in ``mappings``; later calls with
    the same pair return the stored value, so one original always maps to one
    replacement.

    Args:
        original_value: The real value to hide (must be hashable, since it is
            used as a dictionary key).
        field_type: One of "name", "address", "phone_number", "email",
            "date_of_birth" or "numeric_id". Any other type yields "ANON".

    Returns:
        The fake replacement value.
    """
    seen = mappings[field_type]
    if original_value in seen:
        return seen[original_value]

    # One generator per supported type; wrapped in lambdas so that nothing is
    # touched on the Faker instance until the chosen generator actually runs.
    generators = {
        "name": lambda: fake.name(),
        # Faker addresses are multi-line; flatten them onto a single line.
        "address": lambda: fake.address().replace("\n", ", "),
        "phone_number": lambda: fake.phone_number(),
        "email": lambda: fake.email(),
        "date_of_birth": lambda: fake.date_of_birth(minimum_age=18, maximum_age=90).isoformat(),
        "numeric_id": lambda: fake.unique.random_number(digits=8, fix_len=True),
    }
    make_fake = generators.get(field_type)
    replacement = "ANON" if make_fake is None else make_fake()

    seen[original_value] = replacement
    return replacement

def anonymize_text(text):
    """Replace named entities found in free text with fake values.

    The text is run through the spaCy pipeline ``nlp``; every entity labelled
    PERSON, GPE, ORG, DATE or ADDRESS is replaced. All replacements go through
    the shared ``mappings`` tables, so the same original entity always receives
    the same fake value -- including across different texts and, for names and
    addresses, across the structured fields handled by ``anonymize_field``.

    Args:
        text: The free-text string to anonymize. Empty/None input is returned
            unchanged.

    Returns:
        The text with the recognized entities replaced.
    """
    if not text:
        return text

    # The NER model sometimes attaches neighbouring punctuation to an entity
    # (e.g. "Kenneth Green:"). Trimming it keeps the punctuation in the output
    # and prevents it from polluting the mapping key.
    trim_chars = " \t\r\n:;,.!?'\"()[]"

    # NOTE(review): "ADDRESS" is kept from the original label list, but the
    # label set of en_core_web_sm does not appear to include it (locations are
    # reported as GPE/LOC/FAC) -- confirm whether LOC/FAC should be added.
    handled_labels = ("PERSON", "GPE", "ORG", "DATE", "ADDRESS")

    doc = nlp(text)
    anonymized_text = text
    # Walk the entities right-to-left: splicing near the end of the string does
    # not shift the character offsets of the entities still to be processed.
    for ent in reversed(doc.ents):
        label = ent.label_
        if label not in handled_labels:
            continue

        span = text[ent.start_char:ent.end_char]
        original = span.strip(trim_chars)
        if not original:
            continue
        start = ent.start_char + (len(span) - len(span.lstrip(trim_chars)))
        end = start + len(original)

        if label == "PERSON":
            fake_replacement = anonymize_field(original, "name")
        elif label in ("GPE", "ADDRESS"):
            fake_replacement = anonymize_field(original, "address")
        elif label == "ORG":
            # Cache organisations too, so a company mentioned twice is
            # replaced by the same fake company both times.
            known_orgs = mappings["org"]
            if original not in known_orgs:
                known_orgs[original] = fake.company()
            fake_replacement = known_orgs[original]
        else:  # label == "DATE"
            known_dates = mappings["date"]
            if original not in known_dates:
                known_dates[original] = fake.date()
            fake_replacement = known_dates[original]

        # Replace only the trimmed entity span inside the text.
        anonymized_text = anonymized_text[:start] + fake_replacement + anonymized_text[end:]
    return anonymized_text

def _is_text_field(parent_key, key):
    """Tell whether ``key`` (inside section ``parent_key``) is listed in TEXT_FIELDS."""
    if parent_key is None:
        # Section unknown (e.g. the function was called directly on a
        # sub-dictionary): match on the field name alone, which errs on the
        # side of anonymizing.
        return any(key == field for _, field in TEXT_FIELDS)
    return (parent_key, key) in TEXT_FIELDS


def _anonymize_pii_value(value, field_type, key):
    """Anonymize the value of a PII field, including list- or dict-valued ones."""
    if isinstance(value, list):
        # Lists are unhashable and cannot be used as mapping keys; anonymize
        # each element on its own instead.
        return [_anonymize_pii_value(item, field_type, key) for item in value]
    if isinstance(value, dict):
        return traverse_and_anonymize(value, key)
    return anonymize_field(value, field_type)


def traverse_and_anonymize(obj, parent_key=None):
    """Recursively walk a JSON-like structure and return an anonymized copy.

    Handling per dictionary key, in order of precedence:
      1. Keys in ``PII_FIELDS`` are replaced through ``anonymize_field``.
      2. ``accident_date`` is replaced with a random ``YYYY-MM-DD`` date; the
         same original date always maps to the same fake date, so a case stays
         consistent across its sections.
      3. Fields listed in ``TEXT_FIELDS`` as (section, field) are run through
         NER-based ``anonymize_text`` (strings, or each string in a list).
      4. Nested dictionaries and lists are traversed recursively.
      5. Everything else is copied unchanged.

    Args:
        obj: A dict, list or scalar taken from the parsed JSON data.
        parent_key: Key under which ``obj`` is stored in its enclosing dict.
            Used to match the section part of ``TEXT_FIELDS``; leave as None
            for the top-level call.

    Returns:
        A new structure of the same shape with the PII replaced. ``obj`` itself
        is not modified.
    """
    if isinstance(obj, list):
        # List items live in the same section as the list itself.
        return [traverse_and_anonymize(item, parent_key) for item in obj]
    if not isinstance(obj, dict):
        return obj

    new_obj = {}
    for key, value in obj.items():
        if key in PII_FIELDS:
            # Direct anonymization for specific PII fields.
            new_obj[key] = _anonymize_pii_value(value, PII_FIELDS[key], key)
        elif key == "accident_date":
            # Replace with a random date (date only, no time component), cached
            # so the same accident date is consistent everywhere it appears.
            known_dates = mappings["accident_date"]
            if value not in known_dates:
                known_dates[value] = fake.date(pattern="%Y-%m-%d")
            new_obj[key] = known_dates[value]
        elif _is_text_field(parent_key, key):
            # Free-text fields: NER-based anonymization.
            if isinstance(value, str):
                new_obj[key] = anonymize_text(value)
            elif isinstance(value, list):
                new_obj[key] = [anonymize_text(item) if isinstance(item, str) else item for item in value]
            else:
                new_obj[key] = value
        elif isinstance(value, (dict, list)):
            # Recurse into nested containers, remembering the section name.
            new_obj[key] = traverse_and_anonymize(value, key)
        else:
            new_obj[key] = value
    return new_obj

# Anonymize a deep copy of the input so that data_json itself is left untouched.
anonymized_data = traverse_and_anonymize(copy.deepcopy(data_json))

# Pretty-print the anonymized records as indented JSON.
print("Anonymized JSON Data:")
print(json.dumps(anonymized_data, indent=2))

Anonymized JSON Data:
[
  {
    "seriousness": 5,
    "police_record": {
      "report_number": 40995708,
      "date_filed": "2024-07-21",
      "reporting_officer": "Benjamin Jefferson",
      "incident_type": "Vehicle Collision",
      "location": "5724 Gomez Way Apt. 399, Maloneville, PR 18446",
      "description": "Two-vehicle collision involving Dale Nguyen. Moderate-speed collision (approximately 27 mph). Significant damage to vehicles.",
      "witness_statements": [
        "Sean Brown 'Time health to get face dream accept.'",
        "Kelsey Moore: 'Jonathan Ibarra rather woman until leg produce road.'"
      ]
    },
    "information_sheet": {
      "full_name": "Dale Nguyen",
      "date_of_birth": "1957-06-16",
      "address": "190 Thomas Plains, Coleville, ME 39617",
      "phone_number": "001-611-867-5624",
      "email": "bensontimothy@example.net",
      "occupation": "Video editor",
      "employer": "Williams LLC",
      "insurance_provider": "Matthews-Kelley",
   