**First run these utility functions...**

In [2]:
from typing import List

from presidio_analyzer import EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpEngine, SpacyNlpEngine, NlpArtifacts

class SpacyMagic(object):
    """
    Per-process cache of loaded spaCy pipelines, keyed by model name.

    A model is loaded the first time it is requested; every later request
    for the same name returns the pipeline that is already in memory.
    >>> SpacyMagic.get("en")
    <spacy.en.English ...
    """
    # Maps the name passed to ``get`` to the pipeline loaded for it.
    _spacys = {}

    @classmethod
    def get(cls, lang):
        """
        Return the spaCy pipeline for ``lang``, loading it on first use.

        When a model is loaded, a ``sentencizer`` component is added before
        the ``parser`` component. Its ``punct_chars`` are the sentencizer's
        default characters plus the newline character, each repeated one to
        four times.
        """
        # Fast path: the model has already been loaded in this process.
        if lang in cls._spacys:
            return cls._spacys[lang]

        # Imported lazily so that defining this class does not require spaCy.
        import spacy

        from typing import List, Optional, Dict, Any
        from spacy.pipeline.sentencizer import Sentencizer

        def make_sentencizer_config(repeats: int = 4, extra_punct_chars: Optional[List] = None) -> Dict[str, Any]:
            # Build the ``punct_chars`` list: for every repeat count from 1 to
            # ``repeats``, the default characters first, then the extra ones.
            marks = []
            for count in range(1, repeats + 1):
                marks.extend(mark * count for mark in Sentencizer.default_punct_chars)
                if extra_punct_chars:
                    marks.extend(mark * count for mark in extra_punct_chars)
            return {"punct_chars": marks}

        pipeline = spacy.load(lang)
        pipeline.add_pipe(
            factory_name="sentencizer",
            config=make_sentencizer_config(repeats=4, extra_punct_chars=["\n"]),
            before="parser",
        )

        # Remember the fully configured pipeline for subsequent calls.
        cls._spacys[lang] = pipeline
        return cls._spacys[lang]

class OrgRecognizer(EntityRecognizer):
    """Recognizer that reports runs of tokens tagged ``ORG`` as ``ORG`` results."""

    def load(self) -> None:
        """Nothing needs to be loaded for this recognizer."""
        return None

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts) -> List[RecognizerResult]:
        """
        Find spans of text that represent company names and organizations.

        Consecutive tokens in ``nlp_artifacts.tokens`` whose ``ent_type_`` is
        ``"ORG"`` are merged into a single span running from the first
        token's ``idx`` to the end of the last token. ``text`` and
        ``entities`` are not consulted.

        :param text: The text being analyzed (unused here).
        :param entities: The requested entity types (unused here).
        :param nlp_artifacts: NLP output whose ``tokens`` are scanned.
        :return: One ``RecognizerResult`` of type ``ORG`` with score 0.99 per
            merged span, in order of appearance.
        """
        spans = []
        open_span = None  # (start, end) of the ORG run in progress, if any

        for token in nlp_artifacts.tokens:
            if token.ent_type_ != "ORG":
                # A non-ORG token terminates whatever run was in progress.
                if open_span is not None:
                    spans.append(open_span)
                    open_span = None
                continue

            # Extending to the end of this token also covers any characters
            # (e.g. whitespace) between it and the previous ORG token.
            token_end = token.idx + len(token)
            if open_span is None:
                open_span = (token.idx, token_end)
            else:
                open_span = (open_span[0], token_end)

        # The token stream may end while a run is still open.
        if open_span is not None:
            spans.append(open_span)

        return [
            RecognizerResult(
                entity_type="ORG",
                start=span_start,
                end=span_end,
                score=0.99,
            )
            for span_start, span_end in spans
        ]

class AnalyzerMagic(object):
    """
    Per-process holder for one lazily created Presidio ``AnalyzerEngine``.
    """

    # The shared analyzer; stays ``None`` until ``get`` is first called.
    _analyzer = None

    @classmethod
    def get(cls, lang: str):
        """
        Return the shared analyzer, creating it on the first call.

        ``lang`` is handed to ``SpacyMagic.get`` to obtain the spaCy
        pipeline, which is then registered with the NLP engine under the
        language key ``"en"``. Once an analyzer exists it is returned as is,
        so ``lang`` only has an effect on the call that creates it.
        """
        # Fast path: reuse the analyzer built earlier in this process.
        if cls._analyzer:
            return cls._analyzer

        import spacy
        from presidio_analyzer import AnalyzerEngine
        from presidio_analyzer.nlp_engine import SpacyNlpEngine
        import tldextract

        # Make sure that tldextract does not try to fetch data.
        # NOTE(review): the instance created here is not used afterwards;
        # kept to preserve existing behaviour - confirm whether it is needed.
        tldextract.TLDExtract(suffix_list_urls=())

        class LoadedSpacyNlpEngine(SpacyNlpEngine):
            """NLP engine wrapping an already loaded spaCy pipeline."""

            def __init__(self, loaded_spacy_model):
                # The parent initialiser is intentionally not invoked; only
                # the ``nlp`` mapping is populated.
                self.nlp = {"en": loaded_spacy_model}

        engine = LoadedSpacyNlpEngine(loaded_spacy_model=SpacyMagic.get(lang))
        new_analyzer = AnalyzerEngine(nlp_engine=engine)

        # Register the custom organisation recognizer alongside the
        # recognizers the engine already has.
        new_analyzer.registry.add_recognizer(
            OrgRecognizer(supported_entities=["ORG"])
        )

        cls._analyzer = new_analyzer
        return cls._analyzer

    @staticmethod
    def library_config()-> None:
        """
        Configure tldextract: set the ``TLDEXTRACT_CACHE_TIMEOUT`` environment
        variable and clear the suffix-list URLs of its module-level extractor.
        """
        import os
        import tldextract

        # Set timeout variable
        os.environ["TLDEXTRACT_CACHE_TIMEOUT"] = "1.0"

        # Make sure that tldextract does not try to fetch data
        tldextract.tldextract.TLD_EXTRACTOR.suffix_list_urls = ()
        # NOTE(review): this instance is discarded immediately; kept to
        # preserve existing behaviour - confirm whether it is needed.
        tldextract.TLDExtract(suffix_list_urls=())

class AnonymizerMagic(object):
    """
    Per-process holder for one lazily created Presidio ``AnonymizerEngine``.
    """

    # The shared anonymizer; stays ``None`` until ``get`` is first called.
    _anonymizer = None

    @classmethod
    def get(cls):
        """Return the shared anonymizer, creating it on the first call."""
        # Fast path: reuse the anonymizer built earlier in this process.
        if cls._anonymizer:
            return cls._anonymizer

        # Imported lazily so that defining this class does not require Presidio.
        from presidio_anonymizer import AnonymizerEngine

        cls._anonymizer = AnonymizerEngine()
        return cls._anonymizer



StatementMeta(sparkm, 17, 2, Finished, Available)

**This is the main anonymisation function. Customise as necessary...**

In [3]:
import pyspark.sql.functions as F
import hashlib
from pyspark.sql.types import *


from presidio_anonymizer.entities import OperatorConfig

@F.udf(returnType=StringType())
def anonymizeText(text):
    """
    Spark UDF that scrambles the ASCII letters of ``text``.

    The whole value is handed to the Presidio anonymizer as a single
    ``DEFAULT`` span. Its custom operator replaces every ASCII letter with a
    letter taken from the SHA-1 hex digest of the value, preserving case.
    Digits, punctuation, whitespace and non-ASCII characters are left
    untouched. Identical inputs always produce identical outputs.

    :param text: The cell value to anonymize.
    :return: ``text`` unchanged when it is null or empty; otherwise the
        scrambled text, or the literal string ``"Exception"`` when the
        anonymizer raises.
    """
    # Null and empty values pass through untouched.
    if not text:
        return text

    # Get the (cached) Presidio Analyzer and Anonymizer
    AnalyzerMagic.library_config()
    analyzer = AnalyzerMagic.get("en_core_web_lg")
    anonymizer = AnonymizerMagic.get()

    # Analyze text.
    # NOTE(review): these results are currently NOT passed to the anonymizer
    # (see below), so this call only matters if the function is customised to
    # anonymize detected entities instead of the whole value.
    analyzer_results = analyzer.analyze(
        text=text,
        entities=[
            "CREDIT_CARD",
            "CRYPTO",
            "EMAIL_ADDRESS",
            "IBAN_CODE",
            "PERSON",
            "PHONE_NUMBER",
            "MEDICAL_LICENSE",
            "URL",
            "US_BANK_NUMBER",
            "US_DRIVER_LICENSE",
            "US_ITIN",
            "US_PASSPORT",
            "US_SSN",
            "UK_NHS",
            "NIF",
            "FIN/NRIC",
            "AU_ABN",
            "AU_ACN",
            "AU_TFN",
            "AU_MEDICARE",
            "ORG"
        ],
        language="en"
    )

    # Entity type -> short label. The keys also drive which operators are
    # registered below; the labels are available for customised placeholders.
    mapping = {
        "CREDIT_CARD": "creditcard",
        "CRYPTO": "crypto",
        "EMAIL_ADDRESS": "email",
        "IBAN_CODE": "iban",
        "IP_ADDRESS": "ipaddress",
        "LOCATION": "location",
        "PERSON": "person",
        "PHONE_NUMBER": "phone",
        "MEDICAL_LICENSE": "medical",
        "URL": "url",
        "US_BANK_NUMBER": "usbank",
        "US_DRIVER_LICENSE": "usdriver",
        "US_ITIN": "usitin",
        "US_PASSPORT": "uspassport",
        "US_SSN": "usssn",
        "UK_NHS": "uknhs",
        "NIF": "nif",
        "FIN/NRIC": "finnric",
        "AU_ABN": "auabn",
        "AU_ACN": "auacn",
        "AU_TFN": "autfn",
        "AU_MEDICARE": "aumedicare",
        "DEFAULT": "other",
        "ORG": "org"
    }

    lowercase = "abcdefghijklmnopqrstuvwxyz"
    uppercase = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def get_placeholder(operator: str, item: str)-> str:
        """
        Return ``item`` with its ASCII letters substituted.

        ``operator`` is the entity type the placeholder is produced for. It
        does not influence the result at present; it is kept so that a
        customised version can use e.g. ``mapping[operator]``.
        """
        # The letters (a-f) of the item's SHA-1 hex digest seed the
        # substitution alphabet, so the table differs from item to item but
        # is stable for equal items.
        digest = hashlib.sha1(item.encode("UTF-8")).hexdigest()
        letters = "".join(ch for ch in digest if not ch.isdigit())
        if not letters:
            # Digest made of digits only (extremely unlikely): derive letters
            # from the digits so a table can still be built.
            letters = "".join("abcdef"[int(ch) % 6] for ch in digest)

        # Repeat the letters until all 26 alphabet positions are covered.
        # A fixed three-fold repeat is too short when the digest holds fewer
        # than nine letters, and str.maketrans rejects unequal lengths.
        repeats = -(-len(lowercase) // len(letters))  # ceiling division
        substitutes = (letters * repeats)[:len(lowercase)]

        table = str.maketrans(
            lowercase + uppercase,
            substitutes.lower() + substitutes.upper(),
        )
        return item.translate(table)

    # One custom operator per known entity type. ``entity`` is bound as a
    # default argument so every lambda keeps its own entity type.
    operators = {
        entity: OperatorConfig(
            "custom",
            {"lambda": lambda x, entity=entity: get_placeholder(entity, x)},
        )
        for entity in mapping
    }

    # Anonymize Text
    try:
        anonymizer_result = anonymizer.anonymize(
            text=text,
            # The entire value is treated as one DEFAULT span. Pass
            # ``analyzer_results`` here instead to anonymize only the
            # entities found by the analyzer.
            analyzer_results=[RecognizerResult('DEFAULT', 0, len(text), 0.85)],
            operators=operators,
        )
        return anonymizer_result.text
    except Exception:
        # Best effort: a failing value is flagged rather than failing the
        # whole Spark job. Unlike a bare ``except``, this lets SystemExit
        # and KeyboardInterrupt propagate.
        return "Exception"


StatementMeta(sparkm, 17, 3, Finished, Available)

**Create sample data and show output before running the anonymisation routine...**

In [7]:
from pyspark.sql.types import StructType,StructField, StringType, IntegerType
# Sample rows: (firstname, middlename, lastname, id, gender, salary)
data2 = [
    ("James", "", "Smith", "36636", "M", 3000),
    ("Michael", "John", "Riley", "40288", "M", 4000),
    ("Robert", "", "Williams", "42114", "M", 4000),
    ("Maria", "Anne", "Jones", "39192", "F", 4000),
    ("Jen", "Mary", "Brown", "", "F", -1),
]

# Every column is nullable; all are strings except the integer salary.
schema = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("firstname", StringType),
        ("middlename", StringType),
        ("lastname", StringType),
        ("id", StringType),
        ("gender", StringType),
        ("salary", IntegerType),
    )
])

df = spark.createDataFrame(data=data2, schema=schema)

# Show the data as it looks before anonymisation
# (df.printSchema() / df.show(truncate=False) are plain-text alternatives).
display(df)

StatementMeta(sparkm, 17, 7, Finished, Available)

SynapseWidget(Synapse.DataFrame, 0a414ff9-c952-4319-bee8-46cf79b41d44)

**Now specify the columns to be anonymised and run the anonymisation routine for each of the columns...**

In [8]:
# Names of the columns whose values are replaced by anonymizeText.
columnstoanonymize = ['firstname','lastname']

# Walk the DataFrame's columns in their existing order and overwrite each
# selected column in place with its anonymised version.
for col_name in df.columns:
    if col_name not in columnstoanonymize:
        continue
    df = df.withColumn(col_name, anonymizeText(F.col(col_name)))

display(df)

StatementMeta(sparkm, 17, 8, Finished, Available)

SynapseWidget(Synapse.DataFrame, 6a145481-808b-4afc-ae84-df166cd9c79b)