In [60]:
import random
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.predefined_recognizers import EmailRecognizer, IpRecognizer, PhoneRecognizer, IbanRecognizer, CreditCardRecognizer, SpacyRecognizer

In [117]:
class PII:
    """Detect and anonymize personal data in Greek or English text with Presidio.

    Parameters
    ----------
    lang : str
        Language code handed to the analyzer. The engine is configured for
        ``"el"`` and ``"en"``.
    text : str
        Text to analyze. It is stored in ``self.text`` with a short
        whitespace prefix prepended (see the note in ``__init__``).
    dummies : bool
        Truthy: entities are replaced with realistic-looking dummy values.
        Falsy: entities are replaced with ``<...>`` placeholders.

    Attributes
    ----------
    entities : list of str
        Entity types requested from the analyzer.
    analyzer, anonymizer
        The Presidio analyzer and anonymizer engines used by ``anonymize``.
    operators : dict
        Mapping of entity type to the ``OperatorConfig`` applied to it.
    results : list
        Analyzer results of the last ``anonymize`` call (empty before it).
    anon_text
        Anonymizer result of the last ``anonymize`` call (``None`` before it).
    """

    # The analyzer does not depend on any constructor argument, and building
    # it loads both spaCy models, so it is built once and shared by instances.
    _shared_analyzer = None

    def __init__(self, lang: str, text: str, dummies: bool):
        # NOTE(review): this prefix (space, newline, eight spaces) is kept
        # exactly as in the original. It shifts every reported start/end
        # offset by ten characters and ends up in the anonymized output;
        # confirm why it is needed before removing it.
        triple_text = " \n        "
        self.text = triple_text + text
        self.lang = lang
        self.dummies = dummies

        # List of entities to look for.
        # NOTE(review): no phone, IBAN or credit-card entity type is listed
        # here, so the recognizers registered for them presumably never
        # contribute results - confirm whether that is intended.
        self.entities = ['NUMBERS_en', 'NUMBERS_el', 'PERSON',
                         'EMAIL_ADDRESS', 'IP_ADDRESS',
                         'LOCATION']

        # Analyzer (shared, built on first use) and anonymizer objects.
        owner = type(self)
        if owner._shared_analyzer is None:
            owner._shared_analyzer = self._build_analyzer()
        self.analyzer = owner._shared_analyzer
        self.anonymizer = AnonymizerEngine()

        # Truthiness decides the mode, so `operators` is always defined;
        # comparing against True/False left it unset for any other value.
        self.operators = self._build_operators(bool(dummies))

        # Defined up front so array() is safe to call before anonymize().
        self.results = []
        self.anon_text = None

    @staticmethod
    def _build_analyzer():
        """Create the Greek/English analyzer with predefined and custom recognizers."""
        # Model configuration for Greek and English.
        configuration = {"nlp_engine_name": "spacy",
                         "models": [{"lang_code": "el", "model_name": "el_core_news_lg"},
                                    {"lang_code": "en", "model_name": "en_core_web_lg"}]
                         }

        # NLP engine based on the configuration.
        nlp_engine_el_en = NlpEngineProvider(nlp_configuration=configuration).create_engine()

        # Catch-all pattern for any run of digits, registered once per language.
        numbers_pattern = Pattern(name="numbers_pattern", regex=r"\d+", score=0.2)

        custom_recognizers = [
            # Greek variants of the predefined recognizers.
            EmailRecognizer(supported_language="el", context=["μειλ"]),
            IpRecognizer(supported_language="el", context=["ip", "IP"]),
            PhoneRecognizer(supported_language="el",
                            context=["τηλέφωνο", "τηλεφωνο", "αριθμός", "αριθμος"]),
            IbanRecognizer(supported_language="el",
                           context=["ιβαν", "iban", "τράπεζα", "τραπεζα"]),
            PatternRecognizer(supported_entity="NUMBERS_el", patterns=[numbers_pattern],
                              supported_language='el'),
            PatternRecognizer(supported_entity="NUMBERS_en", patterns=[numbers_pattern],
                              supported_language='en'),
            CreditCardRecognizer(supported_language="el",
                                 context=["credit", "card", "visa", "mastercard", "cc",
                                          "amex", "discover", "jcb", "diners", "maestro",
                                          "instapayment",
                                          "πιστωτική", "πιστωτικη", "κάρτα", "καρτα"]),
            SpacyRecognizer(supported_language="el"),
        ]

        # Registry with the predefined recognizers plus the custom ones.
        registry = RecognizerRegistry()
        registry.load_predefined_recognizers()
        for recognizer in custom_recognizers:
            registry.add_recognizer(recognizer)

        return AnalyzerEngine(nlp_engine=nlp_engine_el_en,
                              supported_languages=["el", "en"],
                              registry=registry)

    @staticmethod
    def _build_operators(use_dummies):
        """Return the entity-type -> OperatorConfig mapping for the chosen mode."""
        # IP addresses are masked the same way in both modes.
        ip_mask = OperatorConfig("mask", {"type": "mask", "masking_char": "*",
                                          "chars_to_mask": 12,
                                          "from_end": True})

        if use_dummies:
            # The city is picked once here, so every location found in a
            # text is replaced by the same value.
            dummy_city = random.choice(["Athens", "Nicosia", "New York"])
            return {
                "PERSON": OperatorConfig("replace", {"new_value": "John Smith"}),
                "NUMBERS_en": OperatorConfig("replace", {"new_value": "0000"}),
                "NUMBERS_el": OperatorConfig("replace", {"new_value": "0000"}),
                "IP_ADDRESS": ip_mask,
                "LOCATION": OperatorConfig("replace", {"new_value": dummy_city}),
                "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "example@mail.com"}),
            }

        return {
            "PERSON": OperatorConfig("replace", {"new_value": "<ANONYMOUS>"}),
            "NUMBERS_en": OperatorConfig("replace", {"new_value": "<HIDDEN_NUMBER>"}),
            "NUMBERS_el": OperatorConfig("replace", {"new_value": "<HIDDEN_NUMBER>"}),
            "IP_ADDRESS": ip_mask,
            "LOCATION": OperatorConfig("replace", {"new_value": "<HIDDEN_LOCATION>"}),
            "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "<HIDDEN_EMAIL>"}),
        }

    def anonymize(self):
        """Analyze ``self.text`` and return the anonymized text.

        Stores the analyzer results in ``self.results`` and the anonymizer
        result in ``self.anon_text``.
        """
        self.results = self.analyzer.analyze(text=self.text,
                                             entities=self.entities,
                                             language=self.lang,
                                             return_decision_process=True)

        self.anon_text = self.anonymizer.anonymize(text=self.text,
                                                   analyzer_results=self.results,
                                                   operators=self.operators)
        return self.anon_text.text

    def array(self):
        """Return the results of the last ``anonymize`` call as a list of dicts.

        The list is empty when ``anonymize`` has not been called yet.
        """
        return [result.to_dict() for result in self.results]

def anonymize_user_input(lang, text, mode):
    """Anonymize one piece of user input and report what was found.

    Parameters
    ----------
    lang : str
        Language code passed to ``PII``.
    text : str
        Text to anonymize.
    mode : bool
        Passed to ``PII`` as ``dummies``.

    Returns
    -------
    tuple
        ``(anonymized_text, entities)``: the value returned by
        ``PII.anonymize()`` and the list returned by ``PII.array()``.
        Both used to be computed and then discarded.
    """
    user_input = PII(lang, text, mode)
    # anonymize() must run first: it produces the results array() reads.
    anonymized_text = user_input.anonymize()
    entities = user_input.array()
    return anonymized_text, entities

In [94]:
# Greek sample sentences: a location, a name, a card number, an IBAN and a date.
el = [
    'Ζω στο Ηράκλειο.',
    'Με λένε Γιάννη.',
    'Ο αριθμός της κάρτας μου είναι 5121892130209810.',
    'Το ΙΒΑΝ μου είναι GR63026137302100270290297088.',
    'Η ημερομηνία γεννησής μου είναι 23/01/1950.',
]

# English sample sentences: locations, a name, a dashed number and an IP address.
en = [
    'I live in Greece.',
    'My name is John.',
    'My number is 2310-555-999.',
    'My IP address is 192.168.0.0.',
    'I hope to travel from Cyprus to Rome for vacation.',
]

In [101]:
# Run every Greek sample through PII in placeholder mode (dummies=False) and
# append one row per sentence to `df`.
# NOTE(review): `df` is not created in any visible cell; it must already exist
# with five columns matching the row below - confirm where it is defined.
for sentence in el:
    test = PII('el',sentence, False)
    # The list is evaluated left to right, so anonymize() runs before array(),
    # which reads the results that anonymize() stores.
    df.loc[len(df)] = ['el', sentence, False, test.anonymize(), test.array()]

In [102]:
# Show the accumulated test results.
df

Unnamed: 0,Language,Text,Dummies,Final Text,Entities Found
0,en,I live in Greece.,True,\n I live in Nicosia.,"[{'entity_type': 'LOCATION', 'start': 20, 'end..."
1,en,My name is John.,True,\n My name is John Smith.,"[{'entity_type': 'PERSON', 'start': 21, 'end':..."
2,en,My number is 2310-555-999.,True,\n My number is 0000-0000-0000.,"[{'entity_type': 'NUMBERS_en', 'start': 23, 'e..."
3,en,My IP address is 192.168.0.0.,True,\n My IP address is ***********.,"[{'entity_type': 'IP_ADDRESS', 'start': 27, 'e..."
4,en,I hope to travel from Cyprus to Rome for vacat...,True,\n I hope to travel from Nicosia to Ni...,"[{'entity_type': 'LOCATION', 'start': 32, 'end..."
5,en,I live in Greece.,False,\n I live in <HIDDEN_LOCATION>.,"[{'entity_type': 'LOCATION', 'start': 20, 'end..."
6,en,My name is John.,False,\n My name is <ANONYMOUS>.,"[{'entity_type': 'PERSON', 'start': 21, 'end':..."
7,en,My number is 2310-555-999.,False,\n My number is <HIDDEN_NUMBER>-<HIDDE...,"[{'entity_type': 'NUMBERS_en', 'start': 23, 'e..."
8,en,My IP address is 192.168.0.0.,False,\n My IP address is ***********.,"[{'entity_type': 'IP_ADDRESS', 'start': 27, 'e..."
9,en,I hope to travel from Cyprus to Rome for vacat...,False,\n I hope to travel from <HIDDEN_LOCAT...,"[{'entity_type': 'LOCATION', 'start': 32, 'end..."


In [108]:
import re  # no longer used by this cell; kept in case later cells rely on it

# Remove the newline and surrounding whitespace that PII prepends to each text.
# The vectorized .str methods replace the per-row lambda and leave missing
# values untouched instead of raising on them.
df['Final Text'] = df['Final Text'].str.replace('\n', '', regex=False).str.strip()

In [114]:
# Write the results table to an Excel file in the current working directory.
df.to_excel('tests.xlsx')

In [116]:
# Inspect the entities found for the row labelled 7.
df.loc[7, 'Entities Found']

[{'entity_type': 'NUMBERS_en',
  'start': 23,
  'end': 27,
  'score': 0.5,
  'analysis_explanation': {'recognizer': 'PatternRecognizer', 'pattern_name': 'numbers_pattern', 'pattern': '\\d+', 'original_score': 0.5, 'score': 0.5, 'textual_explanation': None, 'score_context_improvement': 0, 'supportive_context_word': '', 'validation_result': None},
  'recognition_metadata': {'recognizer_name': 'PatternRecognizer',
   'recognizer_identifier': 'PatternRecognizer_1821961745168'}},
 {'entity_type': 'NUMBERS_en',
  'start': 28,
  'end': 31,
  'score': 0.5,
  'analysis_explanation': {'recognizer': 'PatternRecognizer', 'pattern_name': 'numbers_pattern', 'pattern': '\\d+', 'original_score': 0.5, 'score': 0.5, 'textual_explanation': None, 'score_context_improvement': 0, 'supportive_context_word': '', 'validation_result': None},
  'recognition_metadata': {'recognizer_name': 'PatternRecognizer',
   'recognizer_identifier': 'PatternRecognizer_1821961745168'}},
 {'entity_type': 'NUMBERS_en',
  'start'