In [1]:
import translators as ts
import translators.server as tss
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorResult, OperatorConfig
from presidio_analyzer import PatternRecognizer
import random


Using region Attica server backend.


In [86]:
class Tran:
    """Translate text to English, anonymize PII in it, and translate it back.

    The intended call order is ``translate()`` -> ``anonymize()`` ->
    ``tran_back()``.  Each later step runs the missing earlier step on demand,
    so calling them out of order no longer fails with ``AttributeError``.

    Attributes:
        text: the original text, in the language given by ``lang``.
        lang: language code of ``text`` as understood by the translation backend.
        translated_txt: English translation of ``text`` (``None`` until translated).
        results: analyzer results for ``translated_txt`` (``None`` until anonymized).
        anon_text: anonymizer result object; its ``.text`` holds the anonymized
            English text (``None`` until anonymized).
        analyzer / anonymizer: the Presidio engines, set by ``anonymize()``.
    """

    # Entity types the analyzer is asked to report.
    ENTITIES = ['PERSON', 'PHONE_NUMBER', 'CREDIT_CARD',
                'EMAIL_ADDRESS', 'IP_ADDRESS', 'NRP',
                'LOCATION', 'BLOOD_TYPE']

    # Values matched verbatim by the custom BLOOD_TYPE recognizer.
    BLOOD_TYPES = ["A-", "A+", "B-", "B+", "AB-", "AB+", "O-", "O+"]

    # Engines are built once and shared by every instance: constructing them
    # for each message is wasteful, and the custom recognizer only needs to be
    # registered a single time.
    _shared_analyzer = None
    _shared_anonymizer = None

    def __init__(self, lang, text):
        self.text = text
        self.lang = lang
        # Pipeline outputs; filled in by translate() / anonymize().
        self.translated_txt = None
        self.results = None
        self.anon_text = None

    def translate(self):
        """Translate ``self.text`` from ``self.lang`` to English and return it."""
        # The keyword is `translator`; the previous spelling `translater` was not
        # the backend-selection argument, so the Google backend was never chosen.
        self.translated_txt = ts.translate_text(self.text, translator='google',
                                                from_language=self.lang,
                                                to_language='en')
        return self.translated_txt

    @classmethod
    def _engines(cls):
        """Return the shared (analyzer, anonymizer) pair, creating it on first use."""
        if cls._shared_analyzer is None:
            analyzer = AnalyzerEngine()
            # Custom deny-list recognizer for an entity Presidio lacks by default.
            blood_type_recognizer = PatternRecognizer(supported_entity="BLOOD_TYPE",
                                                      deny_list=list(cls.BLOOD_TYPES))
            analyzer.registry.add_recognizer(blood_type_recognizer)
            cls._shared_anonymizer = AnonymizerEngine()
            # Publish the analyzer last so a failure above leaves the cache empty.
            cls._shared_analyzer = analyzer
        return cls._shared_analyzer, cls._shared_anonymizer

    @staticmethod
    def _operators():
        """Build the per-entity anonymization operators."""
        return {
            # Fallback for any entity type without an explicit entry below.
            "DEFAULT": OperatorConfig("replace", {"new_value": "<ANONYMIZED>"}),
            "PERSON": OperatorConfig("replace", {"new_value": "<ANONYMOUS>"}),
            "CREDIT_CARD": OperatorConfig("mask", {"type": "mask", "masking_char": "*",
                                                   "chars_to_mask": 12,
                                                   "from_end": True}),
            "PHONE_NUMBER": OperatorConfig("mask", {"type": "mask", "masking_char": "*",
                                                    "chars_to_mask": 8,
                                                    "from_end": False}),
            "IP_ADDRESS": OperatorConfig("mask", {"type": "mask", "masking_char": "*",
                                                  "chars_to_mask": 12,
                                                  "from_end": True}),
            "NRP": OperatorConfig("redact"),
            "LOCATION": OperatorConfig("replace", {"new_value": "<HIDDEN>"}),
            "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "<HIDDEN_EMAIL>"}),
            "BLOOD_TYPE": OperatorConfig("replace", {"new_value": "<HIDDEN_BLOOD_TYPE>"}),
        }

    def anonymize(self):
        """Detect and anonymize PII in the English translation.

        Translates first if ``translate()`` has not been called yet.

        Returns:
            The anonymized English text (also available as ``self.anon_text.text``).
        """
        if self.translated_txt is None:
            self.translate()

        self.analyzer, self.anonymizer = self._engines()

        self.results = self.analyzer.analyze(text=self.translated_txt,
                                             entities=self.ENTITIES,
                                             language='en',
                                             return_decision_process=True)
        self.anon_text = self.anonymizer.anonymize(text=self.translated_txt,
                                                   analyzer_results=self.results,
                                                   operators=self._operators())
        return self.anon_text.text

    def tran_back(self):
        """Translate the anonymized English text back to ``self.lang`` and return it.

        Anonymizes first if ``anonymize()`` has not been called yet.
        """
        if self.anon_text is None:
            self.anonymize()
        self.tran_back_text = ts.translate_text(self.anon_text.text, translator='google',
                                                from_language='en',
                                                to_language=self.lang)
        return self.tran_back_text

        
    

In [164]:
greek_text = 'Με λένε Γιάννη'

# Translate to English first; the analyzer below is run with language='en'.
# (The keyword is `translator`, not `translater`.)
english_text = ts.translate_text(greek_text, translator='google',
                                 from_language='el',
                                 to_language='en')

analyzer = AnalyzerEngine()

results = analyzer.analyze(text=english_text,
                           language='en',
                           return_decision_process=True)

anonymizer = AnonymizerEngine()

# Kept for inspection in the following cells.
anon_text = anonymizer.anonymize(text=english_text, analyzer_results=results)

# Substitute every detected span in the original Greek sentence.  Each analyzer
# result carries its own entity type, so the placeholder is derived from the
# result itself rather than from `anon_text.items[i]`: the two lists are not
# guaranteed to share an order, and pairing them by index can mislabel spans.
# `<ENTITY_TYPE>` is what the anonymizer emits when no operators are configured.
greek_anonymized = greek_text
for result in results:
    start, end = result.start, result.end
    pii_translated = ts.translate_text(english_text[start:end], translator='google',
                                       from_language='en',
                                       to_language='el')
    # NOTE: a back-translated name may come out in a different grammatical form
    # than in the original sentence, in which case nothing is replaced.
    greek_anonymized = greek_anonymized.replace(pii_translated, f'<{result.entity_type}>')

greek_anonymized

'Με λένε Γιάννη'

In [162]:
# Replacement text of the first item reported by the anonymizer.
anon_text.items[0].text

'<LOCATION>'

In [132]:
# First analyzer result (entity type, character span and score).
results[0]

type: PERSON, start: 11, end: 15, score: 0.85

In [85]:
# Show the first analyzer result as a dict, including the decision-process
# explanation requested via return_decision_process=True.
# NOTE(review): `obj` and `txt` are not defined in any visible cell; presumably
# `obj` is a Tran instance on which anonymize() was already run — confirm, since
# this cell fails on a fresh kernel otherwise.
obj.analyzer.analyze(text=txt,
                     language='en',
                     return_decision_process=True)[0].to_dict()


[2023-06-26 19:06:12,462][decision_process][INFO][None][nlp artifacts:{"entities": ["Ioannis"], "tokens": ["My", "name", "is", "Ioannis"], "lemmas": ["my", "name", "be", "Ioannis"], "tokens_indices": [0, 3, 8, 11], "keywords": ["ioannis"]}]
[2023-06-26 19:06:12,465][decision_process][INFO][None][["{'entity_type': 'PERSON', 'start': 11, 'end': 18, 'score': 0.85, 'analysis_explanation': {'recognizer': 'SpacyRecognizer', 'pattern_name': None, 'pattern': None, 'original_score': 0.85, 'score': 0.85, 'textual_explanation': \"Identified as PERSON by Spacy's Named Entity Recognition\", 'score_context_improvement': 0, 'supportive_context_word': '', 'validation_result': None}, 'recognition_metadata': {'recognizer_name': 'SpacyRecognizer', 'recognizer_identifier': 'SpacyRecognizer_2478753270944'}}"]]


{'entity_type': 'PERSON',
 'start': 11,
 'end': 18,
 'score': 0.85,
 'analysis_explanation': {'recognizer': 'SpacyRecognizer', 'pattern_name': None, 'pattern': None, 'original_score': 0.85, 'score': 0.85, 'textual_explanation': "Identified as PERSON by Spacy's Named Entity Recognition", 'score_context_improvement': 0, 'supportive_context_word': '', 'validation_result': None},
 'recognition_metadata': {'recognizer_name': 'SpacyRecognizer',
  'recognizer_identifier': 'SpacyRecognizer_2478753270944'}}

In [42]:
import re

# Collect the placeholder names, i.e. whatever sits between each '<' and the
# next '>' in the anonymized text (non-greedy match).
# NOTE(review): `obj` is not defined in any visible cell — confirm where it comes from.
re.findall(r"<(.*?)>",obj.anon_text.text)

['ANONYMOUS']

In [31]:
def process_user_input():
    """Run an interactive chat loop that echoes each reply back with PII anonymized.

    Asks for a language (Greek, Romanian or Italian, by name or menu number),
    then repeatedly shows a random prompt, reads the reply, and prints it
    translated to English, anonymized and translated back.  The loop ends when
    the English translation of a reply is one of the exit phrases.

    Returns:
        None.  If the chosen language is not supported, a message is printed
        and the function returns immediately instead of entering the loop.
    """
    # Accepted answers to the language menu -> translation language code.
    language_codes = {
        'greek': 'el', '1': 'el',
        'romanian': 'ro', '2': 'ro',
        'italian': 'it', '3': 'it',
    }
    exit_phrases = ['goodbye', 'bye', 'good bye', 'see you']
    chatbot_prompts = ['What is your name?', 'I need your credit card number to make this purchase',
                       'What email address should I send this to?', 'Tell me about yourself',
                       'What number should they call you at?', 'Where do you live?',
                       'What is your blood type?', 'What IP address are you connected from?']

    lang_input = input('Choose language: \n\
    1) Greek \n\
    2) Romanian \n\
    3) Italian \n')
    chosen_lang = language_codes.get(lang_input.strip().lower())
    if chosen_lang is None:
        # Previously execution fell through to the loop with `chosen_lang`
        # unbound and crashed on the first reply.
        print('Language not supported')
        return

    while True:
        user_input = input(f'{random.choice(chatbot_prompts)}: ')
        if not user_input.strip():
            # Nothing to translate or anonymize; ask again.
            continue
        user_tran = Tran(chosen_lang, user_input)
        translated = user_tran.translate()
        # Ignore surrounding whitespace and trailing '.'/'!' so that e.g.
        # "Goodbye!" coming back from the translator still ends the session.
        if translated.strip().rstrip('.!').lower() in exit_phrases:
            print('Goodbye!')
            break
        user_tran.anonymize()
        print(user_tran.tran_back())
        

In [33]:
# Start the interactive session; it reads from standard input until an exit phrase is entered.
process_user_input()

Ονομάζομαι __________ <ANONYMOUS>
ok αριθμός πιστωτικής κάρτας μου είναι 5167 32*********
Ονομάζομαι __________ <ANONYMOUS>
Στείλτε το στο <HIDDEN_EMAIL>
Είμαι συνδεδεμένος από ***********
Είμαι α και ψηφίζω υπέρ
Στείλτε το στο <HIDDEN_EMAIL>
καλέστε με στο ********5121
Goodbye!
