In [91]:
from presidio_analyzer import AnalyzerEngine, Pattern, PatternRecognizer, RecognizerRegistry
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_analyzer.predefined_recognizers import EmailRecognizer, IpRecognizer, PhoneRecognizer, IbanRecognizer, CreditCardRecognizer, SpacyRecognizer


In [113]:
class Tran:
    """Detect and anonymize PII in Greek ("el") or English ("en") text.

    The constructor wires up a spaCy-backed Presidio ``AnalyzerEngine`` for
    both languages, registers Greek variants of several predefined
    recognizers plus a custom ``NUMBERS`` pattern recognizer, and prepares the
    per-entity anonymization operators. Call :meth:`anonymize` to run the
    pipeline on the stored text.

    Attributes:
        text: The text to analyze and anonymize.
        lang: Language code of ``text``; must be one of
            ``SUPPORTED_LANGUAGES`` when :meth:`anonymize` is called.
        entities: Entity types requested from the analyzer.
        analyzer: The configured ``AnalyzerEngine``.
        anonymizer: The ``AnonymizerEngine`` used to rewrite the text.
        operators: Mapping of entity type to its ``OperatorConfig``.
        results: Analyzer results of the last :meth:`anonymize` call.
        anon_text: Anonymizer result of the last :meth:`anonymize` call.
    """

    # Languages for which a spaCy model is configured in _build_nlp_engine.
    SUPPORTED_LANGUAGES = ("el", "en")

    def __init__(self, lang, text):
        """Store the input and build the analyzer/anonymizer pipeline.

        Args:
            lang: Language code of ``text`` ("el" or "en").
            text: The text that :meth:`anonymize` will process.
        """
        self.text = text
        self.lang = lang

        # List of entities to look for.
        # NOTE(review): the phone, IBAN and credit-card recognizers registered
        # in _build_registry emit entity types that are not in this list, so
        # they presumably never contribute results - confirm this is intended.
        self.entities = ['PERSON', 'NUMBERS',
                         'EMAIL_ADDRESS', 'IP_ADDRESS',
                         'LOCATION']

        # Analyzer and anonymizer objects
        self.analyzer = AnalyzerEngine(
            nlp_engine=self._build_nlp_engine(),
            supported_languages=list(self.SUPPORTED_LANGUAGES),
            registry=self._build_registry(),
        )
        self.anonymizer = AnonymizerEngine()

        # Defines how each detected entity type is rewritten.
        self.operators = self._build_operators()

    @staticmethod
    def _build_nlp_engine():
        """Create the spaCy NLP engine with the Greek and English models."""
        configuration = {
            "nlp_engine_name": "spacy",
            "models": [
                {"lang_code": "el", "model_name": "el_core_news_lg"},
                {"lang_code": "en", "model_name": "en_core_web_lg"},
            ],
        }
        return NlpEngineProvider(nlp_configuration=configuration).create_engine()

    @staticmethod
    def _build_registry():
        """Create the recognizer registry: predefined plus Greek recognizers."""
        registry = RecognizerRegistry()
        registry.load_predefined_recognizers()

        # Raw string so that "\d" reaches the regex engine instead of being
        # treated as an (invalid) Python string escape.
        numbers_pattern = Pattern(name="numbers_pattern", regex=r"\d+", score=0.1)

        # Greek recognizers, added in the same order as before.
        greek_recognizers = [
            EmailRecognizer(supported_language="el", context=["μειλ"]),
            IpRecognizer(supported_language="el", context=["ip", "IP"]),
            PhoneRecognizer(
                supported_language="el",
                context=["τηλέφωνο", "τηλεφωνο", "αριθμός", "αριθμος"],
            ),
            IbanRecognizer(
                supported_language="el",
                context=["ιβαν", "iban", "τράπεζα", "τραπεζα"],
            ),
            PatternRecognizer(
                supported_entity="NUMBERS",
                patterns=[numbers_pattern],
                supported_language='el',
            ),
            CreditCardRecognizer(
                supported_language="el",
                context=["credit", "card", "visa", "mastercard", "cc",
                         "amex", "discover", "jcb", "diners", "maestro",
                         "instapayment",
                         "πιστωτική", "πιστωτικη", "κάρτα", "καρτα"],
            ),
            SpacyRecognizer(
                supported_language="el",
                supported_entities=['PERSON', 'LOCATION'],
            ),
        ]
        for recognizer in greek_recognizers:
            registry.add_recognizer(recognizer)
        return registry

    @staticmethod
    def _build_operators():
        """Return the entity-type -> ``OperatorConfig`` anonymization mapping."""
        return {
            "PERSON": OperatorConfig("replace", {"new_value": "<ANONYMOUS>"}),
            "NUMBERS": OperatorConfig("replace", {"new_value": "<HIDDEN>"}),
            # Mask the last 12 characters of the match with '*'.
            "IP_ADDRESS": OperatorConfig("mask", {"type": "mask",
                                                  "masking_char": "*",
                                                  "chars_to_mask": 12,
                                                  "from_end": True}),
            "LOCATION": OperatorConfig("replace", {"new_value": "<HIDDEN>"}),
            "EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "<HIDDEN_EMAIL>"}),
        }

    def anonymize(self):
        """Analyze ``self.text`` in ``self.lang`` and return the anonymized result.

        The analyzer results are stored on ``self.results`` and the anonymizer
        result on ``self.anon_text`` before the latter is returned.

        Returns:
            Whatever ``self.anonymizer.anonymize`` returns for the stored text.

        Raises:
            ValueError: If ``self.lang`` is not one of ``SUPPORTED_LANGUAGES``.
                Previously this case fell through and silently returned
                ``None``.
        """
        if self.lang not in self.SUPPORTED_LANGUAGES:
            raise ValueError(
                f"Unsupported language {self.lang!r}; "
                f"expected one of {self.SUPPORTED_LANGUAGES}"
            )

        # Both languages share the same pipeline; only the language code differs.
        self.results = self.analyzer.analyze(text=self.text,
                                             entities=self.entities,
                                             language=self.lang)
        self.anon_text = self.anonymizer.anonymize(text=self.text,
                                                   analyzer_results=self.results,
                                                   operators=self.operators)
        return self.anon_text
        



In [114]:
# Greek sample sentence ("My name is Giannis") used to exercise the pipeline.
text = 'Με λένε Γιάννη'

# Build the anonymizer for Greek input; keyword arguments make the
# (lang, text) argument order explicit.
test = Tran(lang='el', text=text)

In [115]:
# Run analysis + anonymization on the sample; as the last expression of the
# cell, the returned value is echoed by the notebook.
test.anonymize()

text: Με λένε Γιάννη
items:
[
    
]

In [110]:
import spacy

# Sanity check: run the Greek spaCy model directly, outside Presidio.
# NOTE: despite its name, `english_nlp` holds the Greek model.
english_nlp = spacy.load('el_core_news_lg')

# NOTE(review): this sample is wrapped in newlines, unlike the single-line
# string passed to Tran earlier, so the two runs do not use identical input.
text = '''
Με λένε Γιάννη
'''

# Parse the sample with the loaded pipeline.
spacy_parser = english_nlp(text)

# Print every named entity found, with its label.
for entity in spacy_parser.ents:
    print(f'Found: {entity.text} of type: {entity.label_}')

Found: Γιάννη of type: PERSON
