In [41]:
from konfuzio_sdk.data import Document
from konfuzio_sdk.regex import suggest_regex_for_string

# Simple approach to validate user input

## Validate E-Mails
Assume you have a list of valid and a list of invalid email addresses and want to build logic that validates a given email address.

In [42]:
# Example addresses as seen on https://help.xmatters.com/ondemand/trial/valid_email_format.htm
# Addresses that the validation logic should accept.
valid_emails = [
    'abc-d@mail.com',
    'abc.def@mail.com',
    'abc@mail.com',
    'abc_def@mail.com',
    'abc.def@mail.cc',
    'abc.def@mail-archive.com',
    'abc.def@mail.org',
    'abc.def@mail.com',
]
# Addresses that the validation logic should reject.
invalid_emails = [
    'abc-@mail.com',
    'abc..def@mail.com',
    '.abc@mail.com',
    'abc#def@mail.com',
    'abc.def@mail.c',
    'abc.def@mail#archive.com',
    'abc.def@mail',
    'abc.def@mail..com',
    'mailto:name@mail.com',
]

In [43]:
# Try the regex suggestion on a single address first.
email = 'hello@konfuzio.c'
# The last expression of the cell is displayed as its output.
suggest_regex_for_string(email, replace_characters=True)

'[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]'

This return value is not what we are looking for, so let's also replace the characters.

In [44]:
# Suggest one regex per valid address, with replace_characters enabled.
valid = [
    suggest_regex_for_string(address, replace_characters=True)
    for address in valid_emails
]
valid

['[a-zäöüß]+[-][a-zäöüß]\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+_[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+[-][a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+']

In [45]:
# Suggest one regex per invalid address, with replace_characters enabled.
invalid = [
    suggest_regex_for_string(address, replace_characters=True)
    for address in invalid_emails
]
invalid

['[a-zäöüß]+[-]\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\.\\.[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '\\.[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\#[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+\\#[a-zäöüß]+\\.[a-zäöüß]+',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+',
 '[a-zäöüß]+\\.[a-zäöüß]+\\@[a-zäöüß]+\\.\\.[a-zäöüß]+',
 '[a-zäöüß]+:[a-zäöüß]+\\@[a-zäöüß]+\\.[a-zäöüß]+']

To make sure the validation contains no contradiction, we check that the set of valid patterns and the set of invalid patterns do not intersect.

In [46]:
# Patterns suggested for both a valid and an invalid address; an empty set means no contradiction.
set(invalid).intersection(valid)

set()

## Combine regexes and detect all potential matches

In [47]:
from konfuzio_sdk.regex import merge_regex, regex_matches
# Merge the patterns suggested for the valid addresses into a single regex.
combined = merge_regex(valid)
# Run the merged regex on a sample sentence with overlapped=True and display the matches.
regex_matches(
    "Please contact us via mailto:info@konfuzio.com.",
    combined,
    overlapped=True,
)

[{'regex_used': "'(?:[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+[-][a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+[-][a-zäöüß]\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+_[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+)'",
  'regex_group': '0',
  'value': 'info@konfuzio.com',
  'start_offset': 29,
  'end_offset': 46,
  'start_text': 0},
 {'regex_used': "'(?:[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+[-][a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+[-][a-zäöüß]\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+_[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[

In [48]:
from konfuzio_sdk.regex import merge_regex, regex_matches
# Merge the patterns suggested for the invalid addresses into a single regex.
# NOTE: this rebinds `combined`, which previously held the merged valid patterns.
combined = merge_regex(invalid)
# Run the merged regex on the same sample sentence with overlapped=True and display the matches.
regex_matches(
    "Please contact us via mailto:info@konfuzio.com.",
    combined,
    overlapped=True,
)




[{'regex_used': "'(?:[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\#[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\#[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]|[a-zäöüß]+:[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+[-]\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+)'",
  'regex_group': '0',
  'value': 'mailto:info@konfuzio.com',
  'start_offset': 22,
  'end_offset': 46,
  'start_text': 0},
 {'regex_used': "'(?:[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\#[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\#[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]+|[a-zäöüß]+\\\\.[a-zäöüß]+\\\\@[a-zäöüß]+\\\\.[a-zäöüß]|[a-zäöüß]+:[a-zäöüß]+\\\\@[a-zäöüß]+\\\\