In [None]:
pip install -U pregex==2.0.1

In [2]:
from pregex.core.quantifiers import Optional
from pregex.core.classes import AnyFrom, AnyDigit

In [40]:
from pregex.core.classes import AnyButWhitespace
from pregex.core.quantifiers import OneOrMore
from pregex.core.operators import Either

# Sample sentence that contains a single https URL.
text = "You can find me through GitHub https://github.com/khuyentran1401"

# The literal scheme followed by a run of one or more non-whitespace characters.
non_whitespace_run = OneOrMore(AnyButWhitespace())
pre = "https://" + non_whitespace_run

# Last expression of the cell: the list of matches found in `text`.
pre.get_matches(text)

['https://github.com/khuyentran1401']

## Capture URL

In [41]:
import re

text = "You can find me through my website mathdatasimplified.com/ or GitHub https://github.com/khuyentran1401"

# Raw string so the regex escapes (\S, \.) reach `re` untouched; written in a
# normal string literal they are invalid Python escape sequences and raise a
# DeprecationWarning (SyntaxWarning on newer Python versions).
# Pattern: an optional "http://" or "https://", a run of non-whitespace
# characters, a ".com" or ".org" suffix, then at least one more non-whitespace
# character.
url_pattern = r"(?:https?://)?\S+(?:\.com|\.org)\S+"
re.findall(url_pattern, text)


['mathdatasimplified.com/', 'https://github.com/khuyentran1401']

In [42]:
text = "You can find me through GitHub https://github.com/khuyentran1401"

# Build the pattern from two named pieces: the fixed scheme and whatever
# non-whitespace characters follow it.
url_scheme = "https://"
url_remainder = OneOrMore(AnyButWhitespace())
pre = url_scheme + url_remainder

# Display the pattern object itself to inspect the regex it generates.
pre

https:\/\/\S+

In [43]:
# Apply the pattern from the previous cell to `text` and list the matches.
pre.get_matches(text)

['https://github.com/khuyentran1401']

In [44]:
# This time the sample URL uses plain http.
text = "You can find me through GitHub http://github.com/khuyentran1401"

# "http", an optional "s", then "://" -- covers both http and https.
scheme = "http" + Optional("s") + "://"
pre = scheme + OneOrMore(AnyButWhitespace())

pre.get_matches(text)

['http://github.com/khuyentran1401']

In [45]:
# One URL without a scheme and one with https.
text = "You can find me through my website mathdatasimplified.com/ or GitHub https://github.com/khuyentran1401"

# Named building blocks for the URL pattern.
at_least_one_character_except_white_space = OneOrMore(AnyButWhitespace())
optional_scheme = Optional("http" + Optional("s") + "://")
domain_suffix = Either(".com", ".org")

# optional scheme, non-whitespace run, ".com"/".org", non-whitespace run
pre = (
    optional_scheme
    + at_least_one_character_except_white_space
    + domain_suffix
    + at_least_one_character_except_white_space
)

pre.get_matches(text)

['mathdatasimplified.com/', 'https://github.com/khuyentran1401']

In [46]:
# Retrieve the regex string generated for the pattern built in the previous cell.
pre.get_pattern()

'(?:https?:\\/\\/)?\\S+(?:\\.com|\\.org)\\S+'

## Capture Time

In [47]:
# Sentence containing a time; reused by the following cells.
text = "It is 6:00 pm now"

# A lone digit pattern: every digit in the sentence is reported separately.
pre = AnyDigit()

pre.get_matches(text)

['6', '0', '0']

In [48]:
# Exactly one digit on each side of the colon, so the match stops after the
# first digit that follows ":".
digit = AnyDigit()
pre = digit + ":" + digit

pre.get_matches(text)

['6:0']

In [49]:
# One or more digits on both sides of the colon captures the full time.
digits = OneOrMore(AnyDigit())
colon = AnyFrom(":")
pre = digits + colon + digits

pre.get_matches(text)

['6:00']

## Capture Phone Numbers

```
###-###-####
(###) ###-####
### ### ####
###.###.####
```

In [50]:
# The same number written with no separator, dashes, spaces and dots.
text = "My phone number is 3452352312 or 345-235-2312 or 345 235 2312 or 345.235.2312"

# Building blocks: a run of digits and an optional single separator character.
at_least_one_digit = OneOrMore(AnyDigit())
punctuation = AnyFrom("-", " ", ".")
optional_punctuation = Optional(punctuation)

# Assemble left to right: digits, separator?, digits, separator?, digits.
first_group = at_least_one_digit + optional_punctuation
first_two_groups = first_group + at_least_one_digit + optional_punctuation
pre = first_two_groups + at_least_one_digit

pre.get_matches(text)

['3452352312', '345-235-2312', '345 235 2312', '345.235.2312']

In [51]:
# Display the pattern object to inspect the regex generated in the previous cell.
pre

\d+[\--. ]?\d+[\--. ]?\d+

In [52]:
# Adds the "(345) 235-2312" form to the sample sentence.
text = "My phone number is 3452352312 or 345-235-2312 or (345) 235-2312 or 345 235 2312 or 345.235.2312"

# Building blocks; the separator is now expressed as an alternation.
at_least_one_digit = OneOrMore(AnyDigit())
punctuation = Either("-", " ", ".")
optional_punctuation = Optional(punctuation)

# First digit group, optionally wrapped in parentheses.
area_code = Optional("(") + at_least_one_digit + Optional(")")

# area code, separator?, digits, separator?, digits
pre = (
    area_code
    + optional_punctuation
    + at_least_one_digit
    + optional_punctuation
    + at_least_one_digit
)

pre.get_matches(text)

['3452352312',
 '345-235-2312',
 '(345) 235-2312',
 '345 235 2312',
 '345.235.2312']

In [53]:
# Display the pattern object to inspect the regex generated in the previous cell.
pre

\(?\d+\)?(?:-| |\.)?\d+(?:-| |\.)?\d+

## Capture Email Address

In [54]:
# Sample sentence containing one email address.
text = "My email is abcd@gmail.com"

# A non-whitespace run on each side of the "@" sign.
non_whitespace_run = OneOrMore(AnyButWhitespace())
pre = non_whitespace_run + "@" + non_whitespace_run

pre.get_matches(text)

['abcd@gmail.com']