In [None]:
pip install -U pregex==2.0.1

In [3]:
from pregex.core.quantifiers import Optional, OneOrMore, AtLeastAtMost
from pregex.core.classes import AnyFrom, AnyDigit, AnyWhitespace, AnyButWhitespace, Any
from pregex.core.tokens import Backslash
from pregex.core.operators import Either
from pregex.core.pre import Pregex

In [16]:
from pregex.core.classes import AnyButWhitespace
from pregex.core.quantifiers import OneOrMore
from pregex.core.operators import Either

# Sample sentence containing a single https URL.
text = "You can find me through GitHub https://github.com/khuyentran1401"

# Pieces of the URL pattern: a run of non-whitespace characters and the
# accepted top-level domains.
non_whitespace_run = OneOrMore(AnyButWhitespace())
top_level_domain = Either(".com", ".org")

# Scheme, host name, top-level domain, then the remainder of the URL.
pre = "https://" + non_whitespace_run + top_level_domain + non_whitespace_run

## Capture URL

In [5]:
import re

text = "You can find me through my website mathdatasimplified.com/ or GitHub https://github.com/khuyentran1401"

# Raw string: the pattern is full of backslashes (\:, \/, \s, \.) that are
# invalid escape sequences in a normal string literal and trigger a
# DeprecationWarning / SyntaxWarning. The pattern itself is unchanged:
# optional http(s):// scheme, non-whitespace run, ".com" or ".org", then
# another non-whitespace run.
url_pattern = r"(?:https?\:\/\/)?[^\s]+(?:\.com|\.org)[^\s]+"
re.findall(url_pattern, text)


['mathdatasimplified.com/', 'https://github.com/khuyentran1401']

<IPython.core.display.Javascript object>

In [4]:
text = "You can find me through GitHub https://github.com/khuyentran1401"

# Build the pattern from named parts so each piece of the URL is readable.
non_whitespace_run = OneOrMore(AnyButWhitespace())
top_level_domain = Either(".com", ".org")

pre = "https://" + non_whitespace_run + top_level_domain + non_whitespace_run

# Last expression of the cell: display the pattern object.
pre

https:\/\/\S+(?:\.com|\.org)\S+

In [11]:
# Collect every substring of `text` that the pattern matches, then display it.
url_matches = pre.get_matches(text)
url_matches

['https://github.com/khuyentran1401']

<IPython.core.display.Javascript object>

In [5]:
text = "You can find me through GitHub http://github.com/khuyentran1401"

# The "s" is wrapped in Optional, so the scheme may be http:// or https://.
scheme = "http" + Optional("s") + "://"
non_whitespace_run = OneOrMore(AnyButWhitespace())

pre = scheme + non_whitespace_run + Either(".com", ".org") + non_whitespace_run
pre.get_matches(text)

['http://github.com/khuyentran1401']

In [6]:
text = "You can find me through my website mathdatasimplified.com/ or GitHub https://github.com/khuyentran1401"

# The whole scheme is wrapped in Optional, so URLs written without
# http:// or https:// are matched as well.
optional_scheme = Optional("http" + Optional("s") + "://")
non_whitespace_run = OneOrMore(AnyButWhitespace())
top_level_domain = Either(".com", ".org")

pre = optional_scheme + non_whitespace_run + top_level_domain + non_whitespace_run
pre.get_matches(text)

['mathdatasimplified.com/', 'https://github.com/khuyentran1401']

In [14]:
# Evaluate the pattern object as the cell's last expression so the notebook
# displays the regular expression it stands for.
pre

(?:https?\:\/\/)?[^\s]+(?:\.com|\.org)[^\s]+

<IPython.core.display.Javascript object>

## Capture Time

In [8]:
# AnyDigit() on its own stands for one digit, so every digit in the sentence
# is reported as a separate match.
text = "It is 6:00 pm now"
pre = AnyDigit()
pre.get_matches(text)

['6', '0', '0']

In [9]:
# One digit, a literal colon, one digit: only part of the minutes is captured.
single_digit = AnyDigit()
pre = single_digit + ":" + single_digit
pre.get_matches(text)

['6:0']

In [7]:
# One or more digits on each side of the colon, so the full hour and minute
# parts can be captured.
digit_run = OneOrMore(AnyDigit())
pre = digit_run + AnyFrom(":") + digit_run
pre.get_matches(text)

['6:00']

## Capture Phone Numbers

```
###-###-####
(###) ###-####
### ### ####
###.###.####
```

In [10]:
text = "My phone number is 3452352312 or 345-235-2312 or 345 235 2312 or 345.235.2312"

# Three groups of digits, each pair optionally split by a single "-", " " or ".".
digit_group = OneOrMore(AnyDigit())
maybe_separator = Optional(AnyFrom("-", " ", "."))

pre = digit_group + maybe_separator + digit_group + maybe_separator + digit_group
pre.get_matches(text)

['3452352312', '345-235-2312', '345 235 2312', '345.235.2312']

In [11]:
# Evaluate the pattern object as the cell's last expression so the notebook
# displays the regular expression it stands for.
pre

\d+[\--. ]?\d+[\--. ]?\d+

In [13]:
text = "My phone number is 3452352312 or 345-235-2312 or (345) 235-2312 or 345 235 2312 or 345.235.2312"

digit_group = OneOrMore(AnyDigit())
maybe_separator = Optional(AnyFrom("-", " ", "."))

# Optional parentheses around the first group cover the "(345) 235-2312" form.
first_group = Optional("(") + digit_group + Optional(")")

pre = first_group + maybe_separator + digit_group + maybe_separator + digit_group
pre.get_matches(text)

['3452352312',
 '345-235-2312',
 '(345) 235-2312',
 '345 235 2312',
 '345.235.2312']

In [14]:
# Evaluate the pattern object as the cell's last expression so the notebook
# displays the regular expression it stands for.
pre

\(?\d+\)?[\--. ]?\d+[\--. ]?\d+

## Capture Email Address

In [15]:
text = "My email is abcd@gmail.com"

# Both the local part and the domain are runs of non-whitespace characters.
# Any() also matches whitespace, so a greedy OneOrMore(Any()) after the "@"
# could run across several words up to the last ".com"/".org"/".io"/".net"
# in the text; AnyButWhitespace keeps each match inside a single token.
non_whitespace_run = OneOrMore(AnyButWhitespace())

pre = (
    non_whitespace_run
    + "@"
    + non_whitespace_run
    + Either(".com", ".org", ".io", ".net")
)

pre.get_matches(text)

['abcd@gmail.com']