In [1]:
from pregex.classes import AnyButWhitespace
from pregex.quantifiers import OneOrMore
from pregex.operators import Either

# Example 1 : Capture URLs

In [2]:
# Sample sentence for Examples 1 and 2; it contains a single https URL.
text = "You can find me through GitHub https://github.com/khuyentran1401"

In [3]:
# URL pattern assembled from named pieces, concatenated left to right:
#   literal "https://"  ->  non-whitespace run  ->  ".com" or ".org"
#   ->  non-whitespace run.
# The last piece is OneOrMore, so at least one non-whitespace character
# must follow the ".com"/".org" suffix for the pattern to match.
host_part = OneOrMore(AnyButWhitespace())
tld_part = Either(".com", ".org")
path_part = OneOrMore(AnyButWhitespace())

pre = "https://" + host_part + tld_part + path_part

In [4]:
# Display the pattern object; the notebook renders the regex it compiles to.
pre

https:\/\/\S+(?:\.com|\.org)\S+

In [5]:
# Collect every substring of `text` that the pattern matches.
pre.get_matches(text)

['https://github.com/khuyentran1401']

# Example 2 : HTTP or HTTPS

In [6]:
from pregex.quantifiers import Optional

In [7]:
# Same URL shape as Example 1, except that Optional("s") lets the scheme
# be either "http" or "https".
scheme = "http" + Optional("s") + "://"
host_part = OneOrMore(AnyButWhitespace())
tld_part = Either(".com", ".org")
path_part = OneOrMore(AnyButWhitespace())

pre2 = scheme + host_part + tld_part + path_part

In [8]:
# Show the generated regex; the optional "s" appears as `s?`.
pre2

https?:\/\/\S+(?:\.com|\.org)\S+

In [9]:
# Run the http/https pattern against the Example 1 sentence (`text`).
pre2.get_matches(text)

['https://github.com/khuyentran1401']

# Example 3 : Match URL without a Scheme (i.e. https://)

In [10]:
# Sample sentence with two URLs: one without a scheme and one with https://.
text3 = "You can find me through my website mathdatasimplified.com/ or GitHub https://github.com/khuyentran1401"

In [11]:
# Wrapping the whole scheme in Optional makes "http://" / "https://" itself
# optional, so URLs written without a scheme can match as well.
optional_scheme = Optional("http" + Optional("s") + "://")
host_part = OneOrMore(AnyButWhitespace())
tld_part = Either(".com", ".org")
path_part = OneOrMore(AnyButWhitespace())

pre3 = optional_scheme + host_part + tld_part + path_part

In [12]:
# Show the generated regex; the scheme is now inside an optional group.
pre3

(?:https?:\/\/)?\S+(?:\.com|\.org)\S+

In [13]:
# Both the scheme-less URL and the https URL in `text3` should be returned.
pre3.get_matches(text3)

['mathdatasimplified.com/', 'https://github.com/khuyentran1401']

# Example 4 : Capture Time

In [14]:
from pregex.classes import AnyDigit

In [15]:
# Sample sentence containing a clock time written as hour:minutes.
text4 = "It is 6:00 pm now"

In [16]:
pre4 = AnyDigit() + ":" + OneOrMore(AnyDigit())

In [17]:
pre4

\d:\d+

In [18]:
# Extract the time substring(s) from `text4`.
pre4.get_matches(text4)

['6:00']

# Example 5 : Capture Phone Numbers

In [19]:
from pregex.classes import AnyFrom

In [20]:
# The same phone number written four ways: no separator, dashes, spaces, dots.
text5 = "My phone number is 3452352312 or 345-235-2312 or 345 235 2312 or 345.235.2312"

In [21]:
# Any one of the separators that may appear between digit groups.
punctuation = AnyFrom("-", " ", ".")

# Phone number pattern: a digit group followed by two more
# "(optional separator, digit group)" segments. Building it in a loop yields
# the same left-to-right concatenation as spelling the five pieces out.
pre5 = OneOrMore(AnyDigit())
for _segment in range(2):
    pre5 = pre5 + Optional(punctuation) + OneOrMore(AnyDigit())

In [22]:
# Show the generated regex; the separator becomes an optional character class.
pre5

\d+[\- .]?\d+[\- .]?\d+

In [23]:
# All four spellings of the number in `text5` should be returned.
pre5.get_matches(text5)

['3452352312', '345-235-2312', '345 235 2312', '345.235.2312']

# Example 6 : Capture an Email Address

In [24]:
# Sample sentence containing one email address.
text6 = "My email is abcd@gmail.com"

In [25]:
# Email pattern: non-whitespace local part, "@", non-whitespace domain name,
# then one of a fixed set of top-level-domain suffixes.
local_part = OneOrMore(AnyButWhitespace())
domain_name = OneOrMore(AnyButWhitespace())
tld_part = Either(".com", ".org", ".io", ".net")

pre6 = local_part + "@" + domain_name + tld_part

In [26]:
# Show the generated regex for the email pattern.
pre6

\S+@\S+(?:\.com|\.org|\.io|\.net)

In [27]:
# Extract the email address from `text6`.
pre6.get_matches(text6)

['abcd@gmail.com']