In [1]:
import re

text = """
Check out https://openai.com or http://example.com.
Contact me at test.email@gmail.com or another_email123@domain.org.
Prices are 45, 99.99, and 1000 dollars.
Isn't this a good-tokenizer?
#NLP and @OpenAI are popular hashtags/mentions.
"""

#  URLs (http or https)
urls = re.findall(r"https?://[^\s]+", text)
print("URLs:", urls)
# Output: ['https://openai.com', 'http://example.com']

#  Emails
emails = re.findall(r"\b[\w.-]+@[\w.-]+\.\w+\b", text)
print("Emails:", emails)
# Output: ['test.email@gmail.com', 'another_email123@domain.org']

# Numbers (integers and decimals)
numbers = re.findall(r"\d+(?:\.\d+)?", text)
print("Numbers:", numbers)
# Output: ['45', '99.99', '1000']

# Tokenizing words with hyphens/apostrophes
tokens = re.findall(r"\w[\w'-]*", text)
print("Tokens:", tokens)
# Output: ['Check', 'out', 'https', 'openai', 'com', 'or', 'http', 'example', 'com', 'Contact', ...]

# Hashtags and mentions
hashtags = re.findall(r"#\w+", text)
mentions = re.findall(r"@\w+", text)
print("Hashtags:", hashtags)  # ['#NLP']
print("Mentions:", mentions)  # ['@OpenAI']

#  Removing punctuation (keep words and numbers)
cleaned = re.sub(r"[^\w\s]", "", text)
print("Cleaned text:", cleaned)
# Output: 'Check out httpsopenaicom or httpexamplecom Contact me at testemailgmailcom ...'

#  Non-capturing groups example
prices = re.findall(r"\d+(?:\.\d+)?\s*dollars?", text)
print("Prices:", prices)
# Output: ['45 dollars', '99.99 dollars', '1000 dollars']

# Lookahead (words followed by 'ing')
verbs_ing = re.findall(r"\b\w+(?=ing\b)", "I am running and jumping")
print("Words before 'ing':", verbs_ing)
# Output: ['runn', 'jump']

# ummary of tricks:
# - https?   → matches http or https
# - (?:...)  → non-capturing group
# - \d+      → digits
# - \w[\w'-]* → word tokenizer with hyphens/apostrophes
# - [^\w\s]  → remove punctuation
# - \b       → word boundary
# - Lookahead/lookbehind → context-specific matching


URLs: ['https://openai.com', 'http://example.com.']
Emails: ['test.email@gmail.com', 'another_email123@domain.org']
Numbers: ['123', '45', '99.99', '1000']
Tokens: ['Check', 'out', 'https', 'openai', 'com', 'or', 'http', 'example', 'com', 'Contact', 'me', 'at', 'test', 'email', 'gmail', 'com', 'or', 'another_email123', 'domain', 'org', 'Prices', 'are', '45', '99', '99', 'and', '1000', 'dollars', "Isn't", 'this', 'a', 'good-tokenizer', 'NLP', 'and', 'OpenAI', 'are', 'popular', 'hashtags', 'mentions']
Hashtags: ['#NLP']
Mentions: ['@gmail', '@domain', '@OpenAI']
Cleaned text: 
Check out httpsopenaicom or httpexamplecom
Contact me at testemailgmailcom or another_email123domainorg
Prices are 45 9999 and 1000 dollars
Isnt this a goodtokenizer
NLP and OpenAI are popular hashtagsmentions

Prices: ['1000 dollars']
Words before 'ing': ['runn', 'jump']
