In [1]:
import re
from typing import List


In [2]:
# 1) Dotted abbreviations like U.S.A., U.K., Ph.D.
ABBREV_DOTTED = r"(?:[A-Za-z]\.){2,}[A-Za-z]?"  # e.g., U.S.A. or U.S.A

# 2) All-caps acronyms (2+ letters), keep as one token (USA, NATO)
ACRONYM_ALLCAPS = r"(?:[A-Z]{2,})"

# 3) Internal hyphenations (ice-cream, mother-in-law)
HYPHENATED = r"(?:[A-Za-z]+(?:-[A-Za-z]+)+)"

# 4) Words (letters only), we’ll split contractions in post-processing
WORD = r"(?:[A-Za-z]+)"

# 5) Numbers (with optional commas/decimals) – kept as one token
NUMBER = r"(?:\d{1,3}(?:,\d{3})+(?:\.\d+)?|\d+\.\d+|\d+)"

# 6) Any single punctuation/special symbol (one char)
PUNCT_OR_SYMBOL = r"(?:[^\w\s])"  # anything not a letter/digit/_ or whitespace

# Master pattern: try the most specific first
TOKEN_PATTERN = re.compile(
    "|".join([
        ABBREV_DOTTED,
        ACRONYM_ALLCAPS,
        HYPHENATED,
        NUMBER,
        WORD,
        PUNCT_OR_SYMBOL
    ])
)


In [3]:
# Common apostrophe contractions to split if present: it's -> it + 's (we'll output: it, s)
APOSTROPHE_CONTRACTIONS = [
    r"(.*?)(n)['’]?t$",   # don't -> do + n't  (we'll output as do, nt)
    r"(.*?)[‘’']re$",     # you're -> you + re
    r"(.*?)[‘’']ve$",     # they've -> they + ve
    r"(.*?)[‘’']ll$",     # I'll -> I + ll
    r"(.*?)[‘’']d$",      # I'd -> I + d  (would/had)
    r"(.*?)[‘’']m$",      # I'm -> I + m
    r"(.*?)[‘’']s$",      # it's -> it + s  (is/has or possessive; we keep it uniform)
]
APOSTROPHE_CONTRACTIONS = [re.compile(p, re.IGNORECASE) for p in APOSTROPHE_CONTRACTIONS]

# No-apostrophe nt-case (e.g., isnt -> is + nt, dont -> do + nt)
# Restrict to typical bases to avoid false splits like "paint"
NT_BASES = {
    "is","are","am","do","does","did","have","has","had",
    "can","could","will","would","should","shall","must",
    "might","may","was","were","ain","won","don"
}


In [4]:
def split_contraction(token: str) -> List[str]:
    """
    Split contractions into two tokens.
    - Apostrophe forms: you're -> you + re; isn't -> is + nt
    - No-apostrophe 'nt' forms: isnt -> is + nt (for known bases)
    Returns [token] unchanged if no split applies.
    """
    t = token

    # 4a) Apostrophe-based splits
    for pat in APOSTROPHE_CONTRACTIONS:
        m = pat.match(t)
        if m:
            root = m.group(1)
            # Special case for n't: pattern captures n separately
            if len(m.groups()) >= 2 and m.group(2) == 'n':
                return [root, "nt"]
            # For other endings, take trailing part from the regex pattern itself
            suffix = pat.pattern.split("]")[-1]  # crude, we'll compute directly instead
            # Better: derive suffix by removing the root
            # Redo simply: check explicit endings
            lower = t.lower()
            for end in ("'re","’re","re","'ve","’ve","ve","'ll","’ll","ll","'d","’d","d","'m","’m","m","'s","’s","s"):
                if lower.endswith(end):
                    return [root, end.replace("'", "").replace("’","")]
            # Fallback: no split
            return [token]

    # 4b) No-apostrophe "nt" (isnt, dont, cant)
    lower = t.lower()
    if lower.endswith("nt"):
        stem = lower[:-2]
        if stem in NT_BASES:
            # Respect original casing for the stem if possible
            return [t[:-2], "nt"]

    return [token]


In [5]:
def tokenize(text: str) -> List[str]:
    """
    1) Use regex to extract preliminary tokens.
    2) Post-process to split contractions.
    3) Keep punctuation/symbols as standalone tokens.
    4) Keep dotted abbreviations, acronyms, and hyphenations as single tokens.
    """
    raw_tokens = [m.group(0) for m in TOKEN_PATTERN.finditer(text)]

    tokens: List[str] = []
    for tok in raw_tokens:
        # If it's a pure word, attempt contraction splitting.
        # Words = letters only (matches our WORD). Others we leave alone.
        if re.fullmatch(WORD, tok):
            tokens.extend(split_contraction(tok))
        else:
            tokens.append(tok)

    return tokens


In [6]:
if __name__ == "__main__":
    sample = (
        "He said: U.S.A. and USA are both abbreviations. "
        "I love ice-cream! But he doesnt; she isn't; you’re; I'd; it’s fine. "
        "Numbers like 1,234.56 and 42 are tokens. "
        "Contraction without apostrophe: isnt, dont, cant."
    )
    print(tokenize(sample))


['He', 'said', ':', 'U.S.A.', 'and', 'USA', 'are', 'both', 'abbreviations', '.', 'I', 'love', 'ice-cream', '!', 'But', 'he', 'does', 'nt', ';', 'she', 'isn', "'", 't', ';', 'you', '’', 're', ';', 'I', "'", 'd', ';', 'it', '’', 's', 'fine', '.', 'Numbers', 'like', '1,234.56', 'and', '42', 'are', 'tokens', '.', 'Contraction', 'without', 'apostrophe', ':', 'is', 'nt', ',', 'do', 'nt', ',', 'ca', 'nt', '.']
