In [1]:
import regex

In [3]:
# Tokenizer pattern, compiled once at import time so every call reuses it.
# Alternatives are tried left to right at each position; adjacent raw-string
# literals are concatenated by Python into a single pattern.
_TOKEN_PATTERN = regex.compile(
    r'[0-9\.]+'        # run of ASCII digits and/or periods
    r'|\p{Han}'        # exactly one Han character per token
    r'|\p{Hiragana}+'  # run of Hiragana characters
    r'|\p{Katakana}+'  # run of Katakana characters
    r'|\p{Cyrillic}+'  # run of Cyrillic letters
    r'|\p{Latin}+'     # run of Latin letters
)

def count_word_multilingual(text: str):
    """Split *text* into rough tokens and count them.

    Tokenization is delegated to the module-level ``_TOKEN_PATTERN``; any
    character the pattern does not match is skipped.

    Returns:
        A ``(tokens, count)`` tuple: the list of matched substrings in order
        of appearance, and the length of that list.
    """
    # The pattern has no capture groups, so each match's full text is the token.
    matched = [m.group() for m in _TOKEN_PATTERN.finditer(text)]
    return matched, len(matched)


In [2]:
def count_word_multilingual(text: str) -> tuple[list[str], int]:
    """
    Roughly tokenize the input text and count the resulting tokens.

    Supports mixed-language text including English, Chinese (and Kanji used in
    Japanese), Spanish, Portuguese, French, German, Russian, Norwegian, and
    Japanese.

    Rules (alternatives are tried in this order at each position):
    1) [0-9\\.]+ : Matches consecutive digits (including decimal points) as a single token.
    2) \\p{Han} : Matches a single Chinese character (including Kanji used in Japanese).
    3) \\p{Hiragana}+ : Matches consecutive Hiragana characters as a single token.
    4) \\p{Katakana}+ : Matches consecutive Katakana characters as a single token.
    5) \\p{Cyrillic}+ : Matches consecutive Cyrillic letters (Russian).
    6) \\p{Latin}+ : Matches consecutive Latin letters (including diacritics),
                     supporting English, Spanish, Portuguese, French, German, Norwegian, etc.

    Args:
        text: The text to tokenize.

    Returns:
        A ``(tokens, count)`` tuple, where ``tokens`` is the list of matched
        substrings in order of appearance and ``count`` is ``len(tokens)``.

    Notes:
    - Each Chinese character (\\p{Han}) is treated as an individual token.
      For example, "你好" => ["你", "好"].
    - Consecutive characters from other scripts (e.g., "hello") are treated as a single token.
    - Characters matched by none of the rules (whitespace, most punctuation,
      other scripts) are skipped and never counted.
    - Rule 1 accepts periods as well as digits, so a run consisting only of
      periods (e.g. "...") is also returned as a token.
    - Strings mixing letters and digits are split at every script change,
      e.g. "T4N0M1b" => ["T", "4", "N", "0", "M", "1", "b"].
    - This is a simplified example and does not handle other symbols, punctuation,
      or complex numerical formats.
    - Requires the third-party module `regex` (pip install regex),
      because the built-in `re` module has incomplete support for Unicode properties \\p{...}.
    """
    pattern = (
        r'[0-9\.]+'        # Consecutive digits and decimal points
        r'|\p{Han}'        # Single Chinese character
        r'|\p{Hiragana}+'  # Consecutive Hiragana characters
        r'|\p{Katakana}+'  # Consecutive Katakana characters
        r'|\p{Cyrillic}+'  # Consecutive Cyrillic letters
        r'|\p{Latin}+'     # Consecutive Latin letters (including diacritics)
    )
    # The pattern has no capture groups, so findall returns the matched substrings.
    tokens = regex.findall(pattern, text)
    return tokens, len(tokens)

In [4]:
# Multilingual sample text used to exercise count_word_multilingual.
# Sections are introduced by Chinese labels naming the language of the lines
# that follow: 英语 = English, 中文 = Chinese, 西语 = Spanish, 日语 = Japanese,
# 法语 = French, 挪威语 = Norwegian, 德语 = German, 葡萄牙语 = Portuguese,
# 俄语 = Russian.
# NOTE: the labels are part of the string itself, so they are tokenized along
# with the sample lines.
# NOTE: this is not a raw string, so every "\n" written inside it becomes a
# real newline character at runtime.
# NOTE(review): the first Cyrillic line appears under the 中文 label rather
# than under 俄语 — confirm whether that placement is intended.
test_text = """
英语
medication: amiodarone, dosage: not mentioned, mode: not mentioned, frequency: not mentioned, duration: not mentioned, reason: not mentioned, list/narrative: list;\nmedication: atrovent, dosage: not mentioned, mode: not mentioned, frequency: not mentioned, duration: not mentioned, reason: not mentioned, list/narrative: list;
中文
Normalized terms: 糖尿病性低血糖症, 骨质疏松, 高血压, 冠状动脉粥样硬化性心脏病, 不稳定性心绞痛
entity: кашля, type: Finding;\nentity: сыпь, type: Adverse Drug Reaction;\nentity: сироп, type: Drugform;
西语
entity: metastásica, type: tumor morphology;\nentity: metástasis, type: tumor morphology;\nentity: metástasis, type: tumor morphology;\nentity: neoplasia, type: tumor morphology;\nentity: neoplasia, type: tumor morphology;\nentity: tumor, type: tumor morphology;\nentity: tumoración, type: tumor morphology;\nentity: tumoral, type: tumor morphology;\nentity: tumores, type: tumor morphology;\nentity: Neoplasia pulmonar T4N0M1b, type: tumor morphology;\nentity: Carcinoma indiferenciado, type: tumor morphology;
日语
entity: 時, type: time;
entity: 身長, type: item;
entity: 164.8, type: value;
entity: である, type: PN;
entity: 疼痛, type: state;
entity: あり, type: PN;
entity: 心窩部, type: body;
法语
evidence of genre: patient;\nevidence of origine: une anémie sévère associée à des moelenas évoluant depuis 5 mois dans un contexte de conservation de l'état général, sans fièvre;\nevidence of issue: décédé;
挪威语
entity_1: sønn, entity_2: Hun, relation: Related_to;\nentity_1: gjenlevende, entity_2: sønn, relation: Holder;\nentity_1: 30, entity_2: sønn, relation: Holder;
德语
entity: Hypervolaemie, type: diagnosis, status: none;\nentity: Wassereinlagerungen, type: diagnosis, status: none;\nentity: Chronische Niereninsuffizienz, type: diagnosis, status: none;\nentity: Chemosaturation der Leber, type: treatment, status: none;\nentity: ZVK/Shaldon-Implantation, type: treatment, status: none;
葡萄牙语
entity: antiepilépticos, type: Therapeutics;\nentity: oxigénio em alto débito, type: Therapeutics;\nentity: câmara hiperbárica, type: Therapeutics;\nentity: melhoria gradual, type: Evolution;\nentity: encefalopatia, type: Condition;\nentity: controlo, type: Evolution;\nentity: crises epilépticas, type: Condition;\nentity: A, type: Test;\nentity: 15 horas depois, type: DateTime;\nentity: restrição a a difusão compatível, type: Results;\n
俄语
entity: вирус герпеса, type: Drug Interaction;\nentity: вирус папилломы человека, type: Drug Interaction;\nentity: лишай, type: Drug Interaction;\nentity: ослабленный иммунитет, type: Drug Interaction;
"""

# Tokenize the sample text; keep the token list for inspection and report
# only the total.
tokens, count = count_word_multilingual(text=test_text)
print("Token count：", count)

Token count： 376
