In [5]:
import re
from camel_tools.tokenizers.word import simple_word_tokenize

# def clean_corpus_line(line):
#     # Skip metadata and markup
#     if line.startswith("#META#") or line.startswith("# Page") or line.startswith("<"):
#         return ""
#     # Remove tildes and HTML tags
#     line = re.sub(r'[~<>/"]', ' ', line)
#     return line.strip()
def clean_line(line):
    """Strip corpus markup from one raw line and return the remaining text.

    Args:
        line: A single raw line of the corpus file (may include the trailing
            newline).

    Returns:
        The cleaned text with surrounding whitespace removed, or "" when the
        line carries no text: lines starting with "#META#", "# Page" or
        "######", lines made only of hashes, lines made only of dots and
        whitespace, and lines that contain nothing but tags.
    """
    # Remove leading/trailing whitespace (including the newline).
    line = line.strip()

    # Skip metadata, page markers and "######" lines.
    if line.startswith(("#META#", "# Page", "######")):
        return ""

    # Skip lines that are just hashes or just dots/spaces.
    if re.match(r'^#+$', line):  # only hashes
        return ""
    if re.match(r'^[\.\s]+$', line):  # only dots/spaces
        return ""

    # Replace every <...> tag with a space. A line that consists purely of
    # tags collapses to "" after the final strip, so no separate
    # "starts with < and ends with >" guard is needed -- such a guard would
    # also throw away lines whose text is wrapped in tags, e.g.
    # "<span>text</span>".
    line = re.sub(r'<.*?>', ' ', line)

    # Remove "~~" continuation markers.
    line = line.replace("~~", " ")

    # Remove stray '#'.
    line = line.replace("#", " ")

    return line.strip()


def normalize_arabic(text):
    """Return ``text`` with common Arabic spelling variation flattened.

    The substitutions are applied in this order:
      1. delete marks in U+0617-U+061A and U+064B-U+0652 (tashkeel),
      2. delete the tatweel (elongation) character,
      3. map the alif variants to bare alif,
      4. map alif maqsura to ya,
      5. map taa marbuta to ha.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    rules = (
        (r'[\u0617-\u061A\u064B-\u0652]', ''),  # tashkeel
        (r'ـ', ''),                              # tatweel
        (r'[إأآا]', 'ا'),                        # alif
        (r'ى', 'ي'),                             # ya
        (r'ة', 'ه'),                             # taa marbuta
    )
    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)
    return text

def tokenize_arabic(text):
    """Normalize ``text`` with ``normalize_arabic`` and tokenize the result.

    Returns whatever ``simple_word_tokenize`` produces for the normalized text.
    """
    return simple_word_tokenize(normalize_arabic(text))

# Example usage: normalize and tokenize one sample sentence, then print the tokens.
raw_line = "الحمد لله الذي هدانا لهذا وما كنا لنهتدي لولا أن هدانا الله"
tokens = tokenize_arabic(raw_line)
print(tokens)


['الحمد', 'لله', 'الذي', 'هدانا', 'لهذا', 'وما', 'كنا', 'لنهتدي', 'لولا', 'ان', 'هدانا', 'الله']


In [None]:
# Read the corpus file line by line and keep the token list of every line
# that still has content after cleaning.
file_path = "../fath_muin/0987ZaynDinMalibari.FathMucin.Shamela0011327-ara1.txt"
corpus = []
with open(file_path, "r", encoding="utf-8") as f:
    for raw in f:
        cleaned = clean_line(raw)
        if not cleaned:
            # Metadata, markers and markup-only lines clean down to "".
            continue
        # tokenize_arabic normalizes the text and then tokenizes it.
        tokens = tokenize_arabic(cleaned)
        if tokens:
            corpus.append(tokens)

# Preview the first few tokenized lines.
print("First 5 tokenized lines:")
for tokenized_line in corpus[:5]:
    print(tokenized_line)

KeyboardInterrupt: 