Load the documents from disk for three languages: Russian, French, and Thai.

In [None]:
import os
from dataclasses import dataclass
from pathlib import Path

@dataclass
class Document:
    """A text document plus the reason it was filtered out, if it was."""
    # Index given by `load_documents` (position in sorted file order); `save_documents` uses it as the file name.
    id: int
    # Full text of the document as read from disk.
    text: str
    # Name of the filter that rejected the document; stays None unless `filter_docs` removes it.
    remove_reason: str | None = None

def load_documents(path: Path) -> list[Document]:
    """
        Loads every `*.txt` file located directly inside `path` as a Document.
        Files are visited in sorted order and each document's id is its position in that order.
    :param path: directory that contains the `.txt` files
    :return: the loaded documents (an empty list when no `.txt` file is found)
    """
    docs = []
    # The loop variable gets its own name so it no longer shadows the `path` argument.
    for i, file_path in enumerate(sorted(path.glob("*.txt"))):
        # Read as UTF-8 explicitly: the platform default encoding is locale dependent
        # and would garble or reject Cyrillic / Thai text on non-UTF-8 systems.
        docs.append(Document(i, file_path.read_text(encoding="utf-8")))
    return docs

def save_documents(docs: list[Document], path: Path):
    """
        Writes the text of each document to `path`, one file per document, named after the
        document id zero-padded to three digits (e.g. `007.txt`).
        The directory is created if missing. Files already present in `path` are not deleted;
        a file with the same name is overwritten.
    :param docs: documents to write
    :param path: target directory
    :return: None
    """
    os.makedirs(path, exist_ok=True)
    for doc in docs:
        # Write as UTF-8 explicitly so the output does not depend on the platform's
        # default encoding (which may be unable to encode Cyrillic / Thai text).
        with open(path / f"{doc.id:03d}.txt", "w", encoding="utf-8") as f:
            f.write(doc.text)

In [None]:
# Languages processed by this notebook. Each code is used as the sub-directory name
# under `ml-documents/` and as the file name of its config in `filter-configs/`.
LANGS = [
    "rus_Cyrl",
    "fra_Latn",
    "tha_Thai",
]

In [None]:
# Read every language's documents from its own sub-directory of `ml-documents/`.
docs_per_lang = {}
for lang in LANGS:
    docs_per_lang[lang] = load_documents(Path(f"ml-documents/{lang}"))

# Report how many documents were found per language.
for lang in docs_per_lang:
    docs = docs_per_lang[lang]
    print(f"Loaded {len(docs)} documents for {lang}")

Define the filtering functions: the terminal punctuation marks, the word splitter, and the heuristic document filter.

In [None]:
# Characters treated as sentence-ending punctuation. Besides ASCII ".", "?" and "!",
# the tuple lists terminators from many other scripts; some entries are written as
# \u / \U escapes instead of literal characters.
# `filter_docs` passes this tuple to `str.endswith`, which accepts a tuple of suffixes,
# to measure how many lines of a document end with terminal punctuation.
TERMINAL_PUNCTUATION = (
    "᪩",
    "？",
    "⁈",
    "𑩂",
    "．",
    "꩞",
    "𑅃",
    "﹗",
    "𑂾",
    "\u1b7d",
    "፧",
    "𑅂",
    "꡶",
    "꘎",
    "⁉",
    "࠾",
    "᪨",
    "𑊩",
    "𑱂",
    "᱿",
    "𖩮",
    "᥅",
    "\U00011f43",
    "\U00011f44",
    "﹒",
    "𑈹",
    "𑈸",
    "።",
    "܂",
    "؞",
    "꛳",
    "\U00010f88",
    "𑗍",
    "𐩖",
    "𑙂",
    "\u061d",
    "꩟",
    "᠉",
    "\u1b7e",
    "𑗗",
    "᰼",
    "𑻸",
    "؟",
    "𑪜",
    "꧉",
    "𑗉",
    "𐽙",
    "𖫵",
    "𖬷",
    "܀",
    "꓿",
    "᜵",
    "𑗏",
    "𑁇",
    "𑗓",
    "𑥄",
    "៖",
    "𑥆",
    "𑗑",
    "𑗒",
    "꯫",
    "۔",
    "𐩗",
    "\U00010f86",
    "꡷",
    "\u2e54",
    "｡",
    "៕",
    "߹",
    "⸮",
    ".",
    "𑇅",
    "࠹",
    "𛲟",
    "꫰",
    "꤯",
    "𐽗",
    "᭞",
    "𑜼",
    "፨",
    "𑃁",
    "꣏",
    "𑇟",
    "𖬸",
    "𑪛",
    "𑜾",
    "࠷",
    "𝪈",
    "?",
    "𑃀",
    "𑗃",
    "！",
    "։",
    "꣎",
    "॥",
    "𑗖",
    "᭛",
    "᠃",
    "!",
    "၊",
    "𖺘",
    "⁇",
    "𑗌",
    "𑑋",
    "𖭄",
    "᭟",
    "𑅁",
    "𑙁",
    "⸼",
    "꩝",
    "𑗋",
    "。",
    "꧈",
    "꫱",
    "𑜽",
    "𐽖",
    "𑂿",
    "᙮",
    "។",
    "꛷",
    "\U00010f89",
    "៚",
    "᥄",
    "𑗕",
    "𑗎",
    "᪪",
    "᭚",
    "࠽",
    "𑇞",
    "𑗊",
    "𐽘",
    "\u2e53",
    "𑗔",
    "𖩯",
    "𑇍",
    "𑻷",
    "𐽕",
    "𑩃",
    "।",
    "𑗂",
    "𑇆",
    "𑁈",
    "။",
    "᱾",
    "𑱁",
    "꘏",
    "܁",
    "᜶",
    "‼",
    "𑈻",
    "‽",
    "᪫",
    "﹖",
    "𑑌",
    "𑈼",
    "\U00010f87",
    "𑗐",
    "៙",
    "᰻",
)

In [None]:
!pip install pythainlp pyyaml

In [None]:
import numpy as np

def separate_words(lang: str, text: str) -> list[str]:
    """
        Splits `text`, written in language `lang`, into a list of words.
        Thai (`tha_Thai`) is segmented with pythainlp's `newmm-safe` engine;
        every other language is split on whitespace.
    :param lang: language code of the text, e.g. `tha_Thai`
    :param text: the string to split
    :return: the words in order of appearance, without empty or whitespace-only entries
    """
    if lang != "tha_Thai":
        return text.split()

    # Imported lazily so pythainlp is only required when Thai text is processed.
    from pythainlp.tokenize import word_tokenize as th_word_tokenize

    raw_tokens = th_word_tokenize(text, keep_whitespace=False, engine="newmm-safe")
    stripped_tokens = (token.strip() for token in raw_tokens)
    # Drop tokens that are empty once surrounding whitespace is removed.
    return [token for token in stripped_tokens if token]


def _removal_reason(lang: str, doc: Document, filter_config: dict, stopwords: frozenset) -> str | None:
    """
        Applies the heuristic filters to one document, in a fixed order.
    :param lang: language code of the document, forwarded to `separate_words`
    :param doc: the document to check
    :param filter_config: thresholds (`min_avg_word_length`, `max_avg_word_length`, `line_punct_thr`, `new_line_ratio`)
    :param stopwords: the stopwords of `lang`
    :return: name of the first filter that rejects the document, or None if it passes all of them
    """
    lines = [line for line in doc.text.split("\n") if line.strip() != ""]
    words = separate_words(lang, doc.text)

    # Nothing to measure. `words` is checked as well so that the mean and the
    # new-line ratio below can never be computed over zero words.
    if len(lines) == 0 or len(words) == 0:
        return "empty"

    # Average number of characters per word.
    avg_word_length = np.mean([len(w) for w in words])
    if avg_word_length <= filter_config["min_avg_word_length"]:
        return "min_avg_word_length"
    if avg_word_length >= filter_config["max_avg_word_length"]:
        return "max_avg_word_length"

    # Require at least two stopword occurrences (repetitions count).
    if sum(w in stopwords for w in words) < 2:
        return "stopwords"

    # Share of non-blank lines that end with a terminal punctuation mark.
    # NOTE(review): lines are not right-stripped first, so a line with trailing
    # whitespace after its punctuation is not counted — confirm this is intended.
    ratio = sum(1 for line in lines if line.endswith(TERMINAL_PUNCTUATION)) / len(lines)
    if ratio <= filter_config["line_punct_thr"]:
        return "line_punct_thr"

    # Too many line breaks relative to the number of words.
    new_line = doc.text.count("\n")
    if new_line / len(words) >= filter_config["new_line_ratio"]:
        return "new_line_ratio"

    return None


def filter_docs(lang: str, docs: list[Document], filter_config: dict) -> tuple[list[Document], list[Document]]:
    """
        Filters `docs` (that are in `lang` language) using some heuristic filters. Thresholds are defined in `filter_config`.
        Every document's `remove_reason` is overwritten: it is set to the name of the rejecting filter for
        removed documents and reset to None for kept ones, so re-running with other thresholds leaves no stale reason.
        Returns (kept documents, removed documents)
    :param lang: language code of the documents, e.g. `tha_Thai`
    :param docs: documents to filter; they are updated in place (`remove_reason`)
    :param filter_config: dict with the keys `min_avg_word_length`, `max_avg_word_length`, `stopwords`,
        `line_punct_thr` and `new_line_ratio`
    :return: tuple (kept documents, removed documents), each in input order
    """
    # Built once: set membership instead of scanning the configured collection for every word.
    stopwords = frozenset(filter_config["stopwords"])

    filtered, removed = [], []
    for doc in docs:
        doc.remove_reason = _removal_reason(lang, doc, filter_config, stopwords)
        if doc.remove_reason is None:
            filtered.append(doc)
        else:
            removed.append(doc)
    return filtered, removed

Load the per-language filter thresholds and stopwords from the YAML config files.

In [None]:
import yaml

# One config per language, read from `filter-configs/<lang>.yml`.
filter_configs = {}
for lang in LANGS:
    # Open as UTF-8 explicitly: the configs are used for Russian and Thai stopwords
    # and must not be decoded with a locale-dependent default encoding.
    # NOTE(review): PyYAML resolves unquoted words such as on/off/yes/no to booleans;
    # stopwords like these must be quoted in the YAML files — confirm that they are.
    with open(f"filter-configs/{lang}.yml", encoding="utf-8") as f:
        filter_configs[lang] = yaml.safe_load(f)

In [None]:
# Show the loaded configs so the thresholds in use are visible in the notebook output.
filter_configs

Run the filters on each language and save the kept and removed documents.

In [None]:
# Filter each language with its own config, keep the (kept, removed) pair in `result`
# and write both groups to `filter-results/<lang>/`.
# NOTE(review): `save_documents` does not clear the target directories, so files written
# by an earlier run with different thresholds may still be present — confirm before relying on them.
result = {}
for lang in docs_per_lang:
    docs = docs_per_lang[lang]
    filter_config = filter_configs[lang]
    result[lang] = filter_docs(lang, docs, filter_config)
    filtered, removed = result[lang]
    save_documents(filtered, Path(f"filter-results/{lang}/filtered"))
    save_documents(removed, Path(f"filter-results/{lang}/removed"))
    print(f"Filtered {len(docs)} {lang} documents. Kept: {len(filtered)} | Removed: {len(removed)}")

Count how many documents each filter removed, per language.

In [None]:
from collections import Counter

# For every language, tally the removed documents (second element of its `result` entry)
# by the filter that rejected them.
for lang in LANGS:
    reasons = Counter(removed_doc.remove_reason for removed_doc in result[lang][1])
    print(lang, "remove reasons:", reasons)