In [None]:
import fitz
import json
import re
import unicodedata

# Special Dagbani character replacements: control characters that appear in
# the extracted PDF text, mapped to the Dagbani letters substituted for them.
replacements = {
    "\u0002": "ɔ",
    "\u0003": "ŋ",
    "\u0004": "ɛ",
    "\u0005": "ɣ",
}


def clean_pdf_text(raw):
    """Clean text extracted from one PDF page into a single line of text.

    Steps, in order:
      1. Replace the control characters listed in ``replacements`` with
         their Dagbani letters.
      2. Re-join words hyphenated across a line break and turn remaining
         line breaks and tabs into spaces.
      3. Strip all remaining C0/C1 control characters.
      4. Normalise to Unicode NFC.
      5. Remove ``chapter:verse`` style references (optionally
         parenthesised) and any other digit runs.
      6. Collapse whitespace runs and trim the result.

    Args:
        raw: Text of a single page as returned by the PDF extractor.

    Returns:
        The cleaned text with no line breaks and no leading/trailing
        whitespace.
    """
    # Must happen before control characters are stripped: the placeholder
    # code points (\u0002-\u0005) are themselves control characters.
    for unknown, known in replacements.items():
        raw = raw.replace(unknown, known)

    # Resolve line structure BEFORE stripping control characters. "\n",
    # "\r" and "\t" all fall inside \x00-\x1F, so stripping first would
    # delete them, fuse the words on either side, and make the
    # de-hyphenation below a no-op.
    cleaned = raw.replace("\r\n", "\n").replace("\r", "\n")
    cleaned = cleaned.replace("-\n", "").replace("\n", " ")
    cleaned = cleaned.replace("\t", " ")

    cleaned = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', cleaned)

    cleaned = unicodedata.normalize("NFC", cleaned)

    # Drop "3:16" / "(3:16)" style references first, then any leftover digits.
    cleaned = re.sub(r"\(?\d+:\d+\)?|\d+", "", cleaned)

    cleaned = re.sub(r"\s{2,}", " ", cleaned)
    cleaned = cleaned.replace("', '", " ")

    return cleaned.strip()


# Lower-case words whose presence marks a line as English / boilerplate.
# Tokens only ever contain letters, so the "jw.org" entry can never match as
# a whole; the separate "jw" and "org" entries catch it instead.
english_stopwords = {
    "the", "and", "is", "in", "of", "to", "that", "for", "on", "with", "as",
    "bible", "watchtower", "awake", "jw.org", "jw", "org"
}


def remove_english_words(lines):
    """Drop every line that contains at least one English stopword.

    Each line is lower-cased and split into words made of ASCII letters and
    the Dagbani letters ɔ, ɛ, ŋ, ɣ. The Dagbani letters are part of the word
    pattern on purpose: with an ASCII-only pattern a word such as "toɣi"
    would be split into "to" + "i" and the fragment "to" would wrongly match
    a stopword, discarding a valid Dagbani line.

    Args:
        lines: Iterable of text lines.

    Returns:
        A new list with the lines that contain no word from
        ``english_stopwords``, in their original order and unmodified.
    """
    word_pattern = re.compile(r"[a-zA-Zɔɛŋɣ]+")
    return [
        line
        for line in lines
        if english_stopwords.isdisjoint(word_pattern.findall(line.lower()))
    ]

def final_clean(lines):
    """Apply the last normalisation pass to each line and drop unusable ones.

    For every line:
      1. Collapse runs of whitespace-separated single characters
         ("b a i b u l" -> "baibul").
      2. Remove every character that is not an ASCII letter, a Dagbani
         letter (ɔ ɛ ŋ ɣ, either case) or whitespace.
      3. Collapse repeated whitespace and trim.
      4. Skip the line if nothing is left.
      5. Skip the line if it contains a word from ``english_stopwords``.

    Args:
        lines: Iterable of text lines.

    Returns:
        A new list of the cleaned lines that survived the filters, in their
        original order.
    """
    # Patterns are compiled once instead of on every iteration.
    spaced_letters = re.compile(r"(?:\b\w\s)+\w\b")
    # Upper-case Dagbani letters are listed explicitly: ASCII upper-case is
    # kept by a-zA-Z, so Ɔ Ɛ Ŋ Ɣ must be kept too rather than deleted.
    # NOTE(review): Dagbani orthography may use further letters (e.g. ʒ);
    # confirm whether they should be preserved here as well.
    disallowed = re.compile(r"[^a-zA-ZɔɛŋɣƆƐŊƔ\s]")
    multi_space = re.compile(r"\s{2,}")
    word_pattern = re.compile(r"[a-zA-Zɔɛŋɣ]+")

    cleaned = []
    for line in lines:
        # 1. Collapse space-separated letters. The pattern accepts any
        #    whitespace between letters, so remove any whitespace from the
        #    match, not only the literal space character.
        line = spaced_letters.sub(lambda m: re.sub(r"\s", "", m.group(0)), line)

        # 2. Remove stray symbols (˙ ’ , ? ! etc.) except Dagbani letters
        line = disallowed.sub("", line)

        # 3. Normalize multiple spaces
        line = multi_space.sub(" ", line).strip()

        # 4. Drop if line is empty
        if not line:
            continue

        # 5. Stopword filtering (case-insensitive via lower-casing)
        if not english_stopwords.isdisjoint(word_pattern.findall(line.lower())):
            continue

        cleaned.append(line)
    return cleaned


# Extract and clean the text of every page. The context manager closes the
# PDF handle as soon as the text has been read instead of leaving it open.
with fitz.open("/content/lffi_DGB.pdf") as doc:
    all_text = " ".join(clean_pdf_text(page.get_text()) for page in doc)

# Split on full stops to approximate sentences; lower-case each piece and
# skip pieces that are only whitespace.
cleaned_lines = [line.lower() for line in all_text.split(".") if line.strip()]

# First pass: drop sentences containing English stopwords.
cleaned_lines = remove_english_words(cleaned_lines)

# Second pass: strip symbols, normalise spacing and filter again.
cleaned_lines = final_clean(cleaned_lines)

# Write the corpus as UTF-8 JSON, keeping Dagbani letters unescaped.
data = {"lang": "dagbani", "data": cleaned_lines}
with open("dagbani_corpus.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

