In [2]:
# Install PyMuPDF into the running kernel's environment (%pip rather than !pip),
# pinned to the version this notebook was last executed with.
%pip install pymupdf==1.25.4


Collecting pymupdf
  Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.4


In [9]:
import fitz  # PyMuPDF
import re
import pandas as pd

# Latin-based transcription -> Cyrillic orthography correspondences.
# Each key appears exactly once: the original literal listed "e" twice, so only
# the last pair ("e" -> "э") ever took effect; that effective forward behaviour
# is kept here.
_LATIN_TO_CYRILLIC = {
    "ti": "ты", "li": "лы", "ni": "ны", "ɨ": "ы",
    "p": "п",  "p'": "пʼ",
    "t": "т",  "t'": "тʼ",
    "k": "к",  "k'": "кʼ",
    "q": "ӄ",  "q'": "ӄʼ",
    "č": "ч",  "č'": "чʼ",
    "f": "ф",  "s": "с",
    "x": "х",  "χ": "ӽ",
    "w": "в",  "z": "з",
    "j": "й",  "g": "г",
    "ɬ": "ԓ", "l": "л",
    "ʎ": "љ",  "r": "р",
    "m": "м",  "n": "н",
    "ɲ": "њ",  "ŋ": "ӈ",
    "ʔ": "ʔ",  "d": "д",
    "b": "б",
    "ʂ": "ш",
    "ɕː": "щ",
    "ʐ": "ж",
    "t͡s": "ц",
    "i": "и", "u": "у", "o": "о", "a": "а", "ə": "ә", "e": "\u044d",
    "'": "ь",  "ʷ": "˚", "ŭ": "ў", "ŏ": "ŏ", "ǎ": "ǎ",
}

# Cyrillic spellings that only exist in the Cyrillic -> Latin direction.
# Cyrillic "е" (U+0435) was the value of the overwritten duplicate "e" entry and
# therefore used to pass through untransliterated.
_CYRILLIC_ONLY_TO_LATIN = {"\u0435": "e"}


def _longest_first(table):
    """Return (source, target) pairs ordered so longer sources are replaced first."""
    return sorted(table.items(), key=lambda pair: len(pair[0]), reverse=True)


_FORWARD_PAIRS = _longest_first(_LATIN_TO_CYRILLIC)
_REVERSE_PAIRS = _longest_first(
    {**{cyr: lat for lat, cyr in _LATIN_TO_CYRILLIC.items()}, **_CYRILLIC_ONLY_TO_LATIN}
)


def transliterate(text, reverse=True):
    """Convert ``text`` between the Latin transcription and Cyrillic spelling.

    Args:
        text: String to convert. Characters without an entry in the table are
            left unchanged.
        reverse: When True (the default) convert Cyrillic -> Latin; when False
            convert Latin -> Cyrillic.

    Returns:
        The converted string.
    """
    pairs = _REVERSE_PAIRS if reverse else _FORWARD_PAIRS
    result = text
    # Longest sources go first so digraphs such as "ты" or "p'" are consumed
    # before their single-character components.
    for src, tgt in pairs:
        result = result.replace(src, tgt)
    return result

# Read every page of the PDF into one newline-joined string. The context
# manager closes the document once the text has been extracted, so the file
# handle is not left open for the rest of the session.
with fitz.open("/content/Володин и др._2021_Полный ительменско-русский словарь (1)-387-393.pdf") as doc:
    text = "\n".join(page.get_text() for page in doc)

# Walk over *all* extracted lines. Previously only lines starting with "-" were
# kept, which made the continuation branch below unreachable and cut every
# multi-line entry down to its first line.
lines = text.split("\n")

# Group lines into one string per entry: a line starting with "-<non-space>"
# opens a new entry, any other non-blank line is appended to the open entry.
# Lines before the first entry are ignored.
# NOTE(review): non-entry lines such as running headers or page numbers, if the
# PDF has any, will also be folded into the preceding entry — confirm on the data.
affixes = []
current_affix = None
for line in lines:
    if re.match(r"^-\S+", line):
        if current_affix:
            affixes.append(current_affix)
        current_affix = line
    elif current_affix and line.strip():
        current_affix += " " + line.strip()
if current_affix:
    affixes.append(current_affix)

# Split each entry into its head (the affix, optionally followed by a
# parenthesised note) and the remaining description text.
entry_pattern = re.compile(r"^(-\S+(?:\s*\([^)]+\))?)\s+(.*)")

data = []
for raw_entry in affixes:
    parsed = entry_pattern.match(raw_entry)
    if parsed is None:
        # Nothing follows the head: keep the whole entry with an empty description.
        data.append((raw_entry.strip(), ""))
        continue
    head, gloss = parsed.groups()
    data.append((head.strip(), gloss.strip()))

df = pd.DataFrame(data, columns=["Аффикс", "Описание"])

# Transliterate the affix column using transliterate's default direction.
affix_column = df["Аффикс"]
df["Аффикс"] = affix_column.apply(transliterate)

def transliterate_affix_like_parts(text):
    """Transliterate only the whitespace-separated tokens that start with "-".

    The text is split on runs of whitespace, tokens beginning with "-" are
    passed through ``transliterate`` (default direction), all other tokens are
    kept as they are, and the tokens are re-joined with single spaces.
    """
    return " ".join(
        transliterate(token) if token.startswith("-") else token
        for token in text.split()
    )

# Transliterate affix-like tokens inside the description column, then write the
# finished table to a UTF-8 CSV without the index column.
description_column = "Описание"
df[description_column] = df[description_column].apply(transliterate_affix_like_parts)

output_path = "affixes_transliterated.csv"
df.to_csv(output_path, encoding="utf-8", index=False)
