<a href="https://colab.research.google.com/github/lcontrerasroa/macedonian/blob/main/notebooks/EAF2MAUS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# @title Fetch both original and _new .eaf files from GitHub
import os
from pathlib import Path
import shutil

REPO_URL = "https://github.com/lcontrerasroa/macedonian.git"
REPO_DIR = Path("/content/macedonian")
EAF_NEW = Path("/content/eaf_new")
EAF_ORIG = Path("/content/eaf_orig")

# reset optionnel : si tu veux tout nettoyer avant de relancer
for d in [REPO_DIR, EAF_NEW, EAF_ORIG]:
    if d.exists():
        shutil.rmtree(d)

# clone du dépôt complet (profondeur minimale)
os.system(f"git clone --depth 1 {REPO_URL} {REPO_DIR}")

# création des dossiers cibles
EAF_NEW.mkdir(parents=True, exist_ok=True)
EAF_ORIG.mkdir(parents=True, exist_ok=True)

# copie des fichiers depuis le dépôt GitHub
os.system(f"cp {REPO_DIR}/data/eaf_new/*.eaf {EAF_NEW}/ 2>/dev/null || true")
os.system(f"cp {REPO_DIR}/data/eaf_orig/*.eaf {EAF_ORIG}/ 2>/dev/null || true")

# vérification
print("✅ Copied the following:")
os.system("ls -1 /content/eaf_new | head -n 5 && echo '... (eaf_new)'")
os.system("ls -1 /content/eaf_orig | head -n 5 && echo '... (eaf_orig)'")


✅ Copied the following:


0

In [3]:
# @title Inspect the internal tier structure of all EAF files (new + orig)
!pip -q install pympi-ling

from pathlib import Path
import pympi
import re

# === Paths ===
roots = [Path("/content/eaf_new"), Path("/content/eaf_orig")]
for r in roots:
    assert r.exists(), f"The folder {r} doesn't exist. Run the GitHub import first."

# === Function to summarize one EAF ===
def summarize_eaf(path):
    eaf = pympi.Elan.Eaf(str(path))
    tiers = eaf.get_tier_names()
    info = []
    for t in tiers:
        anns = eaf.get_annotation_data_for_tier(t)
        non_empty = [a for a in anns if a[2] and str(a[2]).strip()]
        info.append({
            "tier": t,
            "annotations": len(anns),
            "non_empty": len(non_empty)
        })
    return info

# === Loop over both directories ===
results = []
for base in roots:
    for eaf_file in sorted(base.glob("*.eaf")):
        try:
            tiers = summarize_eaf(eaf_file)
            results.append((f"{base.name}/{eaf_file.name}", tiers))
        except Exception as e:
            results.append((f"{base.name}/{eaf_file.name}", f"ERROR: {e}"))

# === Display ===
for name, tiers in results:
    print(f"\n=== {name} ===")
    if isinstance(tiers, str):
        print("  ", tiers)
        continue
    for t in tiers:
        tname = t["tier"]
        nn = t["non_empty"]
        tot = t["annotations"]
        pct = (nn / tot * 100) if tot else 0
        print(f"  - {tname:30s} {nn:4d}/{tot:<4d} ({pct:5.1f}%) non-empty")



=== Macedonian_decata_so_zlatna_kosa_new.eaf ===
  - ref@SP1                         257/257  (100.0%) non-empty
  - rp@SP1                          109/109  (100.0%) non-empty
  - comm                              2/2    (100.0%) non-empty
  - qt@SP1                           61/61   (100.0%) non-empty
  - ft@SP1                          257/257  (100.0%) non-empty
  - lit@SP1                           0/0    (  0.0%) non-empty
  - tx@SP1                          257/257  (100.0%) non-empty
  - not@SP1                           4/4    (100.0%) non-empty
  - typ@SP1                          67/67   (100.0%) non-empty
  - mot@SP1                        1014/1014 (100.0%) non-empty
  - wps@SP1                         998/1014 ( 98.4%) non-empty
  - mb@SP1                         1319/1319 (100.0%) non-empty
  - ge@SP1                         1319/1319 (100.0%) non-empty
  - ps@SP1                         1319/1319 (100.0%) non-empty
  - par@SP1                          71/71   (100.0%) 

In [4]:
# @title Extract Cyrillic TXT per speaker from tx_cyr@SPx (fallback to tx@SPx) — robust tuples
!pip -q install pympi-ling

import re
from pathlib import Path
import pympi
import shutil

SRC_DIR = Path("/content/eaf_orig")          # ← les .eaf ORIGINAUX
OUT_DIR = Path("/content/out_txt_cyr")       # txt cyrilliques

# Reset propre de la sortie
if OUT_DIR.exists():
    shutil.rmtree(OUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

tier_re_tx_cyr = re.compile(r"^tx_cyr@SP(\d+)$", re.I)
tier_re_tx     = re.compile(r"^tx@SP(\d+)$", re.I)

def get_speaker_tiers(eaf):
    names = eaf.get_tier_names()
    cyr = [(t, int(m.group(1))) for t in names if (m:=tier_re_tx_cyr.match(t))]
    lat = [(t, int(m.group(1))) for t in names if (m:=tier_re_tx.match(t))]
    # index par SP: préfère tx_cyr, sinon tx
    speakers = {}
    for t, sp in lat:
        speakers.setdefault(sp, (f"tx@SP{sp}", t))
    for t, sp in cyr:
        speakers[sp] = (f"tx_cyr@SP{sp}", t)
    chosen = []
    for sp, pair in sorted(speakers.items()):
        tag, tname = pair
        chosen.append((tname, sp, tag.startswith("tx_cyr")))
    return chosen

def _normalize_ann_tuple(item):
    """
    Accepte (start, end, value) ou (ann_id, start, end, value).
    Retourne (start, end, value) avec value en str.
    """
    if not isinstance(item, (list, tuple)):
        return None
    if len(item) == 3:
        s, e, v = item
    elif len(item) >= 4:
        s, e, v = item[-3], item[-2], item[-1]
    else:
        return None
    return s, e, "" if v is None else str(v)

def concat_intervals(eaf, tiername):
    raw = eaf.get_annotation_data_for_tier(tiername)
    norm = []
    for it in raw:
        tup = _normalize_ann_tuple(it)
        if tup is None:
            continue
        s, e, v = tup
        v = v.strip()
        if v:
            norm.append((s, e, v))
    # tri temporel quand possible, sinon laisse l'ordre d'origine
    def _key(x):
        s, e, _ = x
        try:
            return (int(s), int(e))
        except Exception:
            return (float("inf"), float("inf"))
    norm.sort(key=_key)
    return "".join(v + " " for _,__,v in norm).strip()

count_files = 0
for eaf_path in sorted(SRC_DIR.glob("*.eaf")):
    eaf = pympi.Elan.Eaf(str(eaf_path))
    for tiername, sp, is_cyr in get_speaker_tiers(eaf):
        txt = concat_intervals(eaf, tiername)
        if not txt:
            continue
        stem = eaf_path.stem
        # on n'exporte que le cyrillique si présent; sinon le latin prendra la place cyr (comme fallback)
        out = OUT_DIR / f"{stem}_SP{sp}_cyr.txt"
        out.write_text(txt + "\n", encoding="utf-8")
        count_files += 1
        print(f"Wrote {out.name} ({len(txt)} chars)")
print(f"\n✅ Done. Wrote {count_files} TXT file(s) to {OUT_DIR}")


Wrote Macedonian_decata_so_zlatna_kosa_new_SP1_cyr.txt (9508 chars)
Wrote Macedonian_duhot_od_grobot_new_SP1_cyr.txt (6075 chars)
Wrote Macedonian_dva_braka_new_SP1_cyr.txt (1703 chars)
Wrote Macedonian_dva_braka_new_SP2_cyr.txt (22 chars)
Wrote Macedonian_itar_pejo_new_SP1_cyr.txt (1079 chars)
Wrote Macedonian_itar_pejo_new_SP2_cyr.txt (47 chars)
Wrote Macedonian_kralevic_marko_new_SP1_cyr.txt (869 chars)
Wrote Macedonian_kralevic_marko_new_SP2_cyr.txt (28 chars)
Wrote Macedonian_kusa_new_SP1_cyr.txt (1595 chars)
Wrote Macedonian_kusa_new_SP2_cyr.txt (56 chars)
Wrote Macedonian_makata_new_SP1_cyr.txt (839 chars)
Wrote Macedonian_masha_new_SP1_cyr.txt (2120 chars)
Wrote Macedonian_masha_new_SP2_cyr.txt (104 chars)
Wrote Macedonian_mrzlivata_zena_new_SP1_cyr.txt (6269 chars)
Wrote Macedonian_mrzlivata_zena_new_SP2_cyr.txt (59 chars)
Wrote Macedonian_najdenko_new_SP1_cyr.txt (4415 chars)
Wrote Macedonian_narecnicite_new_SP1_cyr.txt (1299 chars)
Wrote Macedonian_ovcarot_new_SP1_cyr.txt (1

In [5]:
# @title Inspect each EAF: tier overview with sample content
!pip -q install pympi-ling pandas

import pandas as pd
from pathlib import Path
import pympi

SRC_DIR = Path("/content/eaf_orig")  # adapte si besoin
OUT_DIR = Path("/content/eaf_tier_summary")
OUT_DIR.mkdir(exist_ok=True)

def sample_value(annotations):
    for a in annotations:
        if isinstance(a, (list, tuple)):
            val = a[-1] if a else ""
        else:
            val = a
        if isinstance(val, str) and val.strip():
            txt = val.strip().replace("\n", " ")
            return (txt[:80] + "…") if len(txt) > 80 else txt
    return ""

def summarize_eaf_tiers(eaf_path):
    eaf = pympi.Elan.Eaf(str(eaf_path))
    data = []
    for tier in eaf.get_tier_names():
        anns = eaf.get_annotation_data_for_tier(tier)
        non_empty = [a for a in anns if (isinstance(a, (list, tuple)) and len(a) > 0 and str(a[-1]).strip())]
        sample = sample_value(non_empty)
        data.append({
            "tier": tier,
            "annotations": len(anns),
            "non_empty": len(non_empty),
            "sample": sample
        })
    df = pd.DataFrame(data)
    df = df.sort_values("tier")
    return df

summaries = {}
for eaf_path in sorted(SRC_DIR.glob("*.eaf")):
    df = summarize_eaf_tiers(eaf_path)
    summaries[eaf_path.name] = df
    out_csv = OUT_DIR / f"{eaf_path.stem}_tiers.csv"
    df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"✅ {eaf_path.name}: {len(df)} tiers → {out_csv.name}")

print(f"\nAll CSV summaries saved in: {OUT_DIR}")


✅ Macedonian_decata_so_zlatna_kosa_new.eaf: 16 tiers → Macedonian_decata_so_zlatna_kosa_new_tiers.csv
✅ Macedonian_duhot_od_grobot_new.eaf: 16 tiers → Macedonian_duhot_od_grobot_new_tiers.csv
✅ Macedonian_dva_braka_new.eaf: 32 tiers → Macedonian_dva_braka_new_tiers.csv
✅ Macedonian_itar_pejo_new.eaf: 32 tiers → Macedonian_itar_pejo_new_tiers.csv
✅ Macedonian_kralevic_marko_new.eaf: 32 tiers → Macedonian_kralevic_marko_new_tiers.csv
✅ Macedonian_kusa_new.eaf: 32 tiers → Macedonian_kusa_new_tiers.csv
✅ Macedonian_makata_new.eaf: 16 tiers → Macedonian_makata_new_tiers.csv
✅ Macedonian_masha_new.eaf: 32 tiers → Macedonian_masha_new_tiers.csv
✅ Macedonian_mrzlivata_zena_new.eaf: 32 tiers → Macedonian_mrzlivata_zena_new_tiers.csv
✅ Macedonian_najdenko_new.eaf: 16 tiers → Macedonian_najdenko_new_tiers.csv
✅ Macedonian_narecnicite_new.eaf: 16 tiers → Macedonian_narecnicite_new_tiers.csv
✅ Macedonian_ovcarot_new.eaf: 16 tiers → Macedonian_ovcarot_new_tiers.csv
✅ Macedonian_ovenot_i_kozata_new.e

In [6]:
# @title Inspect EAF tier hierarchy (works with tuple-structured tiers)
!pip -q install pympi-ling
from pathlib import Path
import pympi

path = Path("/content/eaf_orig/Macedonian_decata_so_zlatna_kosa.eaf")
eaf = pympi.Elan.Eaf(str(path))

print(f"File: {path.name}\n")
print(f"{'Tier':35s} {'Parent':20s} {'Children'}")
print("-" * 80)

# extraction prudente de PARENT_REF
def safe_parent(info):
    if isinstance(info, dict):
        return info.get("PARENT_REF")
    if isinstance(info, (list, tuple)):
        for item in info:
            if isinstance(item, dict) and "PARENT_REF" in item:
                return item["PARENT_REF"]
    return None

# construire map parent → enfants
child_map = {}
parent_map = {}
for t, info in eaf.tiers.items():
    parent = safe_parent(info)
    if parent:
        parent_map[t] = parent
        child_map.setdefault(parent, []).append(t)

# affichage hiérarchique
for t in eaf.get_tier_names():
    parent = parent_map.get(t, "-")
    children = ", ".join(child_map.get(t, [])) or "-"
    print(f"{t:35s} {parent:20s} {children}")


File: Macedonian_decata_so_zlatna_kosa_new.eaf

Tier                                Parent               Children
--------------------------------------------------------------------------------
ref@SP1                             -                    ft@SP1, lit@SP1, tx@SP1, not@SP1, tx_cyr@SP1
rp@SP1                              -                    typ@SP1
comm                                -                    -
qt@SP1                              -                    -
ft@SP1                              ref@SP1              -
lit@SP1                             ref@SP1              -
tx@SP1                              ref@SP1              mot@SP1
not@SP1                             ref@SP1              -
typ@SP1                             rp@SP1               -
mot@SP1                             tx@SP1               wps@SP1, mb@SP1
wps@SP1                             mot@SP1              -
mb@SP1                              mot@SP1              ge@SP1, ps@SP1
ge@SP1         

In [7]:
# @title Debug tier names in one EAF
from pathlib import Path
import pympi

eaf_path = Path("/content/eaf_new/Macedonian_decata_so_zlatna_kosa_new.eaf")
eaf = pympi.Elan.Eaf(str(eaf_path))
print("Tier names in this file:\n")
for t in eaf.get_tier_names():
    anns = eaf.get_annotation_data_for_tier(t)
    non_empty = [a for a in anns if isinstance(a, (list, tuple)) and str(a[-1]).strip()]
    sample = (non_empty[0][-1] if non_empty else "")[:60]
    print(f"{t:30s} | annots={len(anns):4d} | non_empty={len(non_empty):4d} | sample={sample}")


Tier names in this file:

ref@SP1                        | annots= 257 | non_empty= 257 | sample=Macedonian_decata_so_zlatna_kosa.001
rp@SP1                         | annots= 109 | non_empty= 109 | sample=DR_Event
comm                           | annots=   2 | non_empty=   2 | sample=insubordinate hortative
qt@SP1                         | annots=  61 | non_empty=  61 | sample=DR_Event_Discourse_Report
ft@SP1                         | annots= 257 | non_empty= 257 | sample=Macedonian_decata_so_zlatna_kosa.001
lit@SP1                        | annots=   0 | non_empty=   0 | sample=
tx@SP1                         | annots= 257 | non_empty= 257 | sample=Macedonian_decata_so_zlatna_kosa.001
not@SP1                        | annots=   4 | non_empty=   4 | sample=Macedonian_decata_so_zlatna_kosa.038
typ@SP1                        | annots=  67 | non_empty=  67 | sample=Discourse_Report
mot@SP1                        | annots=1014 | non_empty=1014 | sample=Macedonian_decata_so_zlatna_kosa.001
wp

In [9]:
# @title Analyze single eaf tier content

from pathlib import Path
import xml.etree.ElementTree as ET

sample = Path("/content/eaf_orig/Macedonian_dva_braka.eaf")
print(f"Inspecting {sample.name}")

tree = ET.parse(sample)
root = tree.getroot()

# findall sans namespace
tiers = root.findall(".//TIER")

print(f"Found {len(tiers)} tiers\n")

for tier in tiers:
    tid = tier.attrib.get("TIER_ID", "")
    ling = tier.attrib.get("LINGUISTIC_TYPE_REF", "")
    vals = [v.text.strip() for v in tier.findall(".//ANNOTATION_VALUE") if v.text and v.text.strip()]
    preview = " | ".join(vals[:3])
    print(f"{tid:25s} ({ling:10s})  →  {len(vals)} vals  |  {preview}")


Inspecting Macedonian_dva_braka.eaf


FileNotFoundError: [Errno 2] No such file or directory: '/content/eaf_orig/Macedonian_dva_braka.eaf'

In [None]:
# @title Extract only tx@SPx and tx_cyr@SPx text (in order of appearance)
# This will extract each speaker in a separate tier
# which might be problematic for WEBMaus alignment.

from pathlib import Path
import xml.etree.ElementTree as ET
import re
import shutil

SRC_DIR = Path("/content/eaf_orig")  # dossier des fichiers originaux
OUT_DIR = Path("/content/out_txt_cyr")

# ⚠️ Vider le dossier de sortie avant d'écrire les nouveaux fichiers
if OUT_DIR.exists():
    shutil.rmtree(OUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

tier_pattern = re.compile(r"^(tx(_cyr)?@SP\d+)$", re.I)

written = 0

for eaf_file in sorted(SRC_DIR.glob("*.eaf")):
    root = ET.parse(eaf_file).getroot()
    for tier in root.findall(".//TIER"):
        tid = tier.attrib.get("TIER_ID", "")
        if not tier_pattern.match(tid):
            continue
        vals = [v.text.strip() for v in tier.findall(".//ANNOTATION_VALUE") if v.text and v.text.strip()]
        if not vals:
            continue
        txt = " ".join(vals)
        suffix = "cyr" if "tx_cyr" in tid.lower() else "lat"
        out = OUT_DIR / f"{eaf_file.stem}_{tid}_{suffix}.txt"
        out.write_text(txt + "\n", encoding="utf-8")
        written += 1
        print(f"Wrote {out.name} ({len(txt)} chars, {len(vals)} segments)")

print(f"\n✅ Done. Wrote {written} files in {OUT_DIR}")


In [None]:
# @title Merge speakers into one transcript by time (tx_cyr/tx)
from pathlib import Path
import xml.etree.ElementTree as ET
import re
import shutil

SRC_DIR = Path("/content/eaf_orig")  # tes EAF "originaux"
OUT_DIR = Path("/content/out_txt_merged")

# reset sortie
if OUT_DIR.exists():
    shutil.rmtree(OUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

def parse_ns(root):
    return {"elan": root.tag.split("}")[0].strip("{")}

def build_time_maps(root):
    ns = parse_ns(root)
    # TIME_SLOT_ID -> ms
    ts = {}
    for node in root.findall(".//elan:TIME_ORDER/elan:TIME_SLOT", ns):
        tid = node.attrib.get("TIME_SLOT_ID")
        val = node.attrib.get("TIME_VALUE")
        if tid and val is not None:
            try:
                ts[tid] = int(val)
            except:
                pass
    # ALIGNABLE_ANNOTATION_ID -> (start_ms, end_ms)
    align = {}
    for aln in root.findall(".//elan:TIER/elan:ANNOTATION/elan:ALIGNABLE_ANNOTATION", ns):
        aid = aln.attrib.get("ANNOTATION_ID")
        r1  = aln.attrib.get("TIME_SLOT_REF1")
        r2  = aln.attrib.get("TIME_SLOT_REF2")
        if aid and r1 in ts and r2 in ts:
            align[aid] = (ts[r1], ts[r2])
    return ts, align

def collect_segments(root):
    ns = parse_ns(root)
    segs = []  # list of dicts: {tier_id, sp, kind, start, end, text, order}
    order_counter = 0
    for tier in root.findall(".//elan:TIER", ns):
        tid = tier.attrib.get("TIER_ID","")
        ltr = tier.attrib.get("LINGUISTIC_TYPE_REF","")
        if not re.match(r"^tx(_cyr)?@SP\d+$", tid, re.I) and ltr not in ("tx","tx_cyr"):
            continue
        # speaker + type
        msp = re.search(r"@SP(\d+)", tid, re.I)
        sp = f"SP{msp.group(1)}" if msp else "SP?"
        kind = "cyr" if ("tx_cyr" in tid.lower() or ltr == "tx_cyr") else "lat"
        # try REF_ANNOTATION path first
        for ref in tier.findall("./elan:ANNOTATION/elan:REF_ANNOTATION", ns):
            ref_id = ref.attrib.get("ANNOTATION_REF")
            val_el = ref.find("./elan:ANNOTATION_VALUE", ns)
            text = (val_el.text or "").strip() if val_el is not None else ""
            if text:
                segs.append({"tier_id": tid, "sp": sp, "kind": kind,
                             "ref_id": ref_id, "start": None, "end": None,
                             "text": text, "order": order_counter})
                order_counter += 1
        # fallback: ALIGNABLE_ANNOTATION with inline text
        for aln in tier.findall("./elan:ANNOTATION/elan:ALIGNABLE_ANNOTATION", ns):
            val_el = aln.find("./elan:ANNOTATION_VALUE", ns)
            text = (val_el.text or "").strip() if val_el is not None else ""
            if text:
                aid = aln.attrib.get("ANNOTATION_ID")
                segs.append({"tier_id": tid, "sp": sp, "kind": kind,
                             "ref_id": aid, "start": None, "end": None,
                             "text": text, "order": order_counter})
                order_counter += 1
    return segs

def attach_times(segs, align_map):
    for s in segs:
        aid = s.get("ref_id")
        if aid in align_map:
            s["start"], s["end"] = align_map[aid]
    return segs

def sort_key(s):
    # tri par start_ms, puis fallback sur ordre d'apparition
    start = s["start"]
    return (float("inf") if start is None else start, s["order"])

written = 0
for eaf in sorted(SRC_DIR.glob("*.eaf")):
    try:
        root = ET.parse(eaf).getroot()
    except Exception as ex:
        print(f"❌ Parse error {eaf.name}: {ex}")
        continue

    _, align = build_time_maps(root)
    segs = collect_segments(root)
    if not segs:
        continue
    segs = attach_times(segs, align)
    segs.sort(key=sort_key)

    # séparer cyr et lat
    segs_cyr = [s for s in segs if s["kind"] == "cyr"]
    segs_lat = [s for s in segs if s["kind"] == "lat"]

    # exporter versions "WebMAUS-safe" (sans tags)
    if segs_cyr:
        txt_cyr = " ".join(s["text"] for s in segs_cyr).strip()
        (OUT_DIR / f"{eaf.stem}_merged_cyr.txt").write_text(txt_cyr + "\n", encoding="utf-8")
        written += 1
    if segs_lat:
        txt_lat = " ".join(s["text"] for s in segs_lat).strip()
        (OUT_DIR / f"{eaf.stem}_merged_lat.txt").write_text(txt_lat + "\n", encoding="utf-8")
        written += 1

    # exporter une version lisible avec locuteur pour contrôle humain
    pretty = []
    for s in segs:
        t0 = "" if s["start"] is None else f"{s['start']:>7d}"
        t1 = "" if s["end"]   is None else f"{s['end']:>7d}"
        pretty.append(f"[{s['sp']}] {s['text']}")
    (OUT_DIR / f"{eaf.stem}_merged_with_spk.txt").write_text("\n".join(pretty) + "\n", encoding="utf-8")

    # segments TSV pour audit
    lines = ["start_ms\tend_ms\tsp\tkind\ttext"]
    for s in segs:
        lines.append(f"{'' if s['start'] is None else s['start']}\t{'' if s['end'] is None else s['end']}\t{s['sp']}\t{s['kind']}\t{s['text'].replace(chr(9),' ')}")
    (OUT_DIR / f"{eaf.stem}_segments.tsv").write_text("\n".join(lines) + "\n", encoding="utf-8")

print(f"✅ Merged outputs written to: {OUT_DIR} (files: {written})")


In [None]:
# @title Convert Cyrillic TXT → Spanish-like proxy (mk→es)
import re, unicodedata
from pathlib import Path

IN_DIR  = Path("/content/out_txt_cyr")
OUT_DIR = Path("/content/out_txt_proxy")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- cyr → latin de base (macédonien) ---
MK_CYR_TO_LAT = {
    "Ќ":"ḱ","ќ":"ḱ","Ѓ":"ǵ","ѓ":"ǵ","Ж":"ž","ж":"ž","З":"z","з":"z","Ѕ":"dz","ѕ":"dz",
    "Ч":"č","ч":"č","Џ":"dž","џ":"dž","Ш":"š","ш":"š","Ј":"j","ј":"j","Љ":"lj","љ":"lj",
    "Њ":"nj","њ":"nj","А":"a","а":"a","Б":"b","б":"b","В":"v","в":"v","Г":"g","г":"g",
    "Д":"d","д":"d","Е":"e","е":"e","И":"i","и":"i","К":"k","к":"k","Л":"l","л":"l",
    "М":"m","м":"m","Н":"n","н":"n","О":"o","о":"o","П":"p","п":"p","Р":"r","р":"r",
    "С":"s","с":"s","Т":"t","т":"t","У":"u","у":"u","Ф":"f","ф":"f","Х":"h","х":"h"
}
def mk_cyr_to_basic_latin(text:str)->str:
    return "".join(MK_CYR_TO_LAT.get(ch, ch) for ch in text)

# --- règles proxy espagnol (avec z→s et dz→ds comme demandé) ---
VOWELS = "aeiouáéíóúüAEIOUÁÉÍÓÚÜ"
def normalize_ws(t): return re.sub(r"\s+", " ", t).strip()
def strip_punct(t):
    t = re.sub(r"[„“”«»\"“”]", "", t)
    t = re.sub(r"[—–…]", " ", t)
    t = re.sub(r"[.,;:!?()\[\]{}/\\]", " ", t)
    return normalize_ws(t)

def apply_spanish_proxy_rules(text:str)->str:
    t = text
    # palatales
    t = t.replace("ḱ", "ky")
    t = t.replace("ǵ", "y")
    # affriquées et sibilantes
    t = t.replace("dž", "y")     # /dʒ/
    t = t.replace("dz", "ds")    # <- ta contrainte
    t = t.replace("č", "ch")
    t = t.replace("š", "s")
    t = t.replace("ž", "y")
    # lj, nj, j
    t = re.sub(r"\blj", "ll", t); t = t.replace("lj","ll")
    t = re.sub(r"\bnj", "ñ", t);  t = t.replace("nj","ñ")
    t = t.replace("j","y")
    # h (issu de х) → j dans environnements vocaliques
    t = re.sub(r"\b[hH](?=[%s])" % VOWELS, "j", t)
    t = re.sub(r"(?<=[%s])[hH](?=[%s])" % (VOWELS,VOWELS), "j", t)
    # k/g devant e,i → qu/gu
    t = re.sub(r"\bk(?=[eiéí])", "qu", t); t = re.sub(r"(?<=[^a-zA-Z])k(?=[eiéí])", "qu", t)
    t = re.sub(r"\bg(?=[eiéí])", "gu", t); t = re.sub(r"(?<=[^a-zA-Z])g(?=[eiéí])", "gu", t)
    # ailleurs, k → c (optionnel, plus “espagnol”)
    t = re.sub(r"\bk", "c", t);  t = re.sub(r"(?<=[^a-zA-Z])k", "c", t)
    return normalize_ws(t)

def mk_to_es_proxy(text:str)->str:
    t = strip_punct(text)
    t = mk_cyr_to_basic_latin(t)
    t = apply_spanish_proxy_rules(t)
    return t

written = 0
for f in sorted(IN_DIR.glob("*.txt")):
    raw = f.read_text(encoding="utf-8", errors="ignore")
    prox = mk_to_es_proxy(raw)
    out = OUT_DIR / f.name.replace("_cyr.txt", "_proxy_es.txt")
    out.write_text(prox + "\n", encoding="utf-8")
    written += 1
    print(f"Wrote {out.name} ({len(prox)} chars)")
print(f"\n✅ Done. Wrote {written} proxy TXT file(s) to {OUT_DIR}")
