<a href="https://colab.research.google.com/github/lcontrerasroa/macedonian/blob/main/notebooks/EAF2MAUS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @title Fetch .eaf files directly from GitHub
import os
from pathlib import Path

import shutil
import subprocess

# Repository URL and local working paths
REPO_URL = "https://github.com/lcontrerasroa/macedonian.git"
TARGET_DIR = Path("/content/eaf_new")
CLONE_DIR = Path("/content/macedonian")

if not TARGET_DIR.exists():
    # Reuse an existing checkout; otherwise clone. check=True aborts the cell
    # on failure instead of printing a success message regardless.
    if not CLONE_DIR.exists():
        subprocess.run(
            ["git", "clone", "--depth", "1", REPO_URL, str(CLONE_DIR)],
            check=True,
        )
    eaf_sources = sorted((CLONE_DIR / "data" / "eaf_new").glob("*.eaf"))
    if not eaf_sources:
        raise FileNotFoundError(
            f"No .eaf files found in {CLONE_DIR / 'data' / 'eaf_new'}"
        )
    # Create the target only once there is something to copy, so that a failed
    # run is not mistaken for a completed import on the next execution.
    TARGET_DIR.mkdir(parents=True, exist_ok=True)
    # Copy only the .eaf files from data/eaf_new into eaf_new
    for eaf_source in eaf_sources:
        shutil.copy(eaf_source, TARGET_DIR / eaf_source.name)
    print("✅ .eaf files imported from GitHub into /content/eaf_new")
else:
    print("Folder /content/eaf_new already exists — skipping clone.")

# Verification: list the imported files
!ls /content/eaf_new


✅ .eaf files imported from GitHub into /content/eaf_new
Macedonian_decata_so_zlatna_kosa_new.eaf  Macedonian_pepelaska_new.eaf
Macedonian_duhot_od_grobot_new.eaf	  Macedonian_petle_new.eaf
Macedonian_dva_braka_new.eaf		  Macedonian_praseto_i_zeladite_new.eaf
Macedonian_itar_pejo_new.eaf		  Macedonian_prdlivata_nevesta_new.eaf
Macedonian_kralevic_marko_new.eaf	  Macedonian_prosti_new.eaf
Macedonian_kusa_new.eaf			  Macedonian_role_i_rolejca_new.eaf
Macedonian_makata_new.eaf		  Macedonian_siljan_strkot_new.eaf
Macedonian_masha_new.eaf		  Macedonian_svadbata_new.eaf
Macedonian_mrzlivata_zena_new.eaf	  Macedonian_tri_sestri_new.eaf
Macedonian_najdenko_new.eaf		  Macedonian_trnova_ruzica_new.eaf
Macedonian_narecnicite_new.eaf		  Macedonian_volkot_kum_new.eaf
Macedonian_ovcarot_new.eaf		  Macedonian_zlatnoto_runo_new.eaf
Macedonian_ovenot_i_kozata_new.eaf


In [2]:
# @title Inspect the internal tier structure of all EAF files
!pip -q install pympi-ling

from pathlib import Path
import pympi
import re

# === Paths ===
root = Path("/content/eaf_new")  # folder containing the *_new.eaf files
# Fail early with an actionable message when the input folder is missing.
assert root.exists(), "The folder /content/eaf_new doesn't exist. Upload or unzip your .eaf files first."

# === Function to summarize one EAF ===
def summarize_eaf(path):
    """Count total and non-empty annotations for every tier of one EAF file.

    Returns a list with one dict per tier (keys "tier", "annotations",
    "non_empty"), in the order reported by ``get_tier_names()``.
    """
    document = pympi.Elan.Eaf(str(path))
    summary = []
    for tier_name in document.get_tier_names():
        rows = document.get_annotation_data_for_tier(tier_name)
        # Field 2 of each row is read as the annotation text; a row counts as
        # non-empty when that text is truthy and not whitespace-only.
        filled = sum(1 for row in rows if row[2] and str(row[2]).strip())
        summary.append({
            "tier": tier_name,
            "annotations": len(rows),
            "non_empty": filled,
        })
    return summary

# === Loop over all .eaf files ===
# Each entry of `results` is (file name, list of tier dicts) on success, or
# (file name, "ERROR: ..." string) when the file could not be summarized.
results = []
for eaf_file in sorted(root.glob("*.eaf")):
    try:
        tiers = summarize_eaf(eaf_file)
        results.append((eaf_file.name, tiers))
    except Exception as e:
        # Keep going: one unreadable file must not hide the others.
        results.append((eaf_file.name, f"ERROR: {e}"))

# === Display ===
for name, tiers in results:
    print(f"\n=== {name} ===")
    if isinstance(tiers, str):
        # Error string recorded above instead of a tier list.
        print("  ", tiers)
        continue
    for t in tiers:
        tname = t["tier"]
        nn = t["non_empty"]
        tot = t["annotations"]
        # Guard against division by zero for tiers without annotations.
        pct = (nn / tot * 100) if tot else 0
        print(f"  - {tname:30s} {nn:4d}/{tot:<4d} ({pct:5.1f}%) non-empty")



=== Macedonian_decata_so_zlatna_kosa_new.eaf ===
  - ref@SP1                         257/257  (100.0%) non-empty
  - rp@SP1                          109/109  (100.0%) non-empty
  - comm                              2/2    (100.0%) non-empty
  - qt@SP1                           61/61   (100.0%) non-empty
  - ft@SP1                          257/257  (100.0%) non-empty
  - lit@SP1                           0/0    (  0.0%) non-empty
  - tx@SP1                          257/257  (100.0%) non-empty
  - not@SP1                           4/4    (100.0%) non-empty
  - typ@SP1                          67/67   (100.0%) non-empty
  - mot@SP1                        1014/1014 (100.0%) non-empty
  - wps@SP1                         998/1014 ( 98.4%) non-empty
  - mb@SP1                         1319/1319 (100.0%) non-empty
  - ge@SP1                         1319/1319 (100.0%) non-empty
  - ps@SP1                         1319/1319 (100.0%) non-empty
  - par@SP1                          71/71   (100.0%) 

In [None]:
# @title Extract Cyrillic TXT per speaker from tx_cyr@SPx (fallback to tx@SPx) — robust tuples
!pip -q install pympi-ling

import re
from pathlib import Path
import pympi

SRC_DIR = Path("/content/eaf_new")           # your .eaf files
OUT_DIR = Path("/content/out_txt_cyr")       # Cyrillic txt output
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Case-insensitive tier-name patterns; group 1 captures the speaker number.
tier_re_tx_cyr = re.compile(r"^tx_cyr@SP(\d+)$", re.I)
tier_re_tx     = re.compile(r"^tx@SP(\d+)$", re.I)

def get_speaker_tiers(eaf):
    """Pick one transcription tier per speaker, preferring tx_cyr over tx.

    Returns a list of ``(tier_name, speaker_number, is_cyrillic)`` tuples
    sorted by speaker number.
    """
    tier_names = eaf.get_tier_names()
    by_speaker = {}
    # Latin tiers first: the first match per speaker is kept as the fallback.
    for tier_name in tier_names:
        hit = tier_re_tx.match(tier_name)
        if hit:
            by_speaker.setdefault(int(hit.group(1)), (tier_name, False))
    # Cyrillic tiers override any fallback registered for the same speaker.
    for tier_name in tier_names:
        hit = tier_re_tx_cyr.match(tier_name)
        if hit:
            by_speaker[int(hit.group(1))] = (tier_name, True)
    return [
        (tier_name, speaker, is_cyr)
        for speaker, (tier_name, is_cyr) in sorted(by_speaker.items())
    ]

def _normalize_ann_tuple(item):
    """
    Accepte (start, end, value) ou (ann_id, start, end, value).
    Retourne (start, end, value) avec start/end int, value str.
    Ignore les autres excentricités.
    """
    if not isinstance(item, (list, tuple)):
        return None
    if len(item) == 3:
        s, e, v = item
    elif len(item) >= 4:
        s, e, v = item[-3], item[-2], item[-1]
    else:
        return None
    try:
        s = int(s); e = int(e)
    except Exception:
        # si ce sont des time slots id plutôt que ms, on laisse tel quel pour le tri
        pass
    return s, e, "" if v is None else str(v)

def concat_intervals(eaf, tiername):
    """Return the tier's non-blank annotation texts joined by single spaces.

    Rows are normalized with ``_normalize_ann_tuple``, blank values are
    dropped, and the rest are ordered by (start, end) before joining.
    """
    rows = []
    for entry in eaf.get_annotation_data_for_tier(tiername):
        normalized = _normalize_ann_tuple(entry)
        if normalized is None:
            continue
        start, end, text = normalized
        text = text.strip()
        if text:
            rows.append((start, end, text))
    # Temporal order
    ordered = sorted(rows, key=lambda row: (row[0], row[1]))
    # One space between consecutive intervals
    return " ".join(text for _, _, text in ordered)

# Write one TXT file per (EAF file, speaker) that has non-empty text.
count_files = 0
for eaf_path in sorted(SRC_DIR.glob("*.eaf")):
    eaf = pympi.Elan.Eaf(str(eaf_path))
    # NOTE(review): is_cyr is unpacked but never used below, so the file name
    # ends in "_cyr" even when the text came from the tx@SPx fallback tier.
    for tiername, sp, is_cyr in get_speaker_tiers(eaf):
        txt = concat_intervals(eaf, tiername)
        if not txt:
            # Nothing to write for this speaker.
            continue
        stem = eaf_path.stem
        out = OUT_DIR / f"{stem}_SP{sp}_cyr.txt"
        out.write_text(txt, encoding="utf-8")
        count_files += 1
        print(f"Wrote {out.name} ({len(txt)} chars)")
print(f"\n✅ Done. Wrote {count_files} TXT file(s) to {OUT_DIR}")


In [None]:
# @title Inspect each EAF: tier overview with sample content
!pip -q install pympi-ling pandas

import pandas as pd
from pathlib import Path
import pympi

SRC_DIR = Path("/content/eaf_new")  # adjust if needed
OUT_DIR = Path("/content/eaf_tier_summary")  # one CSV summary per EAF file
OUT_DIR.mkdir(exist_ok=True)

def sample_value(annotations):
    """Return a short preview of the first non-blank annotation value.

    ``annotations`` may contain plain values or list/tuple rows. For rows with
    at least three fields the value is read from index 2, which is where both
    ``(start, end, value)`` and ``(start, end, value, refvalue)`` rows keep the
    annotation text; shorter rows fall back to their last field.

    Newlines are replaced by spaces and the preview is cut to 80 characters
    followed by "…". Returns "" when no non-blank string value is found.
    """
    for a in annotations:
        if isinstance(a, (list, tuple)):
            if len(a) >= 3:
                # Not a[-1]: on 4-field rows the last field is the parent's
                # text, not this annotation's own value.
                val = a[2]
            else:
                val = a[-1] if a else ""
        else:
            val = a
        if isinstance(val, str) and val.strip():
            txt = val.strip().replace("\n", " ")
            return (txt[:80] + "…") if len(txt) > 80 else txt
    return ""

def summarize_eaf_tiers(eaf_path):
    """Build a per-tier summary DataFrame for one EAF file.

    Columns: "tier", "annotations" (total rows), "non_empty" (rows whose
    value is non-blank) and "sample" (preview of the first non-blank value).
    Rows are sorted by tier name. A file without tiers yields an empty frame
    with the same columns.
    """
    eaf = pympi.Elan.Eaf(str(eaf_path))
    data = []
    for tier in eaf.get_tier_names():
        anns = eaf.get_annotation_data_for_tier(tier)
        # Read the value from index 2, not a[-1]: reference-tier rows carry a
        # fourth field holding the parent's text.
        values = [a[2] for a in anns if isinstance(a, (list, tuple)) and len(a) >= 3]
        non_empty = [v for v in values if v and str(v).strip()]
        data.append({
            "tier": tier,
            "annotations": len(anns),
            "non_empty": len(non_empty),
            # Plain values are passed so the preview is taken from the same
            # field that was counted.
            "sample": sample_value(non_empty),
        })
    # Explicit columns keep sort_values("tier") valid when `data` is empty.
    df = pd.DataFrame(data, columns=["tier", "annotations", "non_empty", "sample"])
    df = df.sort_values("tier")
    return df

# Summarize every EAF file, keep the DataFrame in memory (keyed by file name)
# and save it as "<stem>_tiers.csv" in OUT_DIR.
summaries = {}
for eaf_path in sorted(SRC_DIR.glob("*.eaf")):
    df = summarize_eaf_tiers(eaf_path)
    summaries[eaf_path.name] = df
    out_csv = OUT_DIR / f"{eaf_path.stem}_tiers.csv"
    df.to_csv(out_csv, index=False, encoding="utf-8")
    print(f"✅ {eaf_path.name}: {len(df)} tiers → {out_csv.name}")

print(f"\nAll CSV summaries saved in: {OUT_DIR}")


In [None]:
# @title Inspect EAF tier hierarchy (works with tuple-structured tiers)
!pip -q install pympi-ling
from pathlib import Path
import pympi

# Single file inspected by this cell.
path = Path("/content/eaf_new/Macedonian_decata_so_zlatna_kosa_new.eaf")
eaf = pympi.Elan.Eaf(str(path))

# Table header for the hierarchy listing printed further down.
print(f"File: {path.name}\n")
print(f"{'Tier':35s} {'Parent':20s} {'Children'}")
print("-" * 80)

# extraction prudente de PARENT_REF
def safe_parent(info):
    """Return the PARENT_REF stored in a tier record, or None.

    ``info`` may be a dict (looked up directly) or a list/tuple, in which case
    the first contained dict that has a "PARENT_REF" key supplies the value.
    Any other input yields None.
    """
    if isinstance(info, dict):
        return info.get("PARENT_REF")
    if not isinstance(info, (list, tuple)):
        return None
    candidates = (
        part["PARENT_REF"]
        for part in info
        if isinstance(part, dict) and "PARENT_REF" in part
    )
    return next(candidates, None)

# Build the child → parent and parent → children maps
child_map = {}
parent_map = {}
for t, info in eaf.tiers.items():
    parent = safe_parent(info)
    if parent:
        # Tiers without a (truthy) parent are left out of both maps.
        parent_map[t] = parent
        child_map.setdefault(parent, []).append(t)

# Hierarchical display: one line per tier, "-" where there is no parent/child
for t in eaf.get_tier_names():
    parent = parent_map.get(t, "-")
    children = ", ".join(child_map.get(t, [])) or "-"
    print(f"{t:35s} {parent:20s} {children}")


In [None]:
# @title Debug tier names in one EAF
from pathlib import Path
import pympi

# Single file inspected by this cell.
eaf_path = Path("/content/eaf_new/Macedonian_decata_so_zlatna_kosa_new.eaf")
eaf = pympi.Elan.Eaf(str(eaf_path))
print("Tier names in this file:\n")
for t in eaf.get_tier_names():
    anns = eaf.get_annotation_data_for_tier(t)
    # The annotation text is at index 2; a[-1] would be the parent's text on
    # the 4-field rows returned for reference tiers.
    non_empty = [
        a for a in anns
        if isinstance(a, (list, tuple)) and len(a) >= 3 and str(a[2]).strip()
    ]
    # str() keeps the slice valid even if the stored value is not a string.
    sample = (str(non_empty[0][2]) if non_empty else "")[:60]
    print(f"{t:30s} | annots={len(anns):4d} | non_empty={len(non_empty):4d} | sample={sample}")


In [None]:
# @title Compare _new vs original EAF tier content
from pathlib import Path
import pympi

base = Path("/content/eaf_new")  # folder where you put your .eaf files

# pick a tale we know:
stem = "Macedonian_decata_so_zlatna_kosa"
# Both files are looked up in the same folder; only the "_new" suffix differs.
orig = base / f"{stem}.eaf"
new  = base / f"{stem}_new.eaf"

def tier_sample(path):
    """Map each tier with content to a preview of its first non-blank value.

    Returns ``{tier_name: first non-blank value cut to 60 chars}``; tiers
    without any non-blank value are omitted.
    """
    eaf = pympi.Elan.Eaf(str(path))
    data = {}
    for t in eaf.get_tier_names():
        for a in eaf.get_annotation_data_for_tier(t):
            # The annotation text is at index 2; a[-1] would be the parent's
            # text on the 4-field rows returned for reference tiers.
            if isinstance(a, (list, tuple)) and len(a) >= 3 and str(a[2]).strip():
                data[t] = str(a[2])[:60]
                break
    return data

def diff_eaf(a, b):
    """Print the per-tier sample values of file ``a`` and then of file ``b``.

    Each file gets a "--- name ---" header followed by one "tier: sample" line
    per tier, sorted by tier name; a blank line separates the two listings.
    """
    for lead, eaf_file in (("", a), ("\n", b)):
        print(f"{lead}--- {eaf_file.name} ---")
        samples = tier_sample(eaf_file)
        for tier_id, preview in sorted(samples.items()):
            print(f"{tier_id:25s}: {preview}")

diff_eaf(orig, new)


In [25]:
from pathlib import Path
import xml.etree.ElementTree as ET

# Inspect one original EAF directly as XML, without pympi.
sample = Path("/content/eaf_orig/Macedonian_dva_braka.eaf")
print(f"Inspecting {sample.name}")

tree = ET.parse(sample)
root = tree.getroot()

# findall without a namespace prefix
tiers = root.findall(".//TIER")

print(f"Found {len(tiers)} tiers\n")

for tier in tiers:
    tid = tier.attrib.get("TIER_ID", "")
    ling = tier.attrib.get("LINGUISTIC_TYPE_REF", "")
    # Non-blank annotation texts of this tier, in document order.
    vals = [v.text.strip() for v in tier.findall(".//ANNOTATION_VALUE") if v.text and v.text.strip()]
    # Show at most the first three values as a preview.
    preview = " | ".join(vals[:3])
    print(f"{tid:25s} ({ling:10s})  →  {len(vals)} vals  |  {preview}")


Inspecting Macedonian_dva_braka.eaf
Found 30 tiers

ref@SP1                   (ref       )  →  72 vals  |  Macedonian_dva_braka.01 | Macedonian_dva_braka.02 | Macedonian_dva_braka.04
rp@SP1                    (rp        )  →  62 vals  |  DR Event | Discourse Report | DR Event
comm                      (ref       )  →  6 vals  |  DR Event + DR + Quot + DR | the actual report doesn't come until later | vaka i vaka = so and so (pronoun for reported speech)
qt@SP1                    (qt        )  →  31 vals  |  DR Event + Discourse Report | Other | DR Event + Discourse Report
ref@SP2                   (ref       )  →  1 vals  |  Macedonian_dva_braka.03
rp@SP2                    (rp        )  →  0 vals  |  
qt@SP2                    (qt        )  →  0 vals  |  
comm@SP2                  (ref       )  →  0 vals  |  
ft@SP1                    (ft        )  →  71 vals  |  Shall I start? | Should I say again I am grandma Mare? | Yеs.
lit@SP1                   (lit       )  →  2 vals  |  When he

In [27]:
# @title Extract only tx@SPx and tx_cyr@SPx text (in order of appearance)
from pathlib import Path
import xml.etree.ElementTree as ET
import re
import shutil

SRC_DIR = Path("/content/eaf_orig")  # folder with the original files
OUT_DIR = Path("/content/out_txt_cyr")

# ⚠️ Empty the output folder before writing the new files
if OUT_DIR.exists():
    shutil.rmtree(OUT_DIR)
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Matches tx@SP<n> and tx_cyr@SP<n> tier ids, case-insensitively.
tier_pattern = re.compile(r"^(tx(_cyr)?@SP\d+)$", re.I)

written = 0

for eaf_file in sorted(SRC_DIR.glob("*.eaf")):
    root = ET.parse(eaf_file).getroot()
    for tier in root.findall(".//TIER"):
        tid = tier.attrib.get("TIER_ID", "")
        if not tier_pattern.match(tid):
            continue
        # Non-blank annotation texts of this tier, in document order.
        vals = [v.text.strip() for v in tier.findall(".//ANNOTATION_VALUE") if v.text and v.text.strip()]
        if not vals:
            continue
        txt = " ".join(vals)
        # File suffix tells Cyrillic tiers apart from Latin ones.
        suffix = "cyr" if "tx_cyr" in tid.lower() else "lat"
        out = OUT_DIR / f"{eaf_file.stem}_{tid}_{suffix}.txt"
        out.write_text(txt + "\n", encoding="utf-8")
        written += 1
        print(f"Wrote {out.name} ({len(txt)} chars, {len(vals)} segments)")

print(f"\n✅ Done. Wrote {written} files in {OUT_DIR}")


Wrote Macedonian_decata_so_zlatna_kosa_tx@SP1_lat.txt (5189 chars, 257 segments)
Wrote Macedonian_decata_so_zlatna_kosa_tx_cyr@SP1_cyr.txt (5170 chars, 257 segments)
Wrote Macedonian_duhot_od_grobot_tx@SP1_lat.txt (5600 chars, 196 segments)
Wrote Macedonian_duhot_od_grobot_tx_cyr@SP1_cyr.txt (5573 chars, 196 segments)
Wrote Macedonian_dva_braka_tx@SP1_lat.txt (2027 chars, 71 segments)
Wrote Macedonian_dva_braka_tx@SP2_lat.txt (17 chars, 1 segments)
Wrote Macedonian_dva_braka_tx_cyr@SP1_cyr.txt (2026 chars, 71 segments)
Wrote Macedonian_dva_braka_tx_cyr@SP2_cyr.txt (17 chars, 1 segments)
Wrote Macedonian_itar_pejo_tx@SP1_lat.txt (1297 chars, 45 segments)
Wrote Macedonian_itar_pejo_tx@SP2_lat.txt (41 chars, 2 segments)
Wrote Macedonian_itar_pejo_tx_cyr@SP1_cyr.txt (1295 chars, 45 segments)
Wrote Macedonian_itar_pejo_tx_cyr@SP2_cyr.txt (41 chars, 2 segments)
Wrote Macedonian_kralevic_marko_tx@SP1_lat.txt (845 chars, 30 segments)
Wrote Macedonian_kralevic_marko_tx@SP2_lat.txt (23 chars, 1 

In [None]:
# @title Convert Cyrillic TXT → Spanish-like proxy (mk→es)
import re, unicodedata
from pathlib import Path

IN_DIR  = Path("/content/out_txt_cyr")    # TXT files to convert
OUT_DIR = Path("/content/out_txt_proxy")  # converted proxy TXT files
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- Cyrillic → basic Latin (Macedonian) ---
# Upper- and lower-case Cyrillic letters both map to lower-case Latin.
MK_CYR_TO_LAT = {
    "Ќ":"ḱ","ќ":"ḱ","Ѓ":"ǵ","ѓ":"ǵ","Ж":"ž","ж":"ž","З":"z","з":"z","Ѕ":"dz","ѕ":"dz",
    "Ч":"č","ч":"č","Џ":"dž","џ":"dž","Ш":"š","ш":"š","Ј":"j","ј":"j","Љ":"lj","љ":"lj",
    "Њ":"nj","њ":"nj","А":"a","а":"a","Б":"b","б":"b","В":"v","в":"v","Г":"g","г":"g",
    "Д":"d","д":"d","Е":"e","е":"e","И":"i","и":"i","К":"k","к":"k","Л":"l","л":"l",
    "М":"m","м":"m","Н":"n","н":"n","О":"o","о":"o","П":"p","п":"p","Р":"r","р":"r",
    "С":"s","с":"s","Т":"t","т":"t","У":"u","у":"u","Ф":"f","ф":"f","Х":"h","х":"h",
    # Ц/ц belongs to the Macedonian alphabet and would otherwise stay Cyrillic.
    "Ц":"c","ц":"c",
    # Precomposed accented Ѐ/ѐ and Ѝ/ѝ, written as escapes to pin the code points.
    "\u0400":"e","\u0450":"e","\u040d":"i","\u045d":"i",
}
def mk_cyr_to_basic_latin(text:str)->str:
    """Transliterate Macedonian Cyrillic letters in ``text`` to basic Latin.

    Characters without an entry in ``MK_CYR_TO_LAT`` (Latin letters, digits,
    spaces, punctuation, ...) are copied through unchanged.
    """
    return "".join(MK_CYR_TO_LAT.get(ch, ch) for ch in text)

# --- Spanish proxy rules (with z→s and dz→ds as requested) ---
VOWELS = "aeiouáéíóúüAEIOUÁÉÍÓÚÜ"


def normalize_ws(t):
    """Collapse every whitespace run to one space and trim both ends."""
    collapsed = re.sub(r"\s+", " ", t)
    return collapsed.strip()


def strip_punct(t):
    """Drop quotation marks, turn dashes/ellipses/punctuation into spaces.

    The result is whitespace-normalized with ``normalize_ws``.
    """
    substitutions = (
        (r"[„“”«»\"“”]", ""),           # quotation marks are removed outright
        (r"[—–…]", " "),                # dashes and ellipsis become a space
        (r"[.,;:!?()\[\]{}/\\]", " "),  # remaining punctuation becomes a space
    )
    for pattern, replacement in substitutions:
        t = re.sub(pattern, replacement, t)
    return normalize_ws(t)

def apply_spanish_proxy_rules(text:str)->str:
    """Rewrite basic-Latin Macedonian text with Spanish-like spellings.

    The rules are applied in a fixed order, because later rules rely on the
    output of earlier ones (e.g. "dž" and "dz" are handled before plain "z").
    The result is whitespace-normalized with ``normalize_ws``.
    """
    t = text
    # palatals
    t = t.replace("ḱ", "ky")
    t = t.replace("ǵ", "y")
    # affricates and sibilants
    t = t.replace("dž", "y")     # /dʒ/
    t = t.replace("dz", "ds")    # requested constraint
    t = t.replace("č", "ch")
    t = t.replace("š", "s")
    t = t.replace("ž", "y")
    # z → s: announced next to dz→ds in the rules header but previously not
    # applied; runs after the dž/dz/ž rules so those digraphs are consumed first
    t = t.replace("z", "s")
    # lj, nj, j (a plain replace already covers the word-initial case)
    t = t.replace("lj", "ll")
    t = t.replace("nj", "ñ")
    t = t.replace("j","y")
    # h (from х) → j at word start before a vowel, and between vowels
    t = re.sub(r"\b[hH](?=[%s])" % VOWELS, "j", t)
    t = re.sub(r"(?<=[%s])[hH](?=[%s])" % (VOWELS,VOWELS), "j", t)
    # k/g before e,i → qu/gu (at a word boundary or after a non-ASCII-letter)
    t = re.sub(r"\bk(?=[eiéí])", "qu", t); t = re.sub(r"(?<=[^a-zA-Z])k(?=[eiéí])", "qu", t)
    t = re.sub(r"\bg(?=[eiéí])", "gu", t); t = re.sub(r"(?<=[^a-zA-Z])g(?=[eiéí])", "gu", t)
    # remaining k in those same positions → c (optional, more "Spanish")
    t = re.sub(r"\bk", "c", t);  t = re.sub(r"(?<=[^a-zA-Z])k", "c", t)
    return normalize_ws(t)

def mk_to_es_proxy(text:str)->str:
    """Convert Macedonian Cyrillic text to its Spanish-like proxy spelling.

    Pipeline: strip punctuation, transliterate Cyrillic to basic Latin, then
    apply the Spanish proxy rules.
    """
    without_punct = strip_punct(text)
    latin = mk_cyr_to_basic_latin(without_punct)
    return apply_spanish_proxy_rules(latin)

# Convert every TXT file of IN_DIR and write the result to OUT_DIR.
written = 0
# NOTE(review): "*.txt" matches every TXT file in IN_DIR, not only "*_cyr.txt".
for f in sorted(IN_DIR.glob("*.txt")):
    # errors="ignore" silently drops bytes that are not valid UTF-8.
    raw = f.read_text(encoding="utf-8", errors="ignore")
    prox = mk_to_es_proxy(raw)
    # Names that do not end in "_cyr.txt" are left unchanged by the replace.
    out = OUT_DIR / f.name.replace("_cyr.txt", "_proxy_es.txt")
    out.write_text(prox + "\n", encoding="utf-8")
    written += 1
    print(f"Wrote {out.name} ({len(prox)} chars)")
print(f"\n✅ Done. Wrote {written} proxy TXT file(s) to {OUT_DIR}")
