<a href="https://colab.research.google.com/github/lcontrerasroa/macedonian/blob/main/notebooks/EAF2MAUS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# @title Fetch .eaf files directly from GitHub
import os
from pathlib import Path

# ton dépôt (change si besoin)
REPO_URL = "https://github.com/lcontrerasroa/macedonian.git"
TARGET_DIR = Path("/content/eaf_new")

if not TARGET_DIR.exists():
    os.system(f"git clone --depth 1 {REPO_URL} /content/macedonian")
    os.makedirs(TARGET_DIR, exist_ok=True)
    # copie uniquement les .eaf du dossier data/eaf_new dans eaf_new
    os.system("cp /content/macedonian/data/eaf_new/*.eaf /content/eaf_new/")
    print("✅ .eaf files imported from GitHub into /content/eaf_new")
else:
    print("Folder /content/eaf_new already exists — skipping clone.")

# vérification
!ls /content/eaf_new


Folder /content/eaf_new already exists — skipping clone.
new_only


In [6]:
# @title Inspect the internal tier structure of all EAF files
!pip -q install pympi-ling

from pathlib import Path
import pympi
import re

# === Paths ===
root = Path("/content/eaf_new")  # dossier contenant les *_new.eaf
assert root.exists(), "The folder /content/eaf_new doesn't exist. Upload or unzip your .eaf files first."

# === Function to summarize one EAF ===
def summarize_eaf(path):
    eaf = pympi.Elan.Eaf(str(path))
    tiers = eaf.get_tier_names()
    info = []
    for t in tiers:
        anns = eaf.get_annotation_data_for_tier(t)
        non_empty = [a for a in anns if a[2] and str(a[2]).strip()]
        info.append({
            "tier": t,
            "annotations": len(anns),
            "non_empty": len(non_empty)
        })
    return info

# === Loop over all .eaf files ===
results = []
for eaf_file in sorted(root.glob("*.eaf")):
    try:
        tiers = summarize_eaf(eaf_file)
        results.append((eaf_file.name, tiers))
    except Exception as e:
        results.append((eaf_file.name, f"ERROR: {e}"))

# === Display ===
for name, tiers in results:
    print(f"\n=== {name} ===")
    if isinstance(tiers, str):
        print("  ", tiers)
        continue
    for t in tiers:
        tname = t["tier"]
        nn = t["non_empty"]
        tot = t["annotations"]
        pct = (nn / tot * 100) if tot else 0
        print(f"  - {tname:30s} {nn:4d}/{tot:<4d} ({pct:5.1f}%) non-empty")



=== Macedonian_decata_so_zlatna_kosa_new.eaf ===
  - ref@SP1                         257/257  (100.0%) non-empty
  - rp@SP1                          109/109  (100.0%) non-empty
  - comm                              2/2    (100.0%) non-empty
  - qt@SP1                           61/61   (100.0%) non-empty
  - ft@SP1                          257/257  (100.0%) non-empty
  - lit@SP1                           0/0    (  0.0%) non-empty
  - tx@SP1                          257/257  (100.0%) non-empty
  - not@SP1                           4/4    (100.0%) non-empty
  - typ@SP1                          67/67   (100.0%) non-empty
  - mot@SP1                        1014/1014 (100.0%) non-empty
  - wps@SP1                         998/1014 ( 98.4%) non-empty
  - mb@SP1                         1319/1319 (100.0%) non-empty
  - ge@SP1                         1319/1319 (100.0%) non-empty
  - ps@SP1                         1319/1319 (100.0%) non-empty
  - par@SP1                          71/71   (100.0%) 