In [1]:
def check_ann_indices(txt_path, ann_path):
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()

    with open(ann_path, "r", encoding="utf-8") as f:
        annotations = f.readlines()

    has_errors = False

    for line in annotations:
        if not line.startswith("T"):
            continue  # skip non-entity lines
        parts = line.strip().split("\t")
        if len(parts) != 4:
            print(f"Invalid line format: {line}")
            continue

        tid, tag_info, inds, phrase = parts
        # tag_parts = tag_info.split()
        # if len(tag_parts) < 5:
        #     print(f"Invalid tag info: {tag_info}")
        #     continue

        label = phrase
        start = int(inds.split(" ")[0])
        end = int(inds.split(" ")[1])
        extracted = text[start:end]

        if extracted != phrase:
            has_errors = True
            print(f"❌ Mismatch in {tid} ({label}):")
            print(f"  Annotated: '{phrase}'")
            print(f"  Extracted : '{extracted}'")
            print(f"  Location  : {start}-{end}\n")
        else:
            print(extracted, phrase)

    if not has_errors:
        print("✅ All annotation spans match the text.")

# Example usage:

In [14]:
check_ann_indices("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped/data/bruk/0ac5140eb732-swapped.txt", "/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped/data/bruk/0ac5140eb732-swapped.ann")

докторка докторка
Анґер Анґер
Марининими Марининими
Докторки Докторки
Олексій Олексій
Аделя Аделя
докторкою докторкою
Анґер Анґер
докторка докторка
✅ All annotation spans match the text.


In [None]:
import os

def check_all_annotations_in_dir(directory):
    files = os.listdir(directory)
    txt_files = [f for f in files if f.endswith(".txt")]

    for txt_file in txt_files:
        base_name = os.path.splitext(txt_file)[0]
        ann_file = base_name + ".ann"
        txt_path = os.path.join(directory, txt_file)
        ann_path = os.path.join(directory, ann_file)

        if not os.path.exists(ann_path):
            print(f"⚠️ Missing annotation file for {txt_file}")
            continue

        # print(f"\n📄 Checking: {txt_file} & {ann_file}")
        check_ann_indices(txt_path, ann_path)

def check_ann_indices(txt_path, ann_path):
    with open(txt_path, "r", encoding="utf-8") as f:
        text = f.read()

    with open(ann_path, "r", encoding="utf-8") as f:
        annotations = f.readlines()

    has_errors = False

    for line in annotations:
        if not line.startswith("T"):
            continue  # skip non-entity lines
        parts = line.strip().split("\t")
        if len(parts) != 5:
            print(txt_path)
            print(f"Invalid line format: {line}")
            continue

        tid, tag_info, start, end, phrase = parts
        start = int(start)
        end = int(end)
        # tag_parts = tag_info.split()
        # if len(tag_parts) < 5:
        #     print(f"Invalid tag info: {tag_info}")
        #     continue

        label = phrase
        extracted = text[start:end]

        if extracted != phrase:
            has_errors = True
            print(f"❌ Mismatch in {tid} ({label}):")
            print(f"  Annotated: '{phrase}'")
            print(f"  Extracted : '{extracted}'")
            print(f"  Location  : {start}-{end}\n")

    # if not has_errors:
    #     print("✅ All annotation spans match the text.")

# Example usage:


In [26]:
check_all_annotations_in_dir("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/bruk")


3
Invalid line format: T19	JOB	1017 1024 писарки

3
Invalid line format: T1	JOB 41 51	мікологині

3
Invalid line format: T2	DATE 170 178	1859 р .

3
Invalid line format: T3	JOB 198 207	ботанічка

3
Invalid line format: T4	JOB 211 221	мікологиня

3
Invalid line format: T5	PERS 222 233	А . де Барі

3
Invalid line format: T9	ORG	600 638 Інституту української мови НАН України



In [32]:
check_all_annotations_in_dir("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped_filtering/data/ng")


In [44]:
import os
import re

def fix_ann_file(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    fixed_lines = []
    for line in lines:
        # Match lines in the format: T# LABEL START END PHRASE
        parts = line.strip().split("\t")

        # print(parts)
        # print(len(parts))
        
        if len(parts) == 4:
            t, lab, inds, phr = parts
            # parts[2] is the start and end indices, e.g., "242 247"
            # indices = parts[2]
            start_ind, end_ind = inds.split(" ")
            # Replace space with tab between indices
            # parts[2] = indices.replace(" ", "\t")
            fixed_line = "\t".join([t, lab, start_ind, end_ind, phr])
            # print(fixed_line)
            fixed_lines.append(fixed_line)
        else:
            fixed_lines.append(line)

    with open(file_path, "w", encoding="utf-8") as f:
        for fixed_line in fixed_lines:
            f.write(fixed_line + "\n")

    print(f"✅ Fixed: {os.path.basename(file_path)}")

def fix_all_ann_files_in_dir(directory):
    for file_name in os.listdir(directory):
        if file_name.endswith(".ann"):
            fix_ann_file(os.path.join(directory, file_name))

# Example usage:
# fix_all_ann_files_in_dir("/path/to/your/ann_folder")


In [46]:
fix_all_ann_files_in_dir("/Users/linndfors/study/diploma/ner_for_fem/data/v2.0-swapped/data/ng")

✅ Fixed: 989b35bbc2b6-swapped.ann
✅ Fixed: 5e33850771e3-swapped.ann
✅ Fixed: 1c48b2f37af3-swapped.ann
✅ Fixed: 0050229d8534-swapped.ann
✅ Fixed: 150446f83aa2-swapped.ann
✅ Fixed: 00edded01d7f-swapped.ann
✅ Fixed: f50037706d0a-swapped.ann
✅ Fixed: 6d47a8c4d755-swapped.ann
✅ Fixed: 5e407e3ddb68-swapped.ann
✅ Fixed: 0e5456794c26-swapped.ann
✅ Fixed: cc9eda5dc1f3-swapped.ann
✅ Fixed: ac13bf912fcb-swapped.ann
✅ Fixed: 432e298a1354-swapped.ann
✅ Fixed: 23fed9919583-swapped.ann
✅ Fixed: 17f81040ab50-swapped.ann
✅ Fixed: 176e63516fab-swapped.ann
✅ Fixed: c446d7d1cea2-swapped.ann
✅ Fixed: f62160cb3ef6-swapped.ann
✅ Fixed: 8e66b8ea5086-swapped.ann
✅ Fixed: f8ce14d569d1-swapped.ann
✅ Fixed: 4ea65724096f-swapped.ann
✅ Fixed: e2e03aba8da3-swapped.ann
✅ Fixed: fc3221dac69d-swapped.ann
✅ Fixed: 1e8b049462e1-swapped.ann
✅ Fixed: de4c2d20b24b-swapped.ann
✅ Fixed: 7982679365fb-swapped.ann
✅ Fixed: d81632a877d7-swapped.ann
✅ Fixed: d4a86a1591b4-swapped.ann
✅ Fixed: 1e9389b05e86-swapped.ann
✅ Fixed: bdb88