In [8]:
import os
import re
from shutil import copyfile

def extract_entities(txt_file, ann_file):
    # Read the txt file
    with open(txt_file, 'r', encoding='utf-8') as f:
        txt_content = f.read()
    
    # Read the ann file
    with open(ann_file, 'r', encoding='utf-8') as f:
        ann_lines = f.readlines()
    
    # Extract the entities and their ranges from the ann file
    entities = []
    for line in ann_lines:
        parts = line.split("\t")
        entity_type = parts[1]
        start, end = map(int, parts[2].split())
        entity_text = txt_content[start:end]
        entities.append((entity_type, entity_text, start, end))
    
    # Sort entities by their start position in the text to maintain order
    entities.sort(key=lambda x: x[2])
    
    # Create the modified txt content with only the entities
    modified_txt_content = " ".join([entity[1] for entity in entities])

    return modified_txt_content, entities

def update_ann_file(ann_file, entities, modified_txt_content):
    # Create a new .ann file with updated indices
    new_ann_lines = []
    new_start = 0
    
    for entity_type, entity_text, old_start, old_end in entities:
        # Find the new start and end indices based on the modified text
        new_end = new_start + len(entity_text)
        new_ann_lines.append(f"T{len(new_ann_lines)+1}\t{entity_type} {new_start} {new_end}\t{entity_text}\n")
        new_start = new_end + 1  # add a space between entities in the modified text
    
    return new_ann_lines

def process_files(txt_file, ann_file, output_txt_dir, output_ann_dir):
    # Extract entities and modified text
    modified_txt_content, entities = extract_entities(txt_file, ann_file)
    
    # Create the new .txt filename in the output directory
    output_txt_file = os.path.join(output_txt_dir, os.path.basename(txt_file))
    with open(output_txt_file, 'w', encoding='utf-8') as f:
        f.write(modified_txt_content)

    # Update the .ann file and save it
    new_ann_lines = update_ann_file(ann_file, entities, modified_txt_content)
    output_ann_file = os.path.join(output_ann_dir, os.path.basename(ann_file))
    with open(output_ann_file, 'w', encoding='utf-8') as f:
        f.writelines(new_ann_lines)

def main(input_txt_dir, input_ann_dir, output_txt_dir, output_ann_dir):
    # Ensure output directories exist
    os.makedirs(output_txt_dir, exist_ok=True)
    os.makedirs(output_ann_dir, exist_ok=True)
    
    # Iterate through .txt files and process the corresponding .ann files
    for txt_filename in os.listdir(input_txt_dir):
        try:
            if txt_filename.endswith('.txt'):
                txt_file = os.path.join(input_txt_dir, txt_filename)
                ann_file = os.path.join(input_ann_dir, txt_filename.replace('.txt', '.ann'))
                
                # Check if the corresponding .ann file exists
                if os.path.exists(ann_file):
                    process_files(txt_file, ann_file, output_txt_dir, output_ann_dir)
                    print(f"Processed: {txt_filename}")
                else:
                    print(f"Warning: No corresponding .ann file for {txt_filename}")
        except Exception as e:
            print(e, "::::", txt_file)

In [6]:
len("начальниця управління масових і спортивних заходів департаменту громадської безпеки ")

84

In [None]:
# Define your input and output directories
input_txt_dir = '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng_changed'  # Replace with the path to your .txt files directory
input_ann_dir = '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/ng_changed'  # Replace with the path to your .ann files directory
output_txt_dir = '/Users/linndfors/study/diploma/ner_for_fem/src/ng_changed'  # Replace with the path to your output .txt files directory
output_ann_dir = '/Users/linndfors/study/diploma/ner_for_fem/src/ng_changed'  # Replace with the path to your output .ann files directory

# Run the main function
main(input_txt_dir, input_ann_dir, output_txt_dir, output_ann_dir)

Processed: 8e66b8ea5086_1.txt
Processed: 26c531a355cc_1.txt
Processed: 09e1dccb8839_1.txt
Processed: 3dfca2b9b104_1.txt
Processed: 4bdc94abaa1c_1.txt
Processed: cc9eda5dc1f3_1.txt
Processed: 3b285211209c_1.txt
Processed: 78153c6d7c59_1.txt
Processed: 07c51f631d18_1.txt
Processed: fe12a985f7cd_1.txt
Processed: d6ca19c4065e_1.txt
Processed: 87deabcc8f95_1.txt
Processed: 5a43160b965d_1.txt
Processed: d7e523681acc_1.txt
Processed: 258d2fd0d2fe_1.txt
Processed: 52a055482925_1.txt
Processed: 29ee46b3128e_1.txt
Processed: 9690987f7b71_1.txt
Processed: 5e407e3ddb68_1.txt
Processed: 5d3d7e0d5bae_1.txt
Processed: 4ea65724096f_1.txt
Processed: 47713afd9490_1.txt
Processed: 1b68c2c76541_1.txt
Processed: 2980d5c34788_1.txt
Processed: 8c6db2873e45_1.txt
Processed: 19a095d6ed15_1.txt
Processed: 236f916f3d6f_1.txt
Processed: 46b3a16d4154_1.txt
Processed: fc3221dac69d_1.txt
Processed: 16fb31bf76b8_1.txt
Processed: 104263660695_1.txt
Processed: c97bff36f45a_1.txt
Processed: 0f0e96425eed_1.txt
Processed:

In [10]:
# Define your input and output directories
input_txt_dir = '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk_changed'  # Replace with the path to your .txt files directory
input_ann_dir = '/Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk_changed'  # Replace with the path to your .ann files directory
output_txt_dir = '/Users/linndfors/study/diploma/ner_for_fem/src/bruk_changed'  # Replace with the path to your output .txt files directory
output_ann_dir = '/Users/linndfors/study/diploma/ner_for_fem/src/bruk_changed'  # Replace with the path to your output .ann files directory

# Run the main function
main(input_txt_dir, input_ann_dir, output_txt_dir, output_ann_dir)

Processed: 09d6a57532b6_1.txt
Processed: bf4968831cc7_1.txt
Processed: bc755153f4f7_1.txt
Processed: 1ed3fef56c8f_1.txt
Processed: dac0de834f47_1.txt
Processed: 27a1722d517c_1.txt
Processed: 5a0aedfe0f5d_1.txt
Processed: dc1202fd9850_1.txt
Processed: 4b1efc986bb8_1.txt
Processed: b576ac602665_1.txt
Processed: e3c7064319e0_1.txt
Processed: 10a2da5c514c_1.txt
Processed: efc2665e4bed_1.txt
Processed: 2fc890744f7c_1.txt
Processed: e5e76a8efa0f_1.txt
Processed: 17fbae84faea_1.txt
Processed: 15238b87db03_1.txt
Processed: b019e9b8d51d_1.txt
Processed: d07dfb774c28_1.txt
Processed: 5710814a4a84_1.txt
Processed: f495f41dfa02_1.txt
list index out of range :::: /Users/linndfors/study/diploma/ner_for_fem/data/DATA_FOR_BALANCED_NER/bruk_changed/17d3d678df81_1.txt
Processed: e023d6d6f1a3_1.txt
Processed: 548f66f38090_1.txt
Processed: a196488351f6_1.txt
Processed: c0c436cc9cb6_1.txt
Processed: 232be2c6b5c2_1.txt
Processed: 1c325fde9ed4_1.txt
Processed: 2298b6b41bec_1.txt
Processed: a1a77d3d5b33_1.txt