In [1]:
import os
import string
from typing import List, Tuple, Dict

In [2]:
def read_text_file(file_path: str) -> str:
    """Return the entire contents of the file at ``file_path`` as one string.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

In [3]:
def tokenize_text(text: str) -> List[str]:
    """Split ``text`` into whitespace-separated words, trimming trailing punctuation.

    The text is walked line by line (split on '\n') and each line is split on
    runs of whitespace. Every character of ``string.punctuation`` found at the
    end of a word is removed; leading and inner punctuation is left alone.

    A word made up solely of punctuation is reduced to the empty string and is
    still emitted, so the result always holds exactly one entry per
    whitespace-separated word of the input.
    """
    return [
        raw_word.rstrip(string.punctuation)
        for line in text.split('\n')
        for raw_word in line.split()
    ]

In [4]:
def get_start_end_range_to_token_index(
    text: str,
    tokens: List[str],
    entity_ranges: List[Tuple[str, int, int]],
) -> Dict[Tuple[int, int], List[int]]:
    """Map each entity character range to the indices of the tokens inside it.

    Tokens are located in ``text`` one after another with ``str.find``, each
    search starting where the previous located token ended. A token belongs to
    an entity when its character span lies entirely within the entity's
    ``[start, end]`` span.

    Args:
        text: The text the tokens were taken from.
        tokens: Tokens in the order they occur in ``text``.
        entity_ranges: ``(label, start, end)`` triples; only ``start`` and
            ``end`` are used here.

    Returns:
        A dict keyed by ``(start, end)`` whose values are the indices (into
        ``tokens``) of the contained tokens, in token order. Ranges that
        contain no token are absent. An index is appended once per matching
        entry of ``entity_ranges``, so a range listed twice yields repeated
        indices.
    """
    start_end_range_to_token_index = {}
    # Character offset in ``text`` just past the most recently located token.
    current_pos = 0
    for i, token in enumerate(tokens):
        token_start = text.find(token, current_pos)
        if token_start == -1:
            # str.find reports "not found" as -1. Using that as an offset would
            # produce a bogus span and drag the cursor backwards (or past the
            # end of the text), misplacing every later token. Skip the token
            # and leave the cursor where it is.
            continue
        token_end = token_start + len(token)
        current_pos = token_end
        for _label, start, end in entity_ranges:
            # The token must be fully covered by the entity span.
            if start <= token_start and token_end <= end:
                start_end_range_to_token_index.setdefault((start, end), []).append(i)
    return start_end_range_to_token_index


In [5]:
def read_annotation_file(file_path: str, selected_entities: List[str]) -> List[Tuple[str, List[Tuple[int,int]]]]:
    """Read the text-bound annotations of a tab-separated annotation file.

    Only lines whose first column starts with 'T' are considered. Their second
    column is expected to look like ``"<label> <start> <end>"`` or, for an
    annotation made of several fragments, ``"<label> <s1> <e1>;<s2> <e2>..."``.
    The third column (the annotated text) is not needed and may be missing.

    Args:
        file_path: Path of the annotation file (read as UTF-8).
        selected_entities: Labels to keep; annotations with any other label
            are ignored.

    Returns:
        A list of ``(label, [(start, end), ...])`` tuples sorted by the start,
        then the end, of each annotation's first fragment.

    Raises:
        IndexError, ValueError: If a selected annotation has missing or
            non-integer offsets.
    """
    entity_ranges = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            # Need at least the id column and the "label offsets" column.
            # Blank lines and non-'T' annotation lines are skipped here.
            if len(fields) < 2 or not fields[0].startswith('T'):
                continue
            label_and_offsets = fields[1].split()
            if not label_and_offsets:
                # Empty second column: nothing to extract from this line.
                continue
            label = label_and_offsets[0]
            if label not in selected_entities:
                continue
            ranges = []
            # Re-join the offsets so that fragments can be split on ';'.
            for start_end in ' '.join(label_and_offsets[1:]).split(';'):
                bounds = start_end.split()
                ranges.append((int(bounds[0]), int(bounds[1])))
            entity_ranges.append((label, ranges))
    # Sort the entity ranges based on the start and end of their first fragment
    entity_ranges = sorted(entity_ranges, key=lambda x: (x[1][0][0], x[1][0][1]))
    return entity_ranges

In [6]:
def convert_ann_to_bio(input_dir: str, output_dir: str, selected_entities: List[str]):
    """Convert every ``.txt``/``.ann`` pair in ``input_dir`` to a ``.bio`` file.

    For each ``<name>.txt`` the annotations are read from ``<name>.ann`` in the
    same directory and the result is written to ``<name>.bio`` in
    ``output_dir`` (created if needed).

    Output format: one ``word<TAB>tag`` line per whitespace-separated word of
    the text, followed by an empty line after each line of the input text.
    Tags are ``B-<label>`` for the first token of an annotated range,
    ``I-<label>`` for the remaining tokens of that range and ``O`` otherwise.
    Ranges are applied in the order returned by ``read_annotation_file``; when
    ranges overlap, the tag written last wins.

    Args:
        input_dir: Directory holding the ``.txt`` and ``.ann`` files.
        output_dir: Directory that receives the ``.bio`` files.
        selected_entities: Annotation labels to tag; all others are ignored.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    txt_suffix = '.txt'
    # Sorted so files are processed in a reproducible order.
    for file_name in sorted(os.listdir(input_dir)):
        if not file_name.endswith(txt_suffix):
            continue
        # Strip only the trailing extension. str.replace would also rewrite a
        # '.txt' occurring in the middle of the name and yield wrong paths.
        stem = file_name[:-len(txt_suffix)]

        # Read the text and locate its annotation file
        text = read_text_file(os.path.join(input_dir, file_name))
        ann_file = os.path.join(input_dir, stem + '.ann')

        # Tokenize the text; every token starts out untagged
        tokens = tokenize_text(text)
        bio_tags = ['O'] * len(tokens)

        # Read the annotations and flatten multi-fragment entities into
        # one (label, start, end) triple per fragment
        entity_ranges = read_annotation_file(ann_file, selected_entities)
        entity_ranges = [(name, *tup) for name, tup_list in entity_ranges for tup in tup_list]

        start_end_2_idx = get_start_end_range_to_token_index(text, tokens, entity_ranges)

        # Update the BIO tags
        for label, start, end in entity_ranges:
            token_indices = start_end_2_idx.get((start, end), [])
            if not token_indices:
                continue
            first_index = token_indices[0]
            for i in token_indices:
                # Compare by value, not by position: the list may repeat an
                # index when the same range is annotated more than once.
                if i == first_index:
                    bio_tags[i] = 'B-' + label
                else:
                    bio_tags[i] = 'I-' + label

        # Write the BIO tags to a new file. The words written are the raw
        # whitespace-split words; bio_tags is indexed in that same order, so
        # tokenize_text must yield exactly one token per such word.
        with open(os.path.join(output_dir, stem + '.bio'), 'w', encoding='utf-8') as f:
            token_index = 0
            for sentence in text.split('\n'):
                for word in sentence.split():
                    f.write(word + '\t' + bio_tags[token_index] + '\n')
                    token_index += 1
                f.write('\n')

    print("Conversion completed successfully.")


In [7]:
# Annotation labels that are converted to BIO tags; every other label is ignored.
selected_entities = [
    'Age',
    'Biological_attribute',
    'Biological_structure',
    'Clinical_event',
    'Diagnostic_procedure',
    'Disease_disorder',
    'Dosage',
    'Family_history',
    'Height',
    'History',
    'Lab_value',
    'Mass',
    'Medication',
    'Sex',
    'Sign_symptom',
    'Therapeutic_procedure',
    'Weight',
]

In [8]:
# Convert the annotated corpus under ./data/MACCROBAT into BIO files.
convert_ann_to_bio(
    input_dir='./data/MACCROBAT',
    output_dir='./data/BIO_FILES',
    selected_entities=selected_entities,
)

Conversion completed successfully.
