In [2]:
import pandas as pd
# Path of the CoNLL-formatted training file that is parsed below.
train = '/content/train_final.conll'

def read_conll_file(file_path):
    """Parse a CoNLL-style file into sentences of ``(word, label)`` pairs.

    Every non-blank line is split on whitespace; its first field is taken as
    the word and its last field as the NER label. A blank line closes the
    sentence being collected, and lines starting with ``-DOCSTART-`` are
    skipped without closing it.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences, each a list of ``(word, ner_label)`` tuples.
        Empty sentences are never emitted.
    """
    sentences = []
    tokens = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped.startswith('-DOCSTART-'):
                # Document markers carry no token.
                continue
            if not stripped:
                # Blank line: close the sentence collected so far, if any.
                if tokens:
                    sentences.append(tokens)
                    tokens = []
                continue
            fields = stripped.split()
            tokens.append((fields[0], fields[-1]))
    # The file may not end with a blank line; flush the trailing sentence.
    if tokens:
        sentences.append(tokens)
    return sentences

# Load the training sentences as lists of (word, label) tuples.
train_data = read_conll_file(train)

In [3]:
import random

def augment_sentence_with_entities_v3(sentence, entity_pool, num_entities=1):
    """Return a copy of ``sentence`` with randomly chosen entities inserted.

    An entity may only be inserted directly after an 'O' token that is itself
    followed by another 'O' token, or after a final 'O' token, so existing
    entity spans are never split. Each insertion slot is used at most once.

    An inserted entity is built from a random string of a random 'B-X' pool
    (first token tagged 'B-X', any further tokens 'I-X'), followed by the
    tokens of a random string from the matching 'I-X' pool when that pool
    exists and is non-empty. The 'I-X' continuation is not added when the
    entity is appended at the very end of the sentence.

    Args:
        sentence: List of ``(word, label)`` tuples. It is not modified.
        entity_pool: Mapping from label (e.g. 'B-PER') to a list of entity
            strings; strings are split on whitespace into tokens.
        num_entities: Maximum number of entities to insert. Fewer are
            inserted when the sentence runs out of insertion slots.

    Returns:
        A new list of ``(word, label)`` tuples. When the sentence is empty or
        the pool has no non-empty 'B-' entry, an unchanged copy is returned.
    """
    augmented_sentence = list(sentence)

    # Only non-empty 'B-' pools can start an entity. Built once, because it
    # does not change between insertions.
    b_tag_entities = {
        label: entities
        for label, entities in entity_pool.items()
        if label.startswith('B-') and entities
    }
    if not augmented_sentence or not b_tag_entities:
        return augmented_sentence
    b_tag_labels = list(b_tag_entities)

    # A slot `i` means "insert right after index i of augmented_sentence".
    last_index = len(augmented_sentence) - 1
    insert_positions = [
        i for i in range(last_index)
        if augmented_sentence[i][1] == 'O' and augmented_sentence[i + 1][1] == 'O'
    ]
    if augmented_sentence[last_index][1] == 'O':
        insert_positions.append(last_index)

    for _ in range(num_entities):
        if not insert_positions:
            break  # No suitable slot is left.

        anchor = random.choice(insert_positions)
        entity_label = random.choice(b_tag_labels)
        entity_tokens = random.choice(b_tag_entities[entity_label]).split()

        i_tag_label = 'I' + entity_label[1:]  # 'B-XXX' -> 'I-XXX'
        continuation_tokens = []
        if entity_pool.get(i_tag_label):
            continuation_tokens = random.choice(entity_pool[i_tag_label]).split()

        if not entity_tokens:
            # A blank pool entry would leave an 'I-' run without its 'B-'.
            continue

        # Only the first token opens the entity; the rest continue it.
        new_tokens = [(entity_tokens[0], entity_label)]
        new_tokens.extend((token, i_tag_label) for token in entity_tokens[1:])
        if anchor < len(augmented_sentence) - 1:
            new_tokens.extend((token, i_tag_label) for token in continuation_tokens)

        augmented_sentence[anchor + 1:anchor + 1] = new_tokens

        # Drop the used slot so entities do not cluster, and shift the slots
        # behind it so they keep pointing at the same original tokens.
        shift = len(new_tokens)
        insert_positions = [
            pos if pos < anchor else pos + shift
            for pos in insert_positions
            if pos != anchor
        ]

    return augmented_sentence

In [4]:
# Collect, for every non-'O' label, the distinct words that carry it.
# A dict is used as an insertion-ordered set: duplicates are dropped with an
# O(1) lookup per token while the first-seen order of the words is kept.
unique_words_by_label = {}
for tagged_sentence in train_data:
    for word, label in tagged_sentence:
        if label != 'O':
            unique_words_by_label.setdefault(label, {})[word] = None

# Expose each pool as a list so that random.choice can draw from it.
entity_pool = {label: list(words) for label, words in unique_words_by_label.items()}

# Report the number of distinct words per label (0 when a label never occurs).
for reported_label in ('B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC'):
    print(len(entity_pool.get(reported_label, [])))

361
415
373
500
338
309


In [5]:
def replace_and_adjust_entities(sentence, words_to_replace, entity_pool):
    """Swap selected words for random 'B-PER' pool entries and merge runs.

    Every token whose word is in ``words_to_replace`` is replaced by the
    whitespace-split tokens of a random ``entity_pool['B-PER']`` entry; the
    first new token is tagged 'B-PER' and the remaining ones 'I-PER'. The
    replaced token's own label is discarded. Afterwards, any 'B-PER' token
    that directly follows a token labelled 'B-PER' (before relabelling) is
    turned into 'I-PER'.

    Args:
        sentence: List of ``(word, label)`` tuples.
        words_to_replace: Container of words to substitute.
        entity_pool: Mapping that provides a list of strings under 'B-PER'.

    Returns:
        A new list of ``(word, label)`` tuples.
    """
    substituted = []
    for token, tag in sentence:
        if token not in words_to_replace:
            substituted.append((token, tag))
            continue
        replacement = random.choice(entity_pool['B-PER']).split()
        substituted.extend(
            (part, 'B-PER' if idx == 0 else 'I-PER')
            for idx, part in enumerate(replacement)
        )

    # Pair every token with the pre-adjustment label of its predecessor so a
    # run of several 'B-PER' tokens collapses into B-PER, I-PER, I-PER, ...
    preceding_tags = [None] + [tag for _, tag in substituted]
    return [
        (token, 'I-PER' if tag == 'B-PER' and before == 'B-PER' else tag)
        for (token, tag), before in zip(substituted, preceding_tags)
    ]

In [6]:
def adjust_consecutive_entities(sentence):
    """Turn a 'B-PER' that directly follows a 'B-PER' into 'I-PER'.

    The predecessor check uses the labels as they appear in ``sentence``, not
    the relabelled ones, so a run of several 'B-PER' tokens becomes one
    'B-PER' followed only by 'I-PER' tokens. All other labels pass through
    unchanged.

    Args:
        sentence: Iterable of ``(word, label)`` tuples.

    Returns:
        A new list of ``(word, label)`` tuples.
    """
    pairs = list(sentence)
    # Label of the token before each position; None for the first token.
    preceding_labels = [None] + [label for _, label in pairs]
    return [
        (word, 'I-PER' if label == 'B-PER' and before == 'B-PER' else label)
        for (word, label), before in zip(pairs, preceding_labels)
    ]

In [8]:
# Seed the RNG so that the augmentation, and the preview printed below, is
# reproducible after a kernel restart.
AUGMENTATION_SEED = 42
random.seed(AUGMENTATION_SEED)

augmented_train_data_v3 = [
    augment_sentence_with_entities_v3(sentence, entity_pool, num_entities=1)
    for sentence in train_data
]

# Show one sentence before and after augmentation as a sanity check; skipped
# when the dataset is too small to contain that index.
PREVIEW_INDEX = 35
if PREVIEW_INDEX < len(train_data):
    print(train_data[PREVIEW_INDEX])
    print(augmented_train_data_v3[PREVIEW_INDEX])

[('Tagann', 'O'), ('an', 'O'), ('chéad', 'O'), ('cheist', 'O'), ('ón', 'O'), ('Teachta', 'O'), ('Ó', 'B-PER'), ('Cuív', 'I-PER'), ('.', 'O')]
[('Tagann', 'O'), ('hArd-Chúirte', 'B-ORG'), ('Chill', 'I-ORG'), ('an', 'O'), ('chéad', 'O'), ('cheist', 'O'), ('ón', 'O'), ('Teachta', 'O'), ('Ó', 'B-PER'), ('Cuív', 'I-PER'), ('.', 'O')]


In [9]:
file_path = "/content/RDA_training.conll"

# Serialise one "token<TAB>tag" line per token and close every sentence with
# a blank line.
# NOTE(review): this exports `train_data`, not `augmented_train_data_v3` --
# confirm that writing the sentences without the inserted entities is intended.
with open(file_path, "w", encoding="utf-8") as out_file:
    for tagged_sentence in train_data:
        out_file.writelines(f"{token}\t{tag}\n" for token, tag in tagged_sentence)
        out_file.write("\n")

print(f"Data exported to {file_path}")

Data exported to /content/RDA_training.conll
