In [None]:
import pandas as pd
train = '/content/NER_Irish_train.conll'
val = '/content/NER_Irish_validation.conll'
test = '/content/NER_Irish_test.conll'

In [None]:
!pip install transformers
!pip install tokenizers



In [None]:
def read_conll_file(file_path):
    data = []
    current_sentence = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line.startswith('-DOCSTART-'):
                continue
            if line:
                parts = line.split()
                word = parts[0]
                ner_label = parts[-1]
                current_sentence.append((word, ner_label))
            else:
                if current_sentence:
                    data.append(current_sentence)
                    current_sentence = []
    if current_sentence:
        data.append(current_sentence)
    return data

train_data = read_conll_file(train)
val_data = read_conll_file(val)
test_data = read_conll_file(test)

In [None]:
print(train_data)

[[('\ufeffDáil', 'B-ORG'), ('Éireann', 'I-ORG'), ('06', 'O'), ('/', 'O'), ('07', 'O'), ('/', 'O'), ('2023', 'O')], [('Baineann', 'O'), ('mo', 'O'), ('cheist', 'O'), ('le', 'O'), ('cúrsaí', 'O'), ('tithíochta', 'O'), ('i', 'O'), ('nGaillimh', 'B-LOC'), ('-', 'O'), ('tá', 'O'), ('sé', 'O'), ('níos', 'O'), ('cirte', 'O'), ('easpa', 'O'), ('tithíochta', 'O'), ('i', 'O'), ('nGaillimh', 'B-LOC'), ('a', 'O'), ('rá', 'O'), ('-', 'O'), ('agus', 'O'), ('an', 'O'), ('tascfhórsa', 'O'), ('a', 'O'), ('bunaíodh', 'O'), ('breis', 'O'), ('agus', 'O'), ('ceithre', 'O'), ('bliana', 'O'), ('ó', 'O'), ('shin', 'O')], [('Ar', 'O'), ('dheis', 'O'), ('Dé', 'B-PER'), ('go', 'O'), ('raibh', 'O'), ('a', 'O'), ('anam', 'O')], [('Táim', 'O'), ('os', 'O'), ('comhair', 'O'), ('Dail', 'B-ORG'), ('Éireann', 'I-ORG')], [('Ba', 'O'), ('mhaith', 'O'), ('liom', 'O'), ('tréaslú', 'O'), ('leis', 'O'), ('an', 'O'), ('Teachta', 'O'), ('Andrews', 'B-PER'), ('as', 'O'), ('an', 'O'), ('obair', 'O'), ('atá', 'O'), ('déanta', 'O'

In [None]:
print(val_data)

In [None]:
print(test_data)

[[('Crothnófar', 'O'), ('Pól', 'B-PER'), ('nó', 'O'), ('ba', 'O'), ('úrlabhraí', 'O'), ('maith', 'O'), ('é', 'O'), ('ar', 'O'), ('Raidió', 'B-ORG'), ('na', 'I-ORG'), ('Gaeltachta', 'I-ORG'), ('agus', 'O'), ('na', 'O'), ('meáin', 'O'), ('eile', 'O'), ('ag', 'O'), ('cosaint', 'O'), ('na', 'O'), ('n-oifigí', 'O'), ('poist', 'O'), ('tuaithe', 'O'), (',', 'O'), ('gné', 'O'), ('am-tábhachtach', 'O'), ('de', 'O'), ('shaol', 'O'), ('sóisialta', 'O'), ('na', 'O'), ('ndaoine', 'O'), ('.', 'O')], [('Dá', 'O'), ('mba', 'O'), (',', 'O'), ('mar', 'O'), ('sin', 'O'), (',', 'O'), ('nach', 'O'), ('bhfaca', 'O'), ('an', 'O'), ('tSiúr', 'O'), ('Concepta', 'B-PER'), ('an', 'O'), ('tréadaí', 'O'), ('Eoin', 'B-PER'), ('Mac', 'I-PER'), ('Diarmada', 'I-PER'), ('ag', 'O'), ('bailiú', 'O'), ('leis', 'O'), ('ag', 'O'), ('an', 'O'), ('mbreacsholas', 'O'), (',', 'O'), ('níorbh', 'O'), ('aistí', 'O'), ('rud', 'O'), ('riamh', 'O'), ('ná', 'O'), ('an', 'O'), ('radharc', 'O'), ('a', 'O'), ('bhí', 'O'), ('le', 'O'), ('

In [None]:
import torch
from transformers import AutoTokenizer

# Enocde the labels
def encode_labels(sentence, tokenizer, label_map):
    tokens = []
    encoded_labels = []
    word_indices = []

    for word, label in sentence:
        word_tokens = tokenizer.tokenize(word)
        word_indices.append(len(tokens))  # Track the index of the original word
        tokens.extend(word_tokens)
        # Assign label to each sub-token
        for _ in word_tokens:
            encoded_labels.append(label_map[label])

    return tokens, encoded_labels, word_indices

def prepare_bert_inputs(data):
    tokenizer = AutoTokenizer.from_pretrained('DCU-NLP/bert-base-irish-cased-v1')
    bert_inputs = []
    max_seq_length = 256  # Defining the maximum sequence length

    # Dictionary of labels mapped to integer representations
    label_map = {
        'O': 0,
        'B-PER': 1,
        'I-PER': 2,
        'B-LOC': 3,
        'I-LOC': 4,
        'B-ORG': 5,
        'I-ORG': 6
    }

    for sentence in data:
        tokens, encoded_labels, word_indices = encode_labels(sentence, tokenizer, label_map)

        # Padding the sequences
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        padding_length = max_seq_length - len(tokens)
        tokens += [tokenizer.pad_token] * padding_length

        # 0 corresponds to 'O' tag
        encoded_labels = [0] + encoded_labels + [0]

        # Adjust encoded_labels based on original word indices
        adjusted_labels = [encoded_labels[idx] for idx in word_indices]
        adjusted_labels += [0] * (max_seq_length - len(adjusted_labels))

        # Converting the tokens to input IDs and creating attention mask
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)

        # Padding the attention_mask
        padding_length = max_seq_length - len(attention_mask)
        attention_mask += [0] * padding_length

        # Converting adjusted_labels to tensor
        adjusted_labels = torch.tensor(adjusted_labels, dtype=torch.long)

        # Appending the data to bert_inputs in a dict format
        bert_inputs.append({
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': adjusted_labels
        })

    return bert_inputs

In [None]:
train_inputs = prepare_bert_inputs(train_data)

In [None]:
print(train_data[12])
print(train_inputs[12])

[('Ní', 'O'), ('féidir', 'O'), ('linn', 'O'), ('muinín', 'O'), ('a', 'O'), ('bheith', 'O'), ('againn', 'O'), ('as', 'O'), ('an', 'O'), ('eolas', 'O'), ('a', 'O'), ('thagann', 'O'), ('ó', 'O'), ('mbord', 'O'), ('feidhmiúcháin', 'O'), ('Nike', 'B-ORG'), ('agus', 'O'), ('tá', 'O'), ('feidhmeannaigh', 'O'), ('an', 'O'), ('bhoird', 'O'), ('ag', 'O'), ('caitheamh', 'O'), ('go', 'O'), ('holc', 'O'), ('leis', 'O'), ('an', 'O'), ('bpobal', 'O'), ('.', 'O')]
{'input_ids': [102, 421, 450, 739, 5735, 105, 378, 1152, 343, 115, 1134, 105, 2492, 228, 8314, 8829, 22096, 138, 308, 26003, 115, 10440, 124, 4107, 173, 17568, 239, 115, 4447, 118, 103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# Testing Augmentation

In [None]:
train_data[0:10]

[[('\ufeffDáil', 'B-ORG'),
  ('Éireann', 'I-ORG'),
  ('06', 'O'),
  ('/', 'O'),
  ('07', 'O'),
  ('/', 'O'),
  ('2023', 'O')],
 [('Baineann', 'O'),
  ('mo', 'O'),
  ('cheist', 'O'),
  ('le', 'O'),
  ('cúrsaí', 'O'),
  ('tithíochta', 'O'),
  ('i', 'O'),
  ('nGaillimh', 'B-LOC'),
  ('-', 'O'),
  ('tá', 'O'),
  ('sé', 'O'),
  ('níos', 'O'),
  ('cirte', 'O'),
  ('easpa', 'O'),
  ('tithíochta', 'O'),
  ('i', 'O'),
  ('nGaillimh', 'B-LOC'),
  ('a', 'O'),
  ('rá', 'O'),
  ('-', 'O'),
  ('agus', 'O'),
  ('an', 'O'),
  ('tascfhórsa', 'O'),
  ('a', 'O'),
  ('bunaíodh', 'O'),
  ('breis', 'O'),
  ('agus', 'O'),
  ('ceithre', 'O'),
  ('bliana', 'O'),
  ('ó', 'O'),
  ('shin', 'O')],
 [('Ar', 'O'),
  ('dheis', 'O'),
  ('Dé', 'B-PER'),
  ('go', 'O'),
  ('raibh', 'O'),
  ('a', 'O'),
  ('anam', 'O')],
 [('Táim', 'O'),
  ('os', 'O'),
  ('comhair', 'O'),
  ('Dail', 'B-ORG'),
  ('Éireann', 'I-ORG')],
 [('Ba', 'O'),
  ('mhaith', 'O'),
  ('liom', 'O'),
  ('tréaslú', 'O'),
  ('leis', 'O'),
  ('an', 'O'),
  ('

In [None]:
import random

def augment_sentence_with_entities_v3(sentence, entity_pool, num_entities=1):
    augmented_sentence = sentence[:]  # Make a copy of the sentence to preserve the original data.

    # Determine insertion positions excluding the last position unless it's 'O'.
    insert_positions = [i for i, (_, label) in enumerate(sentence[:-1]) if label == 'O' and sentence[i+1][1] == 'O']
    if sentence[-1][1] == 'O':
        insert_positions.append(len(sentence)-1)  # Include the last position if it's 'O'.

    for _ in range(num_entities):
        if not insert_positions:
            break  # Exit if no suitable position is found.

        insert_position = random.choice(insert_positions)
        # Select a random 'B-' tag entity to insert.
        b_tag_entities = {k: v for k, v in entity_pool.items() if k.startswith('B-')}
        entity_label = random.choice(list(b_tag_entities.keys()))
        entity_to_insert = random.choice(b_tag_entities[entity_label])
        entity_tokens = entity_to_insert.split()

        # Insert the 'B-' entity at the chosen position.
        for token in entity_tokens:
            augmented_sentence.insert(insert_position + 1, (token, entity_label))
            insert_position += 1  # Update the position for the next token insertion.

        i_tag_tokens = []  # Initialise the list to avoid the NameError.
        # Optionally insert 'I-' tags following the 'B-' tag, ensuring logical consistency.
        i_tag_label = 'I' + entity_label[1:]  # Convert 'B-' tag to 'I-' tag.
        if i_tag_label in entity_pool and entity_pool[i_tag_label]:
            i_tag_entity = random.choice(entity_pool[i_tag_label])
            i_tag_tokens = i_tag_entity.split()
            for token in i_tag_tokens:
                # Ensure the insertion follows logical consistency rules.
                if insert_position < len(augmented_sentence) - 1 and (augmented_sentence[insert_position + 1][1] == 'O' or augmented_sentence[insert_position + 1][1].startswith('I-')):
                    augmented_sentence.insert(insert_position + 1, (token, i_tag_label))
                    insert_position += 1

        # Update insert positions to avoid clustering inserted entities.
        insert_positions = [pos for pos in insert_positions if pos < insert_position or pos > insert_position + len(entity_tokens) + len(i_tag_tokens)]

    return augmented_sentence



In [None]:
entity_pool = {}
i = 0
while i < len(train_data):
  for word, label in train_data[i]:
    if label != 'O':
        if label not in entity_pool:
            entity_pool[label] = []
        if word not in entity_pool[label]:
            entity_pool[label].append(word)
  i += 1  # Increment i inside the while loop

print(entity_pool)
print(len(entity_pool['B-PER']))
print(len(entity_pool['I-PER']))
print(len(entity_pool['B-ORG']))
print(len(entity_pool['I-ORG']))
print(len(entity_pool['B-LOC']))
print(len(entity_pool['I-LOC']))



{'B-ORG': ['\ufeffDáil', 'Dail', 'Dáil', 'Fís', 'RTÉ', 'TG4', 'Údarás', 'Nike', 'an', 'Sky', 'Dála', 'Sinn', 'An', 'HSE', 'Raidió', 'Sláintecare', 'Chumann', 'gComhaontas', 'Fianna', 'Fine', 'd’Fhianna', 'Teach', 'Roinne', 'Cúla4', 'd’Údarás', 'DEIS', 'Chonradh', 'Massie', 'New', 'John', 'bhForas', '“Irish', 'VirginMedia', 'S4C', 'CLG', 'gCumann', 'Fórsa', 'SIPTU', 'INMO', 'Enable', 'Tusla', 'FSS', 'Thithe', 'Foras', 'gComhairle', 'CATT', 'Conradh', 'Glór', 'Gael', 'Údaras', 'gComhchoiste', 'Comhchoiste', 'Comhairle', 'Chomhairle', 'Rannóg', 'Cluichí', 'Cumann', 'Citylink', 'Coláiste', 'Seanad', 'Choláiste', 'Rialtas', 'Aontas', 'Gharda', 'Banc', 'Fhine', 'Fhianna', 'hArd-Chúirte', 'Oireachtas', 'Oireachtais', 'Grant', 'Choiste', 'Chomhchoiste', 'Roinn', "'Seirbhísí", 'Comhdháil', 'Cúldoras', 'Acadamh', 'FBI', 'Chlub', 'Tia', 'Oifig', 'INTRA', 'gColáiste', 'College', 'hÉireannaigh', 'hArm', 'BBC', 'Globalise', 'gCuntas', 'Thoghroinn', 'Náisiún', 'Faisean', 'Éire', 'Scoil', 'ECU', 'ANOI

In [None]:
import random

def replace_and_adjust_entities(sentence, words_to_replace, entity_pool):
    updated_sentence = []
    for i, (word, label) in enumerate(sentence):
        if word in words_to_replace:
            # Choose a random 'B-PER' entity to replace
            b_per_entity = random.choice(entity_pool['B-PER']).split()
            for j, entity_word in enumerate(b_per_entity):
                # Use 'B-PER' for the first token, 'I-PER' for subsequent tokens
                new_label = 'I-PER' if j > 0 else 'B-PER'
                updated_sentence.append((entity_word, new_label))
        else:
            updated_sentence.append((word, label))

    # Adjust consecutive entities
    adjusted_sentence = []
    previous_label = None
    for word, label in updated_sentence:
        if label == 'B-PER' and previous_label == 'B-PER':
            adjusted_label = 'I-PER'
        else:
            adjusted_label = label
        adjusted_sentence.append((word, adjusted_label))
        previous_label = label

    return adjusted_sentence

# Simple example
sentence = [('Sí', 'O'), ('Sé', 'O'), ('to', 'O'), ('the', 'O'), ('store', 'O'), ('and', 'O'), ('Sé', 'O'), ('followed', 'O')]
words_to_replace = ['Sí', 'Sé']
updated_and_adjusted_sentence = replace_and_adjust_entities(sentence, words_to_replace, entity_pool)
print(updated_and_adjusted_sentence)


[('Sinéad', 'B-PER'), ("O'Neill", 'I-PER'), ('to', 'O'), ('the', 'O'), ('store', 'O'), ('and', 'O'), ('M.', 'B-PER'), ('followed', 'O')]


In [None]:
def adjust_consecutive_entities(sentence):
    adjusted_sentence = []
    previous_label = None  # Keep track of the previous label

    for word, label in sentence:
        if label == 'B-PER' and previous_label == 'B-PER':
            # Change to 'I-PER' if the previous label was also 'B-PER'
            adjusted_label = 'I-PER'
        else:
            adjusted_label = label

        adjusted_sentence.append((word, adjusted_label))
        previous_label = label  # Update the previous label for the next iteration

    return adjusted_sentence

# Example
sentence = [
    ('Micheal', 'B-PER'), ('Briain', 'B-PER'), ('to', 'O'), ('the', 'O'),
    ('store', 'O'), ('and', 'O'), ('Ard-Reachtaire', 'B-PER'), ('followed', 'O')
]
adjusted_sentence = adjust_consecutive_entities(sentence)
print(adjusted_sentence)


[('Micheal', 'B-PER'), ('Briain', 'I-PER'), ('to', 'O'), ('the', 'O'), ('store', 'O'), ('and', 'O'), ('Ard-Reachtaire', 'B-PER'), ('followed', 'O')]


In [None]:
augmented_train_data_v3 = [augment_sentence_with_entities_v3(sentence, entity_pool, num_entities=1) for sentence in train_data]
#train_inputs_v2 = prepare_bert_inputs(augmented_train_data_v3, tokenizer, label_map)

print(train_data[35])
print(augmented_train_data_v3[35])

[('Tagann', 'O'), ('an', 'O'), ('chéad', 'O'), ('cheist', 'O'), ('ón', 'O'), ('Teachta', 'O'), ('Ó', 'B-PER'), ('Cuív', 'I-PER'), ('.', 'O')]
[('Tagann', 'O'), ('an', 'O'), ('chéad', 'O'), ('Thoraí', 'B-ORG'), ('le', 'I-ORG'), ('cheist', 'O'), ('ón', 'O'), ('Teachta', 'O'), ('Ó', 'B-PER'), ('Cuív', 'I-PER'), ('.', 'O')]


# Testing on Data

In [None]:
train_data[0]

[('\ufeffDáil', 'B-ORG'),
 ('Éireann', 'I-ORG'),
 ('06', 'O'),
 ('/', 'O'),
 ('07', 'O'),
 ('/', 'O'),
 ('2023', 'O')]

In [None]:
augmented_train_data_v3 = [augment_sentence_with_entities_v3(sentence, entity_pool, num_entities=1) for sentence in train_data]
train_inputs_v3 = prepare_bert_inputs(augmented_train_data_v3)


print(train_data[0])
print(augmented_train_data_v3[0])
print(train_inputs_v3[2])

[('\ufeffDáil', 'B-ORG'), ('Éireann', 'I-ORG'), ('06', 'O'), ('/', 'O'), ('07', 'O'), ('/', 'O'), ('2023', 'O')]
[('\ufeffDáil', 'B-ORG'), ('Éireann', 'I-ORG'), ('06', 'O'), ('/', 'O'), ('hArm', 'B-ORG'), ('Forbairt', 'I-ORG'), ('07', 'O'), ('/', 'O'), ('2023', 'O')]
{'input_ids': [102, 475, 4854, 794, 173, 408, 5932, 20438, 105, 5863, 103, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
eval_inputs = prepare_bert_inputs(val_data)
test_inputs = prepare_bert_inputs(test_data)

In [None]:
!pip install accelerate -U



In [None]:
!pip install transformers[torch]



In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments
import torch

model_name = 'DCU-NLP/bert-base-irish-cased-v1'
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=7)

# Load the tokeniser
tokenizer = AutoTokenizer.from_pretrained(model_name)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
# Checking for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Moving the model to device to use GPU
model.to(device)


from sklearn.metrics import accuracy_score, classification_report

# Defining compute_metrics function for the evaluation metrics I want to see
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    acc = accuracy_score(labels.flatten(), preds.flatten())
    report = classification_report(labels.flatten(), preds.flatten(), zero_division=0)
    return {
        'accuracy': acc,
        'classification_report': report
    }

 # Defining the training arguments to be used for the Trainer
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer",evaluation_strategy="epoch")

from transformers import Trainer

# Passing the model to be trained and the data for training and evaluation along with the metrics defined before

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_inputs_v3,
    eval_dataset=eval_inputs,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DCU-NLP/bert-base-irish-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [None]:
print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_

In [None]:
 # Training the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Classification Report
1,No log,0.046402,0.985117,precision recall f1-score support  0 0.99 1.00 0.99 25182  1 0.47 0.14 0.22 64  2 0.49 0.25 0.33 83  3 0.54 0.12 0.20 58  4 0.62 0.22 0.32 37  5 0.80 0.05 0.10 75  6 0.55 0.26 0.35 101  accuracy 0.99 25600  macro avg 0.64 0.29 0.36 25600 weighted avg 0.98 0.99 0.98 25600
2,No log,0.041525,0.984961,precision recall f1-score support  0 0.99 1.00 0.99 25182  1 0.44 0.27 0.33 64  2 0.50 0.45 0.47 83  3 0.59 0.22 0.32 58  4 0.37 0.49 0.42 37  5 0.52 0.20 0.29 75  6 0.44 0.31 0.36 101  accuracy 0.98 25600  macro avg 0.55 0.42 0.46 25600 weighted avg 0.98 0.98 0.98 25600
3,No log,0.040114,0.985,precision recall f1-score support  0 0.99 1.00 0.99 25182  1 0.46 0.25 0.32 64  2 0.53 0.37 0.44 83  3 0.64 0.24 0.35 58  4 0.40 0.49 0.44 37  5 0.41 0.21 0.28 75  6 0.39 0.43 0.41 101  accuracy 0.98 25600  macro avg 0.55 0.43 0.46 25600 weighted avg 0.98 0.98 0.98 25600


Trainer is attempting to log a value of "              precision    recall  f1-score   support

           0       0.99      1.00      0.99     25182
           1       0.47      0.14      0.22        64
           2       0.49      0.25      0.33        83
           3       0.54      0.12      0.20        58
           4       0.62      0.22      0.32        37
           5       0.80      0.05      0.10        75
           6       0.55      0.26      0.35       101

    accuracy                           0.99     25600
   macro avg       0.64      0.29      0.36     25600
weighted avg       0.98      0.99      0.98     25600
" of type <class 'str'> for key "eval/classification_report" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "              precision    recall  f1-score   support

           0       0.99      1.00      0.99     25182
           1       0.44      0.27      0.33

TrainOutput(global_step=378, training_loss=0.08052805744150959, metrics={'train_runtime': 81.9885, 'train_samples_per_second': 36.81, 'train_steps_per_second': 4.61, 'total_flos': 394314629981184.0, 'train_loss': 0.08052805744150959, 'epoch': 3.0})

In [None]:
from torch.utils.data import DataLoader, TensorDataset

# Transforming the test_inputs to be evaluated by the model

test_input_ids = [example['input_ids'] for example in test_inputs]
test_attention_masks = [example['attention_mask'] for example in test_inputs]
test_labels = [example['labels'] for example in test_inputs]

# Converting the data into PyTorch tensors
test_input_ids = torch.tensor(test_input_ids)
test_attention_masks = torch.tensor(test_attention_masks)
test_labels = torch.stack(test_labels)

# Creating a DataLoader for the test data
batch_size = 32
test_dataset = TensorDataset(test_input_ids, test_attention_masks, test_labels)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

# Setting the model to evaluation mode
model.eval()

predicted_labels = []
true_labels = []

# Iterate through the test_dataloader to get the predictions
# Using the GPU
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

    # Disabling gradient calculation for evaluation
    with torch.no_grad():
        outputs = model(**inputs)

    # Extracting the predictions and also the true labels for the test data
    predicted_labels.extend(outputs.logits.argmax(dim=-1).tolist())
    true_labels.extend(inputs['labels'].tolist())

# Flatten the labels
flat_predicted_labels = [label for sublist in predicted_labels for label in sublist]
flat_true_labels = [label for sublist in true_labels for label in sublist]

# Calculating the accuracy
accuracy = accuracy_score(flat_true_labels, flat_predicted_labels)
print(f"Accuracy: {accuracy}")

# Generating the classification report
class_report = classification_report(flat_true_labels, flat_predicted_labels)
print("Classification Report:")
print(class_report)

Accuracy: 0.9815569196428572
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     35133
           1       0.39      0.23      0.29       108
           2       0.40      0.41      0.40       111
           3       0.38      0.15      0.21       124
           4       0.50      0.46      0.48       137
           5       0.56      0.20      0.29        95
           6       0.45      0.52      0.48       132

    accuracy                           0.98     35840
   macro avg       0.53      0.42      0.45     35840
weighted avg       0.98      0.98      0.98     35840



# Evaluation

In [None]:
tokenizer = AutoTokenizer.from_pretrained('DCU-NLP/bert-base-irish-cased-v1', do_lower_case=False)

In [None]:
tag_values = [
        'O',
        'B-PER',
        'I-PER',
        'B-LOC',
        'I-LOC',
        'B-ORG',
        'I-ORG'
]

tag_values.append("PAD")
tag2idx = {t: i for i, t in enumerate(tag_values)}

In [None]:
tag2idx

{'O': 0,
 'B-PER': 1,
 'I-PER': 2,
 'B-LOC': 3,
 'I-LOC': 4,
 'B-ORG': 5,
 'I-ORG': 6,
 'PAD': 7}

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import BertConfig

In [None]:
MAX_LEN = 256
bs = 32

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [None]:
torch.cuda.get_device_name(0)

'NVIDIA L4'

In [None]:
train_sent = []

for lst in augmented_train_data_v3:
  sent = []
  for words in lst:
    sent.append(words[0])
  train_sent.append(sent)

In [None]:
val_sent = []

for lst in val_data:
  sent = []
  for words in lst:
    sent.append(words[0])
  val_sent.append(sent)

In [None]:
test_sent = []

for lst in test_data:
  sent = []
  for words in lst:
    sent.append(words[0])
  test_sent.append(sent)

In [None]:
train_labels = []

for lst in augmented_train_data_v3:
  lab = []
  for words in lst:
    lab.append(words[1])
  train_labels.append(lab)

In [None]:
val_labels = []

for lst in val_data:
  lab = []
  for words in lst:
    lab.append(words[1])
  val_labels.append(lab)

In [None]:
test_labels = []

for lst in test_data:
  lab = []
  for words in lst:
    lab.append(words[1])
  test_labels.append(lab)

In [None]:
print(train_sent[1])

['Baineann', 'mo', 'cheist', 'le', 'cúrsaí', 'tithíochta', 'i', 'nGaillimh', '-', 'tá', 'sé', 'níos', 'cirte', 'easpa', 'tithíochta', 'i', 'nGaillimh', 'a', 'rá', '-', 'agus', 'an', 'tascfhórsa', 'a', 'bunaíodh', 'breis', 'agus', 'ceithre', 'Chúirt', "'", 'bliana', 'ó', 'shin']


In [None]:
print(train_labels[1])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'I-ORG', 'O', 'O', 'O']


In [None]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [None]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(train_sent, train_labels)
]

val_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(val_sent, val_labels)
]

test_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(test_sent, test_labels)
]

In [None]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

val_tokenized_texts = [token_label_pair[0] for token_label_pair in val_tokenized_texts_and_labels]
val_labels = [token_label_pair[1] for token_label_pair in val_tokenized_texts_and_labels]

test_tokenized_texts = [token_label_pair[0] for token_label_pair in test_tokenized_texts_and_labels]
test_labels = [token_label_pair[1] for token_label_pair in test_tokenized_texts_and_labels]

In [None]:
print(tokenized_texts)
print(labels)

[['Dáil', 'Éireann', '06', '/', 'hAr', '##m', 'Forbairt', '07', '/', '2023'], ['Baineann', 'mo', 'cheist', 'le', 'cúrsaí', 'tithíochta', 'i', 'nGaillimh', '-', 'tá', 'sé', 'níos', 'cirt', '##e', 'easpa', 'tithíochta', 'i', 'nGaillimh', 'a', 'rá', '-', 'agus', 'an', 'tasc', '##fhórsa', 'a', 'bunaíodh', 'breis', 'agus', 'ceithre', 'Chúirt', "'", 'bliana', 'ó', 'shin'], ['Ar', 'dheis', 'Dé', 'go', 'raibh', 'Mary', 'Donnell', 'a', 'anam'], ['Táim', 'os', 'Parlaiminte', 'Francis', 'comhair', 'Dail', 'Éireann'], ['Ba', 'mhaith', 'liom', 'tré', '##as', '##lú', 'leis', 'City', '##lin', '##k', 'Éireann', 'an', 'Teachta', 'Andrew', '##s', 'as', 'an', 'obair', 'atá', 'déanta', 'aige', 'sa', 'phíosa', 'reachtaíochta', 'seo', 'mar', 'is', 'é', 'a', 'rinne', 'formhór', 'na', 'hoibre', 'uirthi'], ['Gabh', '##aim', 'comhghairdeas', 'Choinbhinsiún', 'Journal', '##ists', 'leis', 'an', 'Aire', 'Stáit'], ['Cuireadh', 'an', 'Dáil', 'ar', 'Ken', '##nac', '##raig', 'Fhionn', '##uisce', 'fionraí', 'ar', '4', 

In [None]:
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

val_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in val_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

test_input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in test_tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", value=0.0,
                          truncating="post", padding="post")

In [None]:
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

val_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in val_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

test_tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in test_labels],
                     maxlen=MAX_LEN, value=tag2idx["PAD"], padding="post",
                     dtype="long", truncating="post")

In [None]:
attention_masks = [[float(i != 0.0) for i in ii] for ii in input_ids]

val_attention_masks = [[float(i != 0.0) for i in ii] for ii in val_input_ids]

test_attention_masks = [[float(i != 0.0) for i in ii] for ii in test_input_ids]

In [None]:
tr_inputs = torch.tensor(input_ids)
tr_tags = torch.tensor(tags)
tr_masks = torch.tensor(attention_masks)

vl_inputs = torch.tensor(val_input_ids)
vl_tags = torch.tensor(val_tags)
vl_masks = torch.tensor(val_attention_masks)

tst_inputs = torch.tensor(test_input_ids)
tst_tags = torch.tensor(test_tags)
tst_masks = torch.tensor(test_attention_masks)

In [None]:
tr_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(tr_data)
train_dataloader = DataLoader(tr_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(vl_inputs, vl_masks, vl_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

tst_data = TensorDataset(tst_inputs, tst_masks, tst_tags)
tst_sampler = SequentialSampler(tst_data)
tst_dataloader = DataLoader(tst_data, sampler=tst_sampler, batch_size=bs)

In [None]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, Trainer, TrainingArguments, AdamW

In [None]:
model_name = 'DCU-NLP/bert-base-irish-cased-v1'
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at DCU-NLP/bert-base-irish-cased-v1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
model.cuda();

In [None]:
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters())
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=3e-5,
    eps=1e-8
)



In [None]:
!pip install seqeval

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16161 sha256=0ef890b9067b5267fee34cc5196bfc2d3afc94f4f6ed96fb0c6d520f68947e72
  Stored in directory: /root/.cache/pip/wheels/1a/67/4a/ad4082dd7dfc30f2abfe4d80a2ed5926a506eb8a972b4767fa
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [None]:
from seqeval.metrics import f1_score, accuracy_score

In [None]:
from tqdm import tqdm, trange

import numpy as np

In [None]:
from transformers import get_linear_schedule_with_warmup

epochs = 10
max_grad_norm = 1.0

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

In [None]:
## Store the average loss after each epoch so we can plot them.
loss_values, validation_loss_values = [], []

for _ in trange(epochs, desc="Epoch"):
    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.

    # Put the model into training mode.
    model.train()
    # Reset the total loss for this epoch.
    total_loss = 0

    # Training loop
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # Always clear any previously calculated gradients before performing a backward pass.
        model.zero_grad()
        # forward pass
        # This will return the loss (rather than the model output)
        # because we have provided the `labels`.
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)


    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.

    # Put the model into evaluation mode
    model.eval()
    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # Telling the model not to compute or store gradients,
        # saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have not provided labels.
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)
        # Move logits and labels to CPU
        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Calculate the accuracy for this batch of test sentences.
        eval_loss += outputs[0].mean().item()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)
    validation_loss_values.append(eval_loss)
    print("Validation loss: {}".format(eval_loss))
    pred_tags = [tag_values[p_i] for p, l in zip(predictions, true_labels)
                                 for p_i, l_i in zip(p, l) if tag_values[l_i] != "PAD"]
    valid_tags = [tag_values[l_i] for l in true_labels
                                  for l_i in l if tag_values[l_i] != "PAD"]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))
    print()


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Average train loss: 0.3662132571917027


Epoch:  10%|█         | 1/10 [00:22<03:23, 22.66s/it]

Validation loss: 0.06192061398178339
Validation Accuracy: 0.8524490416793429

Average train loss: 0.0683353201020509


Epoch:  20%|██        | 2/10 [00:45<03:03, 22.89s/it]

Validation loss: 0.032780269626528025
Validation Accuracy: 0.9248554913294798

Average train loss: 0.040204624296166


Epoch:  30%|███       | 3/10 [01:08<02:39, 22.84s/it]

Validation loss: 0.026730894576758146
Validation Accuracy: 0.9391542439914816

Average train loss: 0.026867525069974363


Epoch:  40%|████      | 4/10 [01:31<02:16, 22.73s/it]

Validation loss: 0.02383534237742424
Validation Accuracy: 0.9476726498326742

Average train loss: 0.01964086652151309


Epoch:  50%|█████     | 5/10 [01:53<01:53, 22.67s/it]

Validation loss: 0.02326664631254971
Validation Accuracy: 0.9485853361728019

Average train loss: 0.015244170528603718


Epoch:  60%|██████    | 6/10 [02:16<01:30, 22.70s/it]

Validation loss: 0.024646739941090345
Validation Accuracy: 0.9476726498326742

Average train loss: 0.012705726563581266


Epoch:  70%|███████   | 7/10 [02:39<01:08, 22.73s/it]

Validation loss: 0.026339899748563766
Validation Accuracy: 0.9488895649528445

Average train loss: 0.010916911283857189


Epoch:  80%|████████  | 8/10 [03:01<00:45, 22.73s/it]

Validation loss: 0.026104789692908525
Validation Accuracy: 0.9482811073927594

Average train loss: 0.009463667825912125


Epoch:  90%|█████████ | 9/10 [03:24<00:22, 22.72s/it]

Validation loss: 0.025600660126656294
Validation Accuracy: 0.9501064800730149

Average train loss: 0.008805053366813809


Epoch: 100%|██████████| 10/10 [03:47<00:00, 22.73s/it]

Validation loss: 0.02645765244960785
Validation Accuracy: 0.9494980225129297






In [None]:
model.eval()

predicted_labels = []
true_labels = []

# Iterate through the test_dataloader to get the predictions
# Using the GPU
for batch in tst_dataloader:
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

    # Disabling gradient calculation for evaluation
    with torch.no_grad():
        outputs = model(**inputs)

    # Extracting the predictions and also the true labels for the test data
    predicted_labels.extend(outputs.logits.argmax(dim=-1).tolist())
    true_labels.extend(inputs['labels'].tolist())

In [None]:
print(predicted_labels)
print(true_labels)

[[0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7], [0, 3, 3, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 

In [None]:
i = 0
while i < len(true_labels):
  if len(true_labels[i]) != len(predicted_labels[i]):
    print(f'issue: {i}, {len(true_labels[i])}, {len(predicted_labels[i])}')
    print(f'{true_labels[i]}\n{predicted_labels[i]}')
    i += 1
  i += 1

In [None]:
preds = []
trues = []

for sentences in true_labels:
  t = []
  for vals in sentences:
    if vals != 7:
      t.append(vals)
  trues.append(t)

for sentences in predicted_labels:
  p = []
  for vals in sentences:
    if vals != 7:
      p.append(vals)
  preds.append(p)

print(trues)
print(preds)

[[0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 2, 2, 2, 2, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 0, 0, 0, 0, 0, 0, 0, 5, 6, 6, 6, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 0], [0, 0, 0, 0, 0, 0, 1, 2, 2, 0, 0, 3, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 6, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
print(test_tokenized_texts)
print(test_labels)
print(preds)

[['Mar', 'a', 't', '##ch', '##ítear', 'do', 'Sheos', '##amh', 'Mac', 'Grianna', 'é', 'caithfidh', 'an', 't', '-', 'ealaíontóir', 'an', 'solas', 'a', 'thabhairt', 'don', 'saol', 'agus', 'diúltú', 'do', 'chath', '##ú', 'sin', 'na', 'truaill', '##íochta', 'a', 'chuireann', 'an', 'saol', 'ina', 'chos', '##án', '.'], ['Dhiúltaigh', 'John', '##atha', '##n', 'Cape', 'agus', 'cúpla', 'foilsitheoir', 'eile', 'an', 'leabhar', 'a', 'fhoilsiú', 'agus', 'bhí', 'Nao', '##mi', 'le', 'ceangal', 'go', 'raibh', 'srian', 'a', 'chur', 'ar', 'a', 'cuid', 'tuairimíocht', '##a', '.'], ['Grianghraif', 'le', 'Maidhc', 'Ó', 'Seach', '##nas', '##aí', '.'], ['Tagann', 'a', 'ráiteas', 'tar', 'éis', 'don', 'chomhlacht', 'a', 'rá', 'le', 'hoibrithe', 'an', 'iarnróid', 'mí', 'ó', 'shin', 'go', 'raibh', 'sí', 'le', 'dúnadh', ':', "'", 'Chu', '##amar', 'ag', 'cruinniú', 'i', 'mBaile', 'Átha', 'an', 'Rí', 'agus', 'dúirt', 'an', 'bhainistíocht', 'linn', 'ansin', 'go', 'raibh', 'deireadh', 'leis', 'an', 'tseirbhís', 'last

In [None]:
i = 0
while i < len(preds):
  if len(test_labels[i]) != len(preds[i]):
    print(f'issue: {i}, {len(test_tokenized_texts[i])}, {len(test_labels[i])}, {len(preds[i])}')
    print(f'{test_tokenized_texts[i]}\n{test_labels[i]}\n{preds[i]}')
    print()
    i += 1
  i += 1

In [None]:
import pandas as pd

label_map = {
    0: 'O',
    1: 'B-PER',
    2: 'I-PER',
    3: 'B-LOC',
    4: 'I-LOC',
    5: 'B-ORG',
    6: 'I-ORG'
}

# Initialise an empty list to store DataFrames
dfs = []

# Iterate through the tokens, labels, and predictions
for tokens, labels, predictions in zip(test_tokenized_texts, test_labels, preds):
    # Create a DataFrame from the current sublist
    temp_df = pd.DataFrame({'Word': tokens, 'POS': 'X', 'True': labels, 'Predicted': [label_map[pred] for pred in predictions]})
    # Append the DataFrame to the list
    dfs.append(temp_df)
    # Add an empty row as a DataFrame to the list
    dfs.append(pd.DataFrame({'Word': [''], 'POS': [''], 'True': [''], 'Predicted': ['']}))

# Concatenate the DataFrames along the rows axis
df = pd.concat(dfs, ignore_index=True)

print(df)

           Word POS   True Predicted
0           Mar   X      O         O
1             a   X      O         O
2             t   X      O         O
3          ##ch   X      O         O
4       ##ítear   X      O         O
...         ...  ..    ...       ...
4839          a   X      O         O
4840     bhfuil   X      O         O
4841  Gaeltacht   X  B-LOC     B-LOC
4842        ann   X      O         O
4843                                

[4844 rows x 4 columns]


In [None]:
print(df.head(78))

       Word POS   True Predicted
0       Mar   X      O         O
1         a   X      O         O
2         t   X      O         O
3      ##ch   X      O         O
4   ##ítear   X      O         O
..      ...  ..    ...       ...
73   Maidhc   X  B-PER     B-PER
74        Ó   X  I-PER     I-PER
75    Seach   X  I-PER     I-PER
76    ##nas   X  I-PER     I-PER
77     ##aí   X  I-PER     I-PER

[78 rows x 4 columns]


In [None]:
conll_format = ""

for index, row in df.iterrows():
    text = row['Word']
    pos = row['POS']
    tag = row['True']
    mapped_tag = row['Predicted']

    # Append the token in CoNLL format (word, gold_label, predicted_label)
    conll_format += f"{text}\t{pos}\t{tag}\t{mapped_tag}\n"

# Write the CoNLL format string to a text file
with open('/content/gabert_V2_eval.conll', 'w') as f:
    f.write(conll_format)