In [12]:
MAX_LEN = 256
bs = 32

In [13]:
!pip install transformers
!pip install tokenizers
import pandas as pd



In [14]:
train = '/content/train_final.conll'
val = '/content/NER_Irish_validation.conll'
test = '/content/NER_Irish_test.conll'

In [15]:
def read_conll_file(file_path):
    data = []
    current_sentence = []
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line.startswith('-DOCSTART-'):
                continue
            if line:
                parts = line.split()
                word = parts[0]
                ner_label = parts[-1]
                current_sentence.append((word, ner_label))
            else:
                if current_sentence:
                    data.append(current_sentence)
                    current_sentence = []
    if current_sentence:
        data.append(current_sentence)
    return data

train_data = read_conll_file(train)
val_data = read_conll_file(val)
test_data = read_conll_file(test)

# Backtranslation

Import the MT Model and Tokeniser

In [21]:
!pip install transformers
!pip install sentencepiece
from transformers import MarianMTModel, MarianTokenizer



In [22]:
first_model_name = 'Helsinki-NLP/opus-mt-ga-en' #Irish to English model

first_model_tkn = MarianTokenizer.from_pretrained(first_model_name)

first_model = MarianMTModel.from_pretrained(first_model_name)

In [23]:
second_model_name = 'Helsinki-NLP/opus-mt-en-ga'   #English to Irish model

second_model_tkn = MarianTokenizer.from_pretrained(second_model_name)

second_model = MarianMTModel.from_pretrained(second_model_name)

In [24]:
import pandas as pd
# Raw txt files of Dáil Scripts unannotated
dail1 = '/content/Dáil_1_no_dates.txt'
dail2 = '/content/Dáil_2_no_dates.txt'

dail_lines = []

def read_file(file_path, lines_list):
    with open(file_path, 'r') as file:
        lines = [line.strip() for line in file if line.strip()]
        lines_list.extend(lines)

read_file(dail1, dail_lines)
read_file(dail2, dail_lines)

print(len(dail_lines))
print(dail_lines[:-1])


268
['Ar dtús báire, leanfaidh mé leis an téama a bhí á lua ag an Teachta Burke, is é sin ag gabháil buíochas leo siúd atá ag obair go dian dícheallach sa chóras sláinte, ag tabhairt cúram d’othair agus dóibh siúd a bhíonn go minic in ísle brí nuair a shroicheann siad an córas slainte, cibé in ospidéal, ag an dochtúir, nó ag freastal ar dhuine de ba daoine eile atá ag déileáil leo go gairmiúil lá i ndiaidh lae.', 'Ó thaobh pleanála de, tá an tospidéal nua leanaí sa cheantar agus níl aon phlean ann nuair a chríochnaíonn siadsan atá an tógáil, chun an tógáil a dhéanamh ar an ngá maidir le hOspidéal San Séamas nó chun Ospidéal Ollscoile an Choim do Mhná agus do Naíonáin a bhogadh go dtí an suíomh sin go fóill.', 'Táimid ag ullmhú do na Cluichí Oilimpeacha an bhliain seo chugainn agus ba chóir i bhfad Éireann níos mó airgid a bheith curtha i leataobh ag an stad seo chun cuidiú leo.', 'Ní gá dúinn ach smaoineamh ar daoine atá luaite cheana féin cosúil le Keith Earls, Johnny Sexton agus Kati

In [25]:
def format_batch_texts(language_code, batch_texts):

  formated_bach = [">>{}<< {}".format(language_code, text) for text in batch_texts]

  return formated_bach

In [26]:
def perform_translation(batch_texts, model, tokenizer, language="en", batch_size=10):
    translated_texts = []
    for i in range(0, len(batch_texts), batch_size):
        batch = batch_texts[i:i+batch_size]

        formated_batch_texts = format_batch_texts(language, batch)

        translated = model.generate(**tokenizer(formated_batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512))

        batch_translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

        translated_texts.extend(batch_translated_texts)

    return translated_texts

translated_texts = perform_translation(dail_lines, first_model, first_model_tkn)
print(translated_texts)


['First of all, I will follow the theme mentioned by the Burke Deputy, that is accompanied by thanks to those who are hard working diligently in the health system, taking care of patients and those who often have a low meaning when they reach the health system, whether in a hospital, by a doctor, or serving one of them was other people who are dealing with them professionally on a day after day.', "The new children's hospital is in the area and there is no plan for the completion of the construction, for the construction of the need to move St. James's Hospital or the Crime for Women's and Infants University Hospital to that location yet.", 'We are preparing for the Olympic Games next year and much Ireland should have more money aside at this stop to help them.', 'We only need to think of people who have already been mentioned like Keith Earls, Johnny Sexton and Katie Taylor and others who are out there carrying the Irish flag and standing with pride on behalf of Ireland.', 'I refer to

In [27]:
print(len(translated_texts))

268


In [28]:
back_translated_texts = perform_translation(dail_lines, second_model, second_model_tkn)
print(back_translated_texts[5])
print(len(back_translated_texts))

Arsa an Dáil ar fhoinní ag 8.46 p.m. agus cuireadh chun báis ag 8.49 p.m.
268


In [29]:
back_translated_texts_file_path = "/content/backtranslated_text.txt"

with open(back_translated_texts_file_path, "w", encoding="utf-8") as file:
    for sentence in back_translated_texts:
        file.write(sentence + "\n")

print(f"Back-translated texts exported to {back_translated_texts_file_path}")

Back-translated texts exported to /content/backtranslated_text.txt


In [30]:
token_label_mapping = {}
# Get gold standard tags for entities from training data
for sentence in train_data:
    for token, label in sentence:
        normalized_token = token.lower()
        if normalized_token not in token_label_mapping:
            token_label_mapping[normalized_token] = label

formatted_back_translation = []
for sentence in back_translated_texts:

    tokens = sentence.split()
    formatted_sentence = []
    for token in tokens:
        normalized_token = token.lower()

        label = token_label_mapping.get(normalized_token, 'O')
        formatted_sentence.append((token, label))
    formatted_back_translation.append(formatted_sentence)

print(formatted_back_translation[5])
print(len(formatted_back_translation))


[('Arsa', 'O'), ('an', 'O'), ('Dáil', 'B-ORG'), ('ar', 'O'), ('fhoinní', 'O'), ('ag', 'O'), ('8.46', 'O'), ('p.m.', 'O'), ('agus', 'O'), ('cuireadh', 'O'), ('chun', 'O'), ('báis', 'O'), ('ag', 'O'), ('8.49', 'O'), ('p.m.', 'O')]
268


In [31]:
token_label_mapping = {}

for sentence in train_data:
    for token, label in sentence:
        if token not in token_label_mapping:
            token_label_mapping[token] = label


In [32]:
formatted_back_translation = []
for sentence in back_translated_texts:
    tokens = sentence.split()
    formatted_sentence = []
    for token in tokens:
        normalized_token = token
        label = token_label_mapping.get(normalized_token, 'O')
        formatted_sentence.append((normalized_token, label))
    formatted_back_translation.append(formatted_sentence)

print(formatted_back_translation[5])

[('Arsa', 'O'), ('an', 'O'), ('Dáil', 'B-ORG'), ('ar', 'O'), ('fhoinní', 'O'), ('ag', 'O'), ('8.46', 'O'), ('p.m.', 'O'), ('agus', 'O'), ('cuireadh', 'O'), ('chun', 'O'), ('báis', 'O'), ('ag', 'O'), ('8.49', 'O'), ('p.m.', 'O')]


In [33]:
print(formatted_back_translation[0])

[('Is', 'O'), ('é', 'O'), ('atá', 'O'), ('i', 'O'), ('gceist', 'O'), ('le', 'O'), ('‘Ar', 'O'), ('dtiús’,', 'O'), ('a', 'O'), ('d’iarr', 'O'), ('an', 'O'), ('tAire', 'O'), ('Ealaíon,', 'O'), ('Oidhreachta', 'I-PER'), ('agus', 'O'), ('Gaeltachta’', 'O'), ('ná', 'O'), ('an', 'O'), ('t-amhrán', 'O'), ('is', 'O'), ('fearr', 'O'), ('le', 'O'), ('rá', 'O'), ('go', 'O'), ('minic', 'O'), ('i', 'O'), ('gceann', 'O'), ('gach', 'O'), ('bliain', 'O'), ('go', 'O'), ('dtí', 'O'), ('an', 'O'), ('chéad', 'O'), ('lá', 'O'), ('den', 'O'), ('saol,', 'O'), ('agus', 'O'), ('an', 'O'), ('t-aon', 'O'), ('lá', 'O'), ('den', 'O'), ('saol,', 'O'), ('agus', 'O'), ('an', 'O'), ('t-aon', 'O'), ('lá,', 'O'), ('an', 'O'), ('t-aon', 'O'), ('lá', 'O'), ('den', 'O'), ('saol,', 'O'), ('agus', 'O'), ('an', 'O'), ('t-aon', 'O'), ('lá', 'O'), ('amháin.', 'O')]


In [34]:
back_translated_texts_file_path = "/content/back_translated_texts_labelled.txt"

with open(back_translated_texts_file_path, 'w', encoding='utf-8') as file:
    for sentence in formatted_back_translation:
        for word, tag in sentence:
            file.write(f"{word} {tag}\n")
        file.write("\n")


print(f"Back-translated texts exported to {back_translated_texts_file_path}")

Back-translated texts exported to /content/back_translated_texts_labelled.txt


In [35]:
file_path = "/content/backtranslated.conll"
with open(file_path, 'w', encoding='utf-8') as file:
    for sentence in formatted_back_translation:
        for word, tag in sentence:
            file.write(f"{word} {tag}\n")
        file.write("\n")

file_path


'/content/backtranslated.conll'

In [36]:
len(train_data)

1006

In [38]:
backtranslated = '/content/backtranslated.conll'

In [39]:
backtranslated_data = read_conll_file(backtranslated)

In [40]:
print(len(backtranslated_data))

268


In [41]:
train_data = train_data + backtranslated_data

In [42]:
len(formatted_back_translation)

268

In [43]:
len(train_data)

1274

In [44]:
print(train_data[1228])

[('An', 'B-ORG'), ('t-ainm', 'O'), ('a', 'O'), ('bhí', 'O'), ('le', 'O'), ('rá', 'O'), ('ag', 'O'), ('an', 'O'), ('Taoiseach', 'B-PER'), ('ar', 'O'), ('an', 'O'), ('lá', 'O'), ('roimh', 'O'), ('an', 'O'), ('gcruinniú', 'O'), ('agus,', 'O'), ('mar', 'O'), ('a', 'O'), ('bhí', 'O'), ('sa', 'O'), ('phost,', 'O'), ('téigh', 'O'), ('go', 'O'), ('dtí', 'O'), ('an', 'O'), ('chéad', 'O'), ('chruinniú', 'O'), ('den', 'O'), ('chéad', 'O'), ('uair', 'O'), ('riamh', 'O'), ('do', 'O'), ('TG4', 'B-ORG'), ('an', 'O'), ('t-ainm', 'O'), ('atá', 'O'), ('ar', 'O'), ('TG4', 'B-ORG'), ('an', 'O'), ('t-ainm', 'O'), ('atá', 'O'), ('ar', 'O'), ('siúl', 'O'), ('ar', 'O'), ('an', 'O'), ('bhfód', 'O'), ('seo', 'O'), ('do', 'O'), ('S4C', 'B-ORG'), ('do', 'O'), ('chraoladh', 'O'), ('i', 'O'), ('mBreatnais?', 'O')]


In [45]:
file_path = "/content/backtranslated_training.conll"

with open(file_path, "w", encoding="utf-8") as file:
    for training in train_data:
        for token, tag in training:
            file.write(f"{token}\t{tag}\n")
        file.write("\n")

print(f"Data exported to {file_path}")

Data exported to /content/backtranslated_training.conll
