In [4]:
import re

def sentence_segment(match_regex, tokens):
    """
    Groups a flat token sequence into sentences. A sentence ends with (and
    includes) every token at whose start the given regular expression matches.

    Parameters
    ----------
    match_regex a compiled regular expression; it is applied with ``.match``,
                so it is anchored at the beginning of each token.
    tokens      the input sequence as an iterable of strings (one word or
                punctuation token per item).

    Returns
    -------
    a list of lists of strings, where each inner list is one sentence.
    Tokens after the last boundary form a final sentence; no empty sentence
    is ever returned.
    """
    finished = []
    current = []
    for word in tokens:
        current.append(word)
        if match_regex.match(word):
            # Boundary token closes the sentence it belongs to.
            finished.append(current)
            current = []

    # Keep trailing tokens that were not followed by a boundary token.
    if current:
        finished.append(current)
    return finished

def load_entity_dict(file_path):
    """
    Reads a tab-separated annotation file into a name -> type mapping.

    Each line is stripped of surrounding whitespace and split on tabs. Lines
    with fewer than three fields are ignored. For the remaining lines the
    first field is used as the entity name and the third field as its type;
    if a name occurs on several lines, the last one wins.

    Parameters
    ----------
    file_path   path of the UTF-8 encoded annotation file.

    Returns
    -------
    a dict mapping entity name (str) to entity type (str).
    """
    name_to_type = {}
    with open(file_path, encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split("\t")
            if len(fields) < 3:
                continue
            name_to_type[fields[0]] = fields[2]
    return name_to_type


In [5]:
def tokenize(text):
    """
    Reads a text file, tokenizes its body and groups the tokens into sentences.

    Parameters
    ----------
    text    path of the UTF-8 encoded file to read (a path, not the text itself).

    Returns
    -------
    a list of lists of strings, where each inner list is one sentence as
    produced by ``sentence_segment`` (split after every token that starts
    with ``.``, ``?`` or ``!``).
    """
    # Alternatives are tried left to right, so the more specific patterns must
    # come before the generic word pattern:
    #   1. dotted ASCII abbreviations such as "U.S."
    #   2. URLs of the form http://www.<name>.<tld>
    #   3. decimal numbers -- placed BEFORE the word pattern; \w also matches
    #      digits, so with the word pattern first "3.14" was split into
    #      "3", ".", "14" and the "." produced a spurious sentence break
    #   4. words (letters, digits, underscore, apostrophe); plain integers are
    #      covered here, so no separate integer alternative is needed
    #   5. runs of punctuation. Note that `%-\(` inside the class is a
    #      character RANGE (% & ' (), so "&" is matched as punctuation too.
    token = re.compile(r'(?:[a-zA-Z]\.){2,}|http://www\.\w+\.\w+|[0-9]+\.[0-9]+|[\w\']+|[.?!",%-\(\)-]+')

    # `with` guarantees the handle is closed even if reading or decoding fails.
    with open(text, 'r', encoding='utf-8') as open_text:
        # The first four lines are skipped -- presumably a metadata header of
        # the raw corpus files; confirm against the data format.
        raw_text = ' '.join(open_text.readlines()[4:])

    tokens = token.findall(raw_text)
    return sentence_segment(re.compile(r'[\.\?\!]'), tokens)

In [6]:
import string
def tag_my_data(text, ent_dict):
    """
    Assigns BIO tags to tokenized sentences by exact lookup of known entities.

    Every entity name is split on whitespace and searched for as a contiguous
    token sequence in each sentence. The first token of a match is tagged
    ``B-<type>``, the remaining ones ``I-<type>``; all other tokens get ``O``.

    Longer entities are matched first and a token that already carries a tag
    is never retagged, so a short entity (e.g. "May") cannot overwrite part of
    a longer one that contains it (e.g. "Theresa May"), regardless of the
    order of ``ent_dict``. Entity names that are empty or whitespace-only are
    ignored.

    Parameters
    ----------
    text        a list of sentences, each a list of token strings.
    ent_dict    a dict mapping entity name (str) to entity type (str).

    Returns
    -------
    a list with one entry per sentence; each entry is a list of
    ``(token, tag)`` tuples.
    """
    # Split every entity once instead of once per sentence, and drop names
    # that yield no tokens: a zero-length pattern "matches" at every position
    # and would index one past the end of the tag list.
    entity_specs = []
    for entity, tag in ent_dict.items():
        entity_tokens = entity.split()
        if entity_tokens:
            entity_specs.append((entity_tokens, tag))

    # Longest first; the sort is stable, so entities of equal length keep
    # their dictionary order.
    entity_specs.sort(key=lambda spec: len(spec[0]), reverse=True)

    data = []
    for sentence in text:
        tags = ["O"] * len(sentence)  # Default to "O" (Outside)

        for entity_tokens, tag in entity_specs:
            entity_length = len(entity_tokens)

            for i in range(len(sentence) - entity_length + 1):
                if sentence[i:i + entity_length] != entity_tokens:
                    continue
                # Never overwrite a span claimed by an earlier (longer) match.
                if any(existing != "O" for existing in tags[i:i + entity_length]):
                    continue
                tags[i] = f"B-{tag}"  # Begin entity
                for j in range(1, entity_length):
                    tags[i + j] = f"I-{tag}"  # Inside entity
        data.append(list(zip(sentence, tags)))
    return data

def to_conll(doc_id, data):
    """
    Serializes tagged sentences into a CoNLL-style text block.

    For every sentence the output contains a ``# sent_id = <doc_id>-<n>``
    comment (n counts from 1), a ``# text = ...`` comment holding the tokens
    joined by single spaces, one ``token<TAB>tag`` line per token, and two
    empty entries that end the sentence.

    Parameters
    ----------
    doc_id  identifier used as the prefix of every sentence id.
    data    a list of sentences, each a list of ``(token, tag)`` pairs.

    Returns
    -------
    a single string with all lines joined by newlines (empty string for
    empty ``data``).
    """
    lines = []
    for sent_number, tagged_sentence in enumerate(data, start=1):
        surface = " ".join(word for word, _ in tagged_sentence)
        lines.append(f"# sent_id = {doc_id}-{sent_number}")
        lines.append(f"# text = {surface}")
        lines.extend(f"{word}\t{tag}" for word, tag in tagged_sentence)
        # Two empty entries: consecutive sentences end up separated by two
        # blank lines once everything is joined with "\n".
        lines.extend(["", ""])

    return "\n".join(lines)


In [7]:
# Path of a single raw document.
# NOTE(review): within the visible cells this value is never read --
# aggregate() takes its own `text` parameter and the per-language loops
# reassign `text` -- so it looks like a leftover from a single-file test run;
# confirm before removing.
text="brexit_bg.txt_file_1061.txt"

def aggregate(doc_id, text, entity_dict):
    """
    Runs the full pipeline for one document: tokenize the file, tag the
    sentences with the given entities, and serialize the result.

    Parameters
    ----------
    doc_id      identifier passed on to ``to_conll`` for the sentence ids.
    text        path of the raw text file, passed to ``tokenize``.
    entity_dict mapping of entity name to entity type, passed to
                ``tag_my_data``.

    Returns
    -------
    the string produced by ``to_conll`` for this document.
    """
    sentences = tokenize(text)
    tagged_sentences = tag_my_data(sentences, entity_dict)
    return to_conll(doc_id, tagged_sentences)

#print("BIO-tagged data saved to 'manual.txt'.")

In [8]:
import os
# Build the Bulgarian training file: one "# newdoc id" header per raw .txt
# document, followed by that document's sentences from aggregate().
doc_chunks = []

# os.listdir returns entries in arbitrary order; sorting keeps the generated
# file reproducible from run to run.
for file in sorted(os.listdir("data/bg/raw")):
    # Filter BEFORE emitting anything: a non-.txt entry used to leave a header
    # without a trailing newline that fused with the next document's header.
    if not file.endswith(".txt"):
        continue

    # The document id is the first run of digits in the file name. Entries
    # without digits used to abort the whole cell with an IndexError.
    id_match = re.search(r'\d+', file)
    if id_match is None:
        print(f"Skipping {file!r}: no numeric document id in the file name.")
        continue
    doc_id = "bg_" + id_match.group()

    text = "data/bg/raw/" + file
    labels = file[:-4] + ".out"  # annotation file shares the raw file's stem
    entity_dict = load_entity_dict("data/bg/annotated/" + labels)
    constructed_file = aggregate(doc_id, text, entity_dict)
    doc_chunks.append(f"# newdoc id = {doc_id}\n{constructed_file}")

dataset = "".join(doc_chunks)

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("data/train", exist_ok=True)
with open("data/train/bg.txt", "w", encoding="utf-8") as f:
    f.write(dataset)

print("BIO-tagged data saved to 'data/train/bg.txt'.")

BIO-tagged data saved to 'data/train/bg.txt'.


In [9]:
# Build the Ukrainian training file: one "# newdoc id" header per raw .txt
# document, followed by that document's sentences from aggregate().
doc_chunks = []

# os.listdir returns entries in arbitrary order; sorting keeps the generated
# file reproducible from run to run.
for file in sorted(os.listdir("data/uk/raw")):
    # Filter BEFORE emitting anything: a non-.txt entry used to leave a header
    # without a trailing newline that fused with the next document's header.
    if not file.endswith(".txt"):
        continue

    # The document id is the first run of digits in the file name. Entries
    # without digits used to abort the whole cell with an IndexError.
    id_match = re.search(r'\d+', file)
    if id_match is None:
        print(f"Skipping {file!r}: no numeric document id in the file name.")
        continue
    doc_id = "uk_" + id_match.group()

    text = "data/uk/raw/" + file
    labels = file[:-4] + ".out"  # annotation file shares the raw file's stem
    entity_dict = load_entity_dict("data/uk/annotated/" + labels)
    constructed_file = aggregate(doc_id, text, entity_dict)
    doc_chunks.append(f"# newdoc id = {doc_id}\n{constructed_file}")

dataset = "".join(doc_chunks)

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("data/train", exist_ok=True)
with open("data/train/uk.txt", "w", encoding="utf-8") as f:
    f.write(dataset)

print("BIO-tagged data saved to 'data/train/uk.txt'.")

BIO-tagged data saved to 'data/train/uk.txt'.


In [12]:

# Build the Slovenian training file: one "# newdoc id" header per raw .txt
# document, followed by that document's sentences from aggregate().
doc_chunks = []

# os.listdir returns entries in arbitrary order; sorting keeps the generated
# file reproducible from run to run.
for file in sorted(os.listdir("data/sl/raw")):
    # Filter BEFORE emitting anything: a non-.txt entry used to leave a header
    # without a trailing newline that fused with the next document's header.
    if not file.endswith(".txt"):
        continue

    # The document id is the first run of digits in the file name. Entries
    # without digits used to abort the whole cell with an IndexError.
    id_match = re.search(r'\d+', file)
    if id_match is None:
        print(f"Skipping {file!r}: no numeric document id in the file name.")
        continue
    doc_id = "sl_" + id_match.group()

    text = "data/sl/raw/" + file
    labels = file[:-4] + ".out"  # annotation file shares the raw file's stem
    entity_dict = load_entity_dict("data/sl/annotated/" + labels)
    constructed_file = aggregate(doc_id, text, entity_dict)
    doc_chunks.append(f"# newdoc id = {doc_id}\n{constructed_file}")

dataset = "".join(doc_chunks)

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("data/train", exist_ok=True)
with open("data/train/sl.txt", "w", encoding="utf-8") as f:
    f.write(dataset)

print("BIO-tagged data saved to 'data/train/sl.txt'.")

BIO-tagged data saved to 'data/train/sl.txt'.


In [21]:
# Build the Polish training file: one "# newdoc id" header per raw .txt
# document, followed by that document's sentences from aggregate().
doc_chunks = []

# os.listdir returns entries in arbitrary order; sorting keeps the generated
# file reproducible from run to run.
for file in sorted(os.listdir("data/pl/raw")):
    # Filter BEFORE emitting anything: a non-.txt entry used to leave a header
    # without a trailing newline that fused with the next document's header.
    if not file.endswith(".txt"):
        continue

    # The document id is the first run of digits in the file name. Entries
    # without digits used to abort the whole cell with an IndexError.
    id_match = re.search(r'\d+', file)
    if id_match is None:
        print(f"Skipping {file!r}: no numeric document id in the file name.")
        continue
    doc_id = "pl_" + id_match.group()

    text = "data/pl/raw/" + file
    labels = file[:-4] + ".out"  # annotation file shares the raw file's stem
    entity_dict = load_entity_dict("data/pl/annotated/" + labels)
    constructed_file = aggregate(doc_id, text, entity_dict)
    doc_chunks.append(f"# newdoc id = {doc_id}\n{constructed_file}")

dataset = "".join(doc_chunks)

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("data/train", exist_ok=True)
with open("data/train/pl.txt", "w", encoding="utf-8") as f:
    f.write(dataset)

print("BIO-tagged data saved to 'data/train/pl.txt'.")

BIO-tagged data saved to 'data/train/pl.txt'.


In [14]:
# Build the Czech training file: one "# newdoc id" header per raw .txt
# document, followed by that document's sentences from aggregate().
doc_chunks = []

# os.listdir returns entries in arbitrary order; sorting keeps the generated
# file reproducible from run to run.
for file in sorted(os.listdir("data/cs/raw")):
    # Filter BEFORE emitting anything: a non-.txt entry used to leave a header
    # without a trailing newline that fused with the next document's header.
    if not file.endswith(".txt"):
        continue

    # The document id is the first run of digits in the file name. Entries
    # without digits used to abort the whole cell with an IndexError.
    id_match = re.search(r'\d+', file)
    if id_match is None:
        print(f"Skipping {file!r}: no numeric document id in the file name.")
        continue
    doc_id = "cs_" + id_match.group()

    text = "data/cs/raw/" + file
    labels = file[:-4] + ".out"  # annotation file shares the raw file's stem
    entity_dict = load_entity_dict("data/cs/annotated/" + labels)
    constructed_file = aggregate(doc_id, text, entity_dict)
    doc_chunks.append(f"# newdoc id = {doc_id}\n{constructed_file}")

dataset = "".join(doc_chunks)

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("data/train", exist_ok=True)
with open("data/train/cs.txt", "w", encoding="utf-8") as f:
    f.write(dataset)

print("BIO-tagged data saved to 'data/train/cs.txt'.")

BIO-tagged data saved to 'data/train/cs.txt'.


In [15]:
# Build the Russian training file: one "# newdoc id" header per raw .txt
# document, followed by that document's sentences from aggregate().
doc_chunks = []

# os.listdir returns entries in arbitrary order; sorting keeps the generated
# file reproducible from run to run.
for file in sorted(os.listdir("data/ru/raw")):
    # Filter BEFORE emitting anything: a non-.txt entry used to leave a header
    # without a trailing newline that fused with the next document's header.
    if not file.endswith(".txt"):
        continue

    # The document id is the first run of digits in the file name. Entries
    # without digits used to abort the whole cell with an IndexError.
    id_match = re.search(r'\d+', file)
    if id_match is None:
        print(f"Skipping {file!r}: no numeric document id in the file name.")
        continue
    doc_id = "ru_" + id_match.group()

    text = "data/ru/raw/" + file
    labels = file[:-4] + ".out"  # annotation file shares the raw file's stem
    entity_dict = load_entity_dict("data/ru/annotated/" + labels)
    constructed_file = aggregate(doc_id, text, entity_dict)
    doc_chunks.append(f"# newdoc id = {doc_id}\n{constructed_file}")

dataset = "".join(doc_chunks)

# Create the output directory on a fresh checkout instead of failing in open().
os.makedirs("data/train", exist_ok=True)
with open("data/train/ru.txt", "w", encoding="utf-8") as f:
    f.write(dataset)

print("BIO-tagged data saved to 'data/train/ru.txt'.")

BIO-tagged data saved to 'data/train/ru.txt'.
