# Imports

In [1]:
import os
from typing import List

In [2]:
# Location of the CoNLL-2003 data files, relative to this notebook
dataset_folder_path = '../data/conll2003'

# One path per split, all living directly inside the dataset folder
train_file, valid_file, test_file = (
    os.path.join(dataset_folder_path, filename)
    for filename in ('train.txt', 'valid.txt', 'test.txt')
)

# Classes

In [5]:
class InputExample:
    """Pair of token lists describing one example: its source and its target.

    Attributes:
        source_words: tokens of the source side.
        target_words: tokens of the target side.
    """

    def __init__(self, source_words: List[str], target_words: List[str]):
        self.source_words, self.target_words = source_words, target_words

    @staticmethod
    def join(tl: List[str], join_with: str = ' '):
        """Return the strings in ``tl`` concatenated with ``join_with`` between them."""
        joined = join_with.join(tl)
        return joined

    @property
    def source(self):
        """The source tokens joined by single spaces."""
        return self.join(tl=self.source_words)

    @property
    def target(self):
        """The target tokens joined by single spaces."""
        return self.join(tl=self.target_words)

    def __str__(self):
        # Two-line human-readable rendering: source first, target second
        source, target = self.source, self.target
        return f'Source: {source}\nTarget: {target}'

# Utils

A collection of helper functions used throughout the notebook.

In [6]:
def read_txt(filepath, encoding='utf-8'):
    """Read a text file and return its entire contents as a single string.

    Args:
        filepath: path of the file to read.
        encoding: text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The full contents of the file as a ``str``.
    """
    with open(filepath, encoding=encoding) as f:
        return f.read()

In [56]:
def convert_text_to_example(text, split_line_by='\n', split_row_by=' ', merge_O = False):
    """Convert one block of column-formatted rows into an ``InputExample``.

    Each row of ``text`` is split into columns; the first column is taken as
    the word and the last column as its label. Consecutive rows that belong
    to the same span are grouped, and in the target sequence every span is
    followed by a ``<TAG>`` marker.

    Args:
        text: the rows of a single example, separated by ``split_line_by``.
        split_line_by: separator between rows.
        split_row_by: separator between the columns of a row.
        merge_O: when ``True``, a run of consecutive ``'O'`` rows is emitted
            as one span followed by a single ``<O>``; when ``False`` every
            ``'O'`` word gets its own ``<O>``.

    Returns:
        An ``InputExample`` whose source words are the words of ``text`` in
        order, and whose target words are the same words with a ``<TAG>``
        marker appended after each span.

    Notes:
        * Rows that are empty or whitespace-only (e.g. produced by a trailing
          line separator) are skipped instead of yielding an empty word with
          a ``<>`` tag.
        * For a non-``'O'`` label the entity type is everything after the
          first hyphen, so a type that itself contains a hyphen stays intact
          and matches its ``I-`` continuation rows.
    """
    words, labels = [], []
    for row in text.split(split_line_by):
        if not row.strip():
            # Blank row: carries neither a word nor a label
            continue
        columns = row.split(split_row_by)
        words.append(columns[0])
        labels.append(columns[-1])

    source_words = []
    target_words = []

    total = len(words)
    i = 0
    while i < total:
        label = labels[i]
        j = i + 1  # exclusive end of the current span

        if label == 'O':
            tag = label
            if merge_O:
                # Swallow the whole run of consecutive 'O' rows
                while j < total and labels[j] == 'O':
                    j += 1
        else:
            # Start of an entity: extend over the following I-<type> rows
            tag = label.split('-', 1)[-1]
            continuation = f'I-{tag}'
            while j < total and labels[j] == continuation:
                j += 1

        span = words[i:j]
        source_words.extend(span)
        target_words.extend(span + [f'<{tag}>'])
        i = j

    return InputExample(source_words, target_words)

# Opening and converting one example

In [57]:
# Load the full contents of the training file as a single string
file_text = read_txt(train_file)

In [58]:
# Split the raw text on blank lines: each chunk is one group of consecutive rows
text_examples = file_text.split('\n\n')
# Peek at the first and last chunks to see what the split produced at the edges
text_examples[0], text_examples[-1]

('-DOCSTART- -X- -X- O', '')

As we can see above, the first and last entries of the list can be ignored.
Next, we convert one example to the required input format.

In [59]:
# Drop the first and last chunks. Recomputed from file_text (instead of slicing
# text_examples in place) so that re-running this cell is idempotent and does
# not strip two more real examples each time.
# NOTE(review): only the leading '-DOCSTART-' chunk is removed here; confirm
# whether the file contains further '-DOCSTART-' chunks that should be filtered too.
text_examples = file_text.split('\n\n')[1:-1] # remove first and last

In [60]:
# Only the first and last columns of each row are of interest to us
for i in range(3):
    te = text_examples[i].split('\n')
    # One row per line, then an empty line between examples
    print('\n'.join(te), end='\n\n')

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O



In [61]:
# Keep only the first column (the word) and the last column (the label) of every row
te = text_examples[0].split('\n')
columns = [row.split(' ') for row in te]
words = [cols[0] for cols in columns]
labels = [cols[-1] for cols in columns]

# Show the words and labels side by side
for w, l in zip(words, labels):
    print(f'{w:20} {l}')

EU                   B-ORG
rejects              O
German               B-MISC
call                 O
to                   O
boycott              O
British              B-MISC
lamb                 O
.                    O


In [62]:
# Manual walk-through of the conversion, using the words/labels parsed above
source_words = []
target_words = []
current_word = []  # NOTE(review): never read in this cell; kept in case another cell relies on it

i = 0
while i < len(words):
    w = words[i]
    l = labels[i]

    if l == 'O':
        # Non-entity word: emit it on its own, followed by its <O> tag
        source_words.append(w)
        target_words.extend([w, f'<{l}>'])
        i += 1
    else: # found a B-ENT
        j = i + 1
        ent_label = l.split('-')[-1]
        # The bounds check must come first: `and` short-circuits, so labels[j]
        # is never evaluated once j has run past the end of the list
        # (otherwise an entity that ends the sentence raises IndexError).
        while j < len(labels) and labels[j] == f'I-{ent_label}':
            j += 1
        # adds the span
        source_words.extend(words[i:j])
        target_words.extend(words[i:j] + [f'<{ent_label}>'])
        i = j

In [63]:
# Show the space-joined source and target sequences built by the loop above
print(f"Source text: {' '.join(source_words)}")
print(f"Target text: {' '.join(target_words)}")

Source text: EU rejects German call to boycott British lamb .
Target text: EU <ORG> rejects <O> German <MISC> call <O> to <O> boycott <O> British <MISC> lamb <O> . <O>


In [64]:
# Now with the function created
for i in range(5):
    example = convert_text_to_example(text_examples[i], merge_O=True)
    print(example)
    print()

Source: EU rejects German call to boycott British lamb .
Target: EU <ORG> rejects <O> German <MISC> call to boycott <O> British <MISC> lamb . <O>

Source: Peter Blackburn
Target: Peter Blackburn <PER>

Source: BRUSSELS 1996-08-22
Target: BRUSSELS <LOC> 1996-08-22 <O>

Source: The European Commission said on Thursday it disagreed with German advice to consumers to shun British lamb until scientists determine whether mad cow disease can be transmitted to sheep .
Target: The <O> European Commission <ORG> said on Thursday it disagreed with <O> German <MISC> advice to consumers to shun <O> British <MISC> lamb until scientists determine whether mad cow disease can be transmitted to sheep . <O>

Source: Germany 's representative to the European Union 's veterinary committee Werner Zwingmann said on Wednesday consumers should buy sheepmeat from countries other than Britain until the scientific advice was clearer .
Target: Germany <LOC> 's representative to the <O> European Union <ORG> 's veter