In [1]:
# Relative prefix prepended to the "doc/" and "data/" paths opened in the cells below.
PATH = "../../"

def read_annotations(filename):
    """Read annotations and positions of annotations from a *.tok.ann file.

    Every non-blank line is split on whitespace. The second field is taken as
    the label and the third and fourth fields as the integer begin and end
    positions; the first field and anything after the fourth are ignored.

    Args:
        filename: Path of the annotation file to read.

    Returns:
        A list of ``(label, begin, end)`` tuples in file order.

    Raises:
        IndexError: If a non-blank line has fewer than four fields.
        ValueError: If the third or fourth field is not an integer.
    """
    anno = []
    with open(filename, "r") as f:
        # Iterate the file object directly instead of materialising every
        # line up front with readlines().
        for line in f:
            fields = line.split()
            if not fields:
                # Blank / whitespace-only lines carry no annotation; skip them
                # instead of failing with IndexError on fields[1].
                continue
            anno.append((fields[1], int(fields[2]), int(fields[3])))
    return anno

def read_sentences(filename):
    """Load a *.tok.txt file as a list of sentences of positioned tokens.

    The file content is split on "\\n"; each non-empty line becomes one
    sentence, obtained by splitting that line on single spaces. A running
    position counter is kept across the whole file: it moves past each token
    plus one separator character, and by one for every empty line.

    Args:
        filename: Path of the tokenised text file to read.

    Returns:
        A list of sentences, each a list of ``(token, begin, end)`` tuples
        where ``end == begin + len(token)``. Empty lines yield no sentence.
    """
    with open(filename, "r") as f:
        raw_lines = f.read().split("\n")

    sentences = []
    offset = 0
    for raw in raw_lines:
        if not raw:
            # An empty line only accounts for its newline character.
            offset += 1
            continue
        sentence = []
        for word in raw.split(" "):
            stop = offset + len(word)
            sentence.append((word, offset, stop))
            # Step over the single separator that follows the token.
            offset = stop + 1
        sentences.append(sentence)
    return sentences

def extract_labels(anno, sentences):
    """Extract one IOB label per token.

    Annotations are consumed front to back with a single cursor, so the
    algorithm relies on ``anno`` being ordered by position and on the spans
    not overlapping.

    Args:
        anno: List of ``(label, begin, end)`` tuples, as produced by
            ``read_annotations``.
        sentences: List of sentences, each a list of ``(token, begin, end)``
            tuples, as produced by ``read_sentences``.

    Returns:
        A list parallel to ``sentences``. Each inner list holds exactly one
        label per token: ``"O"`` for tokens outside the current annotation,
        ``"B-" + label`` for a token starting exactly at the annotation's
        begin, and ``"I-" + label`` for a token starting after it.
    """
    sent_labels = []
    ann_id = 0
    for tokens in sentences:
        labels = []
        for token in tokens:
            if ann_id >= len(anno):
                # No annotations left: every remaining token is outside.
                # (Previously this "O" lived in a for-else, which added one
                # stray label per sentence and left these tokens unlabelled.)
                labels.append("O")
                continue
            label, beg, end = anno[ann_id]
            if token[1] < beg:
                labels.append("O")
                continue
            prefix = "B-" if token[1] == beg else "I-"
            labels.append(prefix + label)
            # ">=" rather than "==": an annotation that ends inside this token
            # must still be closed, otherwise the cursor would never advance.
            if token[2] >= end:
                ann_id += 1
        sent_labels.append(labels)
    return sent_labels

In [2]:
# Entries of the split file, grouped by the "DEV" / "TEST" header they follow.
dev_test = {"dev": [], "test": []}
category = ""  # key of the section currently being read; set by a header line
with open(PATH + "doc/dev-test-split.txt", "r") as f:
    for line in f:
        line = line.strip()
        if not line:
            # Blank separator lines carry no entry.
            continue
        if line in ("DEV", "TEST"):
            category = line.lower()
        else:
            dev_test[category].append(line)

print(len(dev_test["dev"]), len(dev_test["test"]))

182 80


In [3]:
# Accumulators for both splits; sentences and labels are extended in lockstep.
train_sentences, train_labels = [], []
test_sentences, test_labels = [], []

# Files listed under "dev" feed the train_* lists.
for filename in dev_test["dev"]:
    sentences = read_sentences(PATH + "data/" + filename + ".txt")
    train_sentences.extend(sentences)
    train_labels.extend(
        extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), sentences)
    )

# Files listed under "test" feed the test_* lists.
for filename in dev_test["test"]:
    sentences = read_sentences(PATH + "data/" + filename + ".txt")
    test_sentences.extend(sentences)
    test_labels.extend(
        extract_labels(read_annotations(PATH + "data/" + filename + ".ann"), sentences)
    )

In [4]:
# Output layout: one "token label" line per token, a blank line between sentences.
with open("dev.txt", "w") as f:
    iob_sents = [
        "\n".join(tok[0] + " " + tag for tok, tag in zip(sentence, labels))
        for sentence, labels in zip(train_sentences, train_labels)
    ]
    f.write("\n\n".join(iob_sents))

with open("test.txt", "w") as f:
    iob_sents = [
        "\n".join(tok[0] + " " + tag for tok, tag in zip(sentence, labels))
        for sentence, labels in zip(test_sentences, test_labels)
    ]
    f.write("\n\n".join(iob_sents))