In [1]:
import json
import os
from collections import OrderedDict

import numpy as np

## FIGER dataset exploration

In [2]:
# Location of the FIGER dataset and the JSON file of each split.
data_dir = "/data1/lzh/data/ernie_data/FIGER"
train_file, dev_file, test_file = (
    os.path.join(data_dir, split_name)
    for split_name in ("train.json", "dev.json", "test.json")
)

In [3]:
def split_stat(file_name):
    """Print summary statistics for one dataset split stored as a JSON file.

    The file must contain a JSON list of examples; each example carries a
    "labels" list of "/"-separated type paths with a leading slash
    (e.g. "/person/artist"). Three figures are printed:

    * the number of examples,
    * the number of distinct labels over the whole split,
    * the number of examples in which at least one label is missing one of
      its ancestors (for "/a/b/c" both "/a" and "/a/b" must be present).

    Args:
        file_name: path of the JSON file to analyse.

    Returns:
        None. All results are printed.
    """
    print(f"dataset file: {file_name}")
    # Read as UTF-8 explicitly so the result does not depend on the locale.
    with open(file_name, "r", encoding="utf-8") as file:
        data = json.load(file)

    #sentences
    print(f"#sentences: {len(data)}")

    #labels (entity types)
    labels = set()
    for example in data:
        labels.update(example["labels"])
    print(f"#labels (entity types): {len(labels)}")

    # check if all parent classes are in labels
    no_parent_examples = 0
    for example in data:
        # Separate name: do not clobber the split-wide `labels` set above.
        example_labels = set(example["labels"])
        for label in example_labels:
            level = label[1:].split("/")
            # Every proper prefix of the path is an ancestor, not just the root.
            ancestors = (
                "/" + "/".join(level[:depth]) for depth in range(1, len(level))
            )
            if any(ancestor not in example_labels for ancestor in ancestors):
                # Count each example at most once.
                no_parent_examples += 1
                break
    print(f"#examples that not all parent classes are in labels: {no_parent_examples}")

In [4]:
# Print the split_stat summary (example count, label count, parent check) for train_file.
split_stat(train_file)

dataset file: /data1/lzh/data/ernie_data/FIGER/train.json
#sentences: 2000000
#labels (entity types): 113
#examples that not all parent classes are in labels: 0


In [3]:
# Load the train and test splits; `train_data` and `test_data` stay in the
# kernel namespace for the cells that follow.
# JSON is read as UTF-8 explicitly so the result does not depend on the locale.
with open(train_file, "r", encoding="utf-8") as file:
    train_data = json.load(file)
with open(test_file, "r", encoding="utf-8") as file:
    test_data = json.load(file)

# sentence length distribution (length = len() of each example's "sent" field),
# upper tail only: 90th to 100th percentile.
sent_lens = np.array([len(example["sent"]) for example in train_data])
percentiles = list(range(90, 101))
# A single call computes every percentile instead of re-scanning the array 11 times.
for i, value in zip(percentiles, np.percentile(sent_lens, percentiles)):
    print(i, value)

90 238.0
91 241.0
92 245.0
93 249.0
94 253.0
95 257.0
96 262.0
97 268.0
98 276.0
99 287.0
100 951.0


In [6]:
# Entity overlap between the train and test splits.
# A mention is the slice of "sent" delimited by the "start" / "end" offsets.
train_entities = {ex["sent"][ex["start"] : ex["end"]] for ex in train_data}
test_entities = {ex["sent"][ex["start"] : ex["end"]] for ex in test_data}

print(f"#train_entities: {len(train_entities)}")
print(f"#test_entities: {len(test_entities)}")
print(f"#overlap entities: {len(train_entities.intersection(test_entities))}")

# Share of examples annotated with more than one type, reported per split.
# Summing booleans counts the examples for which the condition holds.
n_fine_grained = sum(len(ex["labels"]) > 1 for ex in train_data)
print(f"ratio of examples with > 1 types in train data: {n_fine_grained / len(train_data)}")
n_fine_grained = sum(len(ex["labels"]) > 1 for ex in test_data)
print(f"ratio of examples with > 1 types in test data: {n_fine_grained / len(test_data)}")

#train_entities: 592396
#test_entities: 331
#overlap entities: 137
ratio of examples with > 1 types in train data: 0.7572125
ratio of examples with > 1 types in test data: 0.325044404973357


In [7]:
# Print the split_stat summary (example count, label count, parent check) for dev_file.
split_stat(dev_file)

dataset file: /data1/lzh/data/ernie_data/FIGER/dev.json
#sentences: 10000
#labels (entity types): 113
#examples that not all parent classes are in labels: 0


In [8]:
# Print the split_stat summary (example count, label count, parent check) for test_file.
split_stat(test_file)

dataset file: /data1/lzh/data/ernie_data/FIGER/test.json
#sentences: 563
#labels (entity types): 41
#examples that not all parent classes are in labels: 50


## Convert BIO NER data to FET data

In [9]:
# Directory holding the BIO-tagged NER files and the path of each split.
# NOTE: this rebinds train_file / dev_file / test_file, which pointed at the
# FIGER JSON files earlier in the notebook.
traval_ner_dir = "/data1/lzh/exp/pretrain/tourism-transformers/utils/tner"
train_file, dev_file, test_file = (
    os.path.join(traval_ner_dir, bio_name)
    for bio_name in ("bio.txtl.train", "bio.txtl.dev", "bio.txtl.test")
)

In [10]:
def _bio_to_spans(chars_with_bio):
    """Decode one sentence of "<char> <tag>" lines into its text and entity spans.

    Args:
        chars_with_bio: list of lines, each holding a character and a tag
            separated by a single space; a tag is "O", "B-<label>" or
            "I-<label>".

    Returns:
        (sentence, spans): the list of characters, and a list of
        [label, start, end] entries where `end` is exclusive.
    """
    sentence, spans = [], []
    label, start = None, -1  # start == -1 means no entity is currently open
    for idx, entry in enumerate(chars_with_bio):
        char, bio = entry.split(" ")
        sentence.append(char)

        assert bio[0] in "BIO"
        if bio[0] == "I" and start != -1:
            # Continuation of the open entity.
            continue
        # "O", a new "B", or an "I" with nothing open: close the open entity
        # first, so an entity directly followed by another "B" is not lost.
        if start != -1:
            spans.append([label, start, idx])
            start = -1
        if bio[0] == "B":
            label, start = bio[2:], idx
    # An entity that runs to the end of the sentence has no trailing "O"
    # to close it, so flush it here.
    if start != -1:
        spans.append([label, start, len(sentence)])
    return sentence, spans


# Pool the three BIO files into one string; sentences are separated by blank lines.
data = ""
for file_name in [train_file, dev_file, test_file]:
    with open(file_name, "r", encoding="utf-8") as file:
        data += "\n\n" + file.read()

# Drop empty blocks and stray newlines produced by trailing newlines in a file
# or by more than one blank line between sentences.
examples = [
    block.strip("\n") for block in data.strip().split("\n\n") if block.strip("\n")
]
typing_examples = []
for example in examples:
    sentence, spans = _bio_to_spans(example.split("\n"))

    # One typing example per entity mention, all sharing the sentence text.
    for label, start, end in spans:
        assert 0 <= start < end <= len(sentence)
        typing_examples.append(OrderedDict([
            ("sent", "".join(sentence)),
            ("labels", [label]),
            ("start", start),
            ("end", end)
        ]))

# dataset statistics
print(f"#typing examples: {len(typing_examples)}")
entities, labels = set(), set()
for example in typing_examples:
    start, end = example["start"], example["end"]
    entities.add(example["sent"][start:end])
    labels.update(example["labels"])
print(f"#entities: {len(entities)}")
print(f"#labels: {len(labels)} {labels}")

# save to dir
# All three input files are written to this single output file. With
# ensure_ascii=False the text is emitted as-is, so the encoding is pinned to UTF-8.
target_path = "/data1/lzh/data/tourism-FET/test.json"
with open(target_path, "w", encoding="utf-8") as writer:
    json.dump(typing_examples, writer, ensure_ascii=False, indent=4)


#typing examples: 4693
#entities: 2488
#labels: 6 {'建筑', '人物', '文物', '门店', '组织机构', '景点'}
