In [1]:
import re
import json
import random
from collections import defaultdict, OrderedDict, Counter

In [2]:
data_dir = "/Users/lvzhiheng/Desktop/data/travel_KG"
baike_dir = os.path.join(data_dir, "带url的正文和摘要triple/第二轮")

In [3]:
coarse_types = ["建筑", "景点", "老字号门店", "人物", "文物", "组织机构"]
# types that cannot be identified from text
non_expressive_types = ["文化节庆场所", "亲子游景区"]

# convert type names to full path names, e.g. "名人故居" -> "/景点/人文历史景点/名人故居"
parent_class = {}
with open(os.path.join(data_dir, "subclassof.txt"), "r") as file:
    for line in file:
        category, parent_category = line.strip().split()
        parent_class[category] = parent_category

# query up parent_class until root
full_type_name = {}
for category in parent_class:
    full_name = category
    parent_category = parent_class[category]
    while parent_category != "":
        full_name = f"{parent_category}/{full_name}"
        parent_category = parent_class[parent_category] if parent_category in parent_class else ""
    full_type_name[category] = "/" + full_name

full_type_name.update({type_name: f"/{type_name}" for type_name in coarse_types})

# for type_name, full_name in full_type_name.items():
#     print(type_name, "->", full_name)

# read entity types from files
entity_types = defaultdict(list)
with open(os.path.join(data_dir, "instanceOf_jd.txt"), "r") as file:
    for line in file:
        entity, type_name = line.strip().split()
        # NOTE filter non_expressive_types compared with v1.x
        if type_name in non_expressive_types:
            continue
        type_name = full_type_name[type_name]
        if type_name not in entity_types[entity]:
            entity_types[entity].append(type_name)
print(f"#entity with fine-grained types: {len(entity_types)}")

for type_name in coarse_types:
    file_path = os.path.join(data_dir, f"entities/{type_name}.txt")
    with open(file_path, "r") as file:
        for line in file:
            entity = line.strip()
            full_name = full_type_name[type_name]
            if full_name in entity_types[entity]:
                continue
            entity_types[entity].append(full_name)
print(f"#total entities: {len(entity_types)}")

# add all up-level types to entity_types
for entity, types in entity_types.items():
    expand_types = []
    for type_name in types:
        levels = type_name[1:].split("/")
        for idx in range(1, len(levels) + 1):
            parent_type = "/" + "/".join(levels[:idx])
            if parent_type not in expand_types:
                expand_types.append(parent_type)
    entity_types[entity] = expand_types

#entity with fine-grained types: 1539
#total entities: 62073


In [4]:
# NOTE revise test examples manually
prev_test_file = os.path.join(data_dir, "FET_v1.0/test.json")

with open(prev_test_file, "r") as file:
    prev_test_examples = json.load(file)
single_label_examples = [example for example in prev_test_examples if len(example["labels"]) == 1]

# multi_label examples are manually revised
with open(os.path.join(data_dir, "FET/multi_label_split1_tag.json"), "r") as reader:
    multi_label_split1 = json.load(reader)
with open(os.path.join(data_dir, "FET/multi_label_split2_tag.json"), "r") as reader:
    multi_label_split2 = json.load(reader)
multi_label_examples = multi_label_split1 + multi_label_split2

# remove empty examples
multi_label_examples = [example for example in multi_label_examples if len(example["labels"]) > 0]

# remove "文化节庆场所", "亲子游景区" types
for example in multi_label_examples:
    filtered_labels = [label for label in example["labels"] if "文化节庆场所" not in label and "亲子游景区" not in label]
    example["labels"] = filtered_labels

# test_entities for avoiding train & test entity overlapping
test_examples = single_label_examples + multi_label_examples
test_entities = set()
for example in test_examples:
    sentence = example["sent"]
    entity = sentence[example["start"]:example["end"]]
    test_entities.add(entity)
    # sanity check
    assert len(example["labels"]) > 0
    assert all("文化节庆场所" not in label and "亲子游景区" not in label for label in example["labels"])

In [5]:
def extract_sentences(doc):
    content = re.sub(r"([。？！])", r"\1\n", doc)
    sentences = []
    for sentence in content.split():
        if len(sentence) < 4:
            continue
        sentences.append(sentence)
    return sentences


examples = []
# DS to build FET dataset
for file_name in os.listdir(baike_dir):
    file_path = os.path.join(baike_dir, file_name)
    print(f"Read {file_path}...")

    with open(file_path, "r") as file:
        for line in file:
            # name  (full name in baike)  url  abstract/article
            # (full name) / abstract / article may be absent
            parts = line.split("\t\t")
            article = parts[-1].strip()
            if len(article) == 0:
                continue

            paragraphs = article.split("::;")
            if paragraphs[0] not in ["AbstractHere", "ArticleHere"]:
                continue
            for paragraph in paragraphs[1:]:
                if paragraph.startswith("==") and paragraph.endswith("=="):
                    continue

                for sentence in extract_sentences(paragraph):
                    # find anchor links
                    anchor_spans = [match.span() for match in re.finditer(r"\[\[(.*?)\|(.*?)\]\]", sentence)]
                    if len(anchor_spans) == 0:
                        continue
                    text, entity_spans = [], []
                    end_point, length = 0, 0
                    for start, end in anchor_spans:
                        text.append(sentence[end_point:start])
                        length += start - end_point
                        # replace anchor link with raw text
                        entity, url = sentence[start:end][2:-2].split("|")
                        text.append(entity)
                        entity_spans.append([length, length + len(entity)])
                        length += len(entity)
                        end_point = end
                    text.append(sentence[end_point:])
                    text = "".join(text)

                    for start, end in entity_spans:
                        entity = text[start:end]
                        # TODO: more accurate ways to get entity types
                        if entity not in entity_types or entity in test_entities:
                            continue
                        examples.append(OrderedDict([
                            ("sent", text),
                            ("labels", entity_types[entity]),
                            ("start", start),
                            ("end", end)
                        ]))

Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/new_组织机构_bd_abstract.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/new_组织机构_bd_article.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/no_article_baike.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/人物_bd_article.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/new_建筑_bd_abstract.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/new_建筑_bd_article.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/景点_1414_newFile_bd_article.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/人物_bd_abstract.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/景点_1414_newFile_bd_abstract.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/new_文物_bd_article.txt...
Read /Users/lvzhiheng/Desktop/data/travel_KG/带url的正文和摘要triple/第二轮/new_文物_

In [6]:
train_file = os.path.join(data_dir, "FET/train.json")
dev_file = os.path.join(data_dir, "FET/dev.json")
test_file = os.path.join(data_dir, "FET/test.json")


def remove_duplication(examples):
    example_ids = set()
    non_dup_examples = []
    for example in examples:
        # same (sent, start, end) as the same example
        example_id = (example["sent"], example["start"], example["end"])
        if example_id in example_ids:
            continue
        non_dup_examples.append(example)
        example_ids.add(example_id)
    return non_dup_examples


examples = remove_duplication(examples)

# random split to train & dev
entity_examples = defaultdict(list)
for idx, example in enumerate(examples):
    entity = example["sent"][example["start"] : example["end"]]
    entity_examples[entity].append(idx)

all_entity = list(entity_examples.keys())
random.seed(1234)
random.shuffle(all_entity)

# ensure train dev no entity overlap
# TODO: better ways to compare entites (e.g. entity id) rather than text
train_examples, dev_examples = [], []
n_entity = 0
while n_entity < len(all_entity) and len(train_examples) < len(examples) * 0.9:
    entity = all_entity[n_entity]
    train_examples += [examples[idx] for idx in entity_examples[entity]]
    n_entity += 1

while n_entity < len(all_entity):
    entity = all_entity[n_entity]
    dev_examples += [examples[idx] for idx in entity_examples[entity]]
    n_entity += 1
print(f"#train_examples: {len(train_examples)}, ratio: {len(train_examples) / len(examples)}")
print(f"#dev_examples: {len(dev_examples)}, ratio: {len(dev_examples) / len(examples)}")


test_examples = remove_duplication(test_examples)
# train dataset has no "/景点/自然风光/池塘" label
test_examples = [example for example in test_examples if "/景点/自然风光/池塘" not in example["labels"]]

for target_file, split_examples in zip([train_file, dev_file, test_file],
                                       [train_examples, dev_examples, test_examples]):
    with open(target_file, "w") as writer:
        json.dump(split_examples, writer, ensure_ascii=False, indent=4)

#train_examples: 168679, ratio: 0.9002166766288106
#dev_examples: 18697, ratio: 0.09978332337118948


In [7]:
# train data sentence length distribution
import numpy as np

def dataset_stat(examples):
    print("sentence length percentage")
    sent_lens = np.array([len(example["sent"]) for example in examples])
    for i in range(90, 101):
        print(i, np.percentile(sent_lens, i))

    label_set = set()
    n_fine_grained = 0
    for example in examples:
        label_set.update(example["labels"])
        if len(example["labels"]) > 1:
            n_fine_grained += 1

    print(f"\n#examples in dataset: {len(examples)}")
    print(f"#labels in dataset: {len(label_set)}")
    print(f"#fine grained examples (> 1 labels) ratio: {n_fine_grained / len(examples)}")

    # example type distribution
    type_count = Counter()
    for example in examples:
        type_count.update(example["labels"])
    print("\ntype distribution")
    for name, count in type_count.most_common():
        print(name, count, f"{count / len(examples):.5f}")

n_fine_grained_entity = sum(1 if len(types) > 1 else 0 for entity, types in entity_types.items())
print(f"ratio of entities having > 1 types: {n_fine_grained_entity / len(entity_types)}")

ratio of entities having > 1 types: 0.0319462568266396


In [8]:
dataset_stat(train_examples)

sentence length percentage
90 114.0
91 119.0
92 125.0
93 133.0
94 143.0
95 154.0
96 171.0
97 195.0
98 226.44000000000233
99 291.0
100 1838.0

#examples in dataset: 168679
#labels in dataset: 32
#fine grained examples (> 1 labels) ratio: 0.018022397571719063

type distribution
/人物 117127 0.69438
/组织机构 42898 0.25432
/建筑 7301 0.04328
/文物 2063 0.01223
/景点 1840 0.01091
/景点/场馆 333 0.00197
/景点/自然风光 284 0.00168
/景点/宗教场所 237 0.00141
/景点/自然风光/野外景色 220 0.00130
/景点/宗教场所/寺庙 211 0.00125
/景点/场馆/文化展馆 185 0.00110
/景点/场馆/文化展馆/博物馆 185 0.00110
/景点/人文历史景点 142 0.00084
/景点/场馆/公园场馆 133 0.00079
/景点/场馆/公园场馆/公园 121 0.00072
/景点/人文历史景点/红色旅游景区 55 0.00033
/景点/人文历史景点/名人故居 44 0.00026
/景点/自然风光/山峰 43 0.00025
/景点/自然风光/湖泊 42 0.00025
/景点/自然风光/池塘 36 0.00021
/老字号门店 29 0.00017
/景点/人文历史景点/世界文化遗址 27 0.00016
/景点/宗教场所/教堂 26 0.00015
/景点/场馆/公园场馆/游乐园 19 0.00011
/景点/人文历史景点/皇家园林 19 0.00011
/景点/自然风光/河流 16 0.00009
/景点/自然风光/温泉 15 0.00009
/景点/场馆/演出场馆 15 0.00009
/景点/场馆/演出场馆/剧院 14 0.00008
/景点/场馆/演出场馆/小剧场 9 0.00005
/景点/人文历史景点/胡同 6 0.00004
/景

In [9]:
dataset_stat(dev_examples)

sentence length percentage
90 109.0
91 114.0
92 120.0
93 126.0
94 135.0
95 146.0
96 160.0
97 182.11999999999898
98 216.0799999999981
99 275.0
100 1194.0

#examples in dataset: 18697
#labels in dataset: 23
#fine grained examples (> 1 labels) ratio: 0.011713109054928597

type distribution
/人物 13941 0.74563
/组织机构 3845 0.20565
/建筑 874 0.04675
/景点 142 0.00759
/文物 83 0.00444
/景点/自然风光 40 0.00214
/景点/自然风光/野外景色 32 0.00171
/景点/宗教场所 21 0.00112
/景点/宗教场所/寺庙 21 0.00112
/景点/自然风光/山峰 14 0.00075
/景点/场馆 7 0.00037
/老字号门店 5 0.00027
/景点/人文历史景点 4 0.00021
/景点/人文历史景点/名人故居 4 0.00021
/景点/场馆/文化展馆 4 0.00021
/景点/场馆/文化展馆/博物馆 4 0.00021
/景点/自然风光/湖泊 3 0.00016
/景点/场馆/公园场馆 2 0.00011
/景点/场馆/公园场馆/游乐园 1 0.00005
/景点/场馆/公园场馆/公园 1 0.00005
/景点/场馆/演出场馆 1 0.00005
/景点/场馆/演出场馆/小剧场 1 0.00005
/景点/场馆/演出场馆/剧院 1 0.00005


In [10]:
dataset_stat(test_examples)

sentence length percentage
90 101.0
91 105.0
92 107.0
93 111.0
94 114.0
95 118.0
96 125.0
97 128.0
98 133.0
99 140.14999999999964
100 150.0

#examples in dataset: 4586
#labels in dataset: 30
#fine grained examples (> 1 labels) ratio: 0.2030091583078936

type distribution
/人物 1591 0.34693
/景点 1057 0.23048
/建筑 996 0.21718
/组织机构 749 0.16332
/景点/人文历史景点 568 0.12386
/文物 544 0.11862
/景点/场馆 197 0.04296
/景点/自然风光 143 0.03118
/景点/场馆/公园场馆 115 0.02508
/景点/自然风光/野外景色 112 0.02442
/景点/场馆/公园场馆/公园 110 0.02399
/景点/宗教场所 98 0.02137
/景点/宗教场所/寺庙 91 0.01984
/景点/人文历史景点/世界文化遗址 69 0.01505
/景点/场馆/文化展馆 53 0.01156
/景点/场馆/文化展馆/博物馆 46 0.01003
/景点/场馆/演出场馆/剧院 31 0.00676
/景点/场馆/演出场馆 31 0.00676
/景点/自然风光/山峰 26 0.00567
/景点/人文历史景点/皇家园林 25 0.00545
/景点/人文历史景点/红色旅游景区 23 0.00502
/景点/场馆/演出场馆/小剧场 17 0.00371
/老字号门店 11 0.00240
/景点/自然风光/湖泊 11 0.00240
/景点/人文历史景点/名人故居 10 0.00218
/景点/人文历史景点/胡同 10 0.00218
/景点/宗教场所/教堂 5 0.00109
/景点/场馆/公园场馆/游乐园 4 0.00087
/景点/自然风光/温泉 3 0.00065
/景点/自然风光/雪山 1 0.00022
