In [1]:
import utils
import numpy as np
from itertools import chain
from collections import Counter, defaultdict

In [2]:
# raw = utils.load_raw("./data/信息抽取_第一阶段/xxcq_small.json")
raw = utils.load_raw("./data/信息抽取_第二阶段/xxcq_mid.json")

## 文本长度统计

In [3]:
text_lengths = [len(line["text"]) for line in raw]

In [4]:
print(min(text_lengths), max(text_lengths))

5 439


## 标签统计

In [5]:
utils.LABEL_MEANING_MAP

{'NHCS': '犯罪嫌疑人',
 'NHVI': '受害人',
 'NCSM': '被盗货币',
 'NCGV': '物品价值',
 'NCSP': '盗窃获利',
 'NASI': '被盗物品',
 'NATS': '作案工具',
 'NT': '时间',
 'NS': '地点',
 'NO': '组织机构'}

In [6]:
entities = [line["entities"] for line in raw]
flatten_entities = list(chain(*entities))

In [7]:
entity_types = [entity[0] for entity in flatten_entities]

In [8]:
flatten_entities[0]

('NHVI', 22, 24, '严某某')

In [9]:
Counter(entity_types)

Counter({'NHVI': 3108,
         'NASI': 5781,
         'NO': 806,
         'NHCS': 6463,
         'NCGV': 2090,
         'NT': 2765,
         'NS': 3517,
         'NCSM': 915,
         'NATS': 735,
         'NCSP': 481})

In [10]:
entities_text_lengths = [len(entity[-1]) for entity in flatten_entities]

In [11]:
sorted(Counter(entities_text_lengths).items(), key=lambda x: x[0])

[(1, 79),
 (2, 1273),
 (3, 9809),
 (4, 1825),
 (5, 1468),
 (6, 1101),
 (7, 1366),
 (8, 1512),
 (9, 1090),
 (10, 962),
 (11, 851),
 (12, 1000),
 (13, 724),
 (14, 714),
 (15, 581),
 (16, 410),
 (17, 355),
 (18, 276),
 (19, 219),
 (20, 203),
 (21, 154),
 (22, 130),
 (23, 129),
 (24, 86),
 (25, 66),
 (26, 62),
 (27, 51),
 (28, 37),
 (29, 23),
 (30, 17),
 (31, 21),
 (32, 16),
 (33, 7),
 (34, 10),
 (35, 7),
 (36, 6),
 (37, 5),
 (38, 1),
 (39, 2),
 (40, 3),
 (41, 1),
 (43, 1),
 (44, 1),
 (45, 1),
 (51, 1),
 (53, 1),
 (57, 1),
 (60, 1),
 (65, 1),
 (68, 1)]

In [12]:
label_entities_map = defaultdict(list)
for entity in flatten_entities:
    label_entities_map[entity[0]].append(entity[-1])

In [13]:
for label, entities in label_entities_map.items():
    print(label, utils.LABEL_MEANING_MAP[label])
    entity_lengths = [len(entity) for entity in entities]
    lower_q=np.quantile(entity_lengths,0.25,interpolation='lower')#下四分位数
    higher_q=np.quantile(entity_lengths,0.75,interpolation='higher')#上四分位数
    mid_q=np.quantile(entity_lengths,0.5,interpolation='higher')#上四分位数
    print(lower_q, mid_q, higher_q)
    counter = Counter(entity_lengths)
    print(sorted(counter.items(), key=lambda x: x[0]))
    entities = sorted(list(set(entities)), key=len)
    with open(f"tmp/{label}_{utils.LABEL_MEANING_MAP[label]}.txt", "w") as f:
        f.writelines([entity + "\n" for entity in entities])

NHVI 受害人
3 3 3
[(1, 3), (2, 55), (3, 3021), (4, 18), (5, 4), (6, 2), (7, 1), (8, 1), (12, 1), (16, 1), (18, 1)]
NASI 被盗物品
4 6 10
[(1, 46), (2, 558), (3, 482), (4, 816), (5, 600), (6, 471), (7, 467), (8, 396), (9, 310), (10, 320), (11, 235), (12, 215), (13, 153), (14, 126), (15, 102), (16, 81), (17, 63), (18, 56), (19, 38), (20, 45), (21, 39), (22, 22), (23, 18), (24, 14), (25, 14), (26, 19), (27, 14), (28, 9), (29, 4), (30, 10), (31, 6), (32, 4), (33, 3), (35, 3), (36, 5), (37, 2), (38, 1), (39, 2), (40, 3), (43, 1), (44, 1), (45, 1), (51, 1), (53, 1), (57, 1), (60, 1), (65, 1), (68, 1)]
NO 组织机构
4 9 10
[(2, 1), (3, 15), (4, 216), (5, 25), (6, 94), (7, 13), (8, 29), (9, 199), (10, 33), (11, 48), (12, 64), (13, 18), (14, 8), (15, 11), (16, 8), (17, 9), (18, 3), (19, 1), (20, 4), (21, 1), (22, 3), (24, 2), (25, 1)]
NHCS 犯罪嫌疑人
3 3 3
[(1, 12), (2, 383), (3, 5908), (4, 98), (5, 26), (6, 14), (7, 5), (8, 5), (9, 9), (18, 3)]
NCGV 物品价值
5 7 8
[(2, 2), (3, 32), (4, 227), (5, 365), (6, 140), (7, 

In [14]:
# 实体重叠
type_count_map = defaultdict(int)
is_intersect = lambda a, b: min(a[1], b[1]) - max(a[0], b[0]) > 0
for entities in [line["entities"] for line in raw]:
    entities = sorted(entities, key=lambda x: (x[1], x[2]))
    # print(entities)
    num_entities = len(entities)
    for i in range(num_entities):
        ti, bi, ei, si = entities[i]
        for j in range(i + 1, num_entities):
            tj, bj, ej, sj = entities[j]
            if is_intersect((bi, ei), (bj, ej)):
                # if utils.LABEL_MEANING_MAP[ti] == "地点":
                #     print(entities[i], entities[j])
                type_count_map[(
                    utils.LABEL_MEANING_MAP[ti], 
                    utils.LABEL_MEANING_MAP[tj])] += 1
                # print(entities[i], entities[j])

In [15]:
sorted(type_count_map.items(), key=lambda x: x[0])

[(('受害人', '作案工具'), 1),
 (('受害人', '地点'), 177),
 (('受害人', '时间'), 1),
 (('受害人', '物品价值'), 1),
 (('受害人', '被盗物品'), 392),
 (('受害人', '被盗货币'), 51),
 (('地点', '作案工具'), 1),
 (('地点', '受害人'), 302),
 (('地点', '物品价值'), 1),
 (('地点', '犯罪嫌疑人'), 2),
 (('地点', '组织机构'), 7),
 (('地点', '被盗物品'), 3),
 (('地点', '被盗货币'), 2),
 (('物品价值', '被盗物品'), 1),
 (('犯罪嫌疑人', '作案工具'), 13),
 (('犯罪嫌疑人', '地点'), 1),
 (('犯罪嫌疑人', '被盗物品'), 11),
 (('犯罪嫌疑人', '被盗货币'), 2),
 (('组织机构', '地点'), 8),
 (('组织机构', '被盗物品'), 12),
 (('被盗物品', '作案工具'), 3),
 (('被盗物品', '受害人'), 25),
 (('被盗物品', '物品价值'), 79)]