In [1]:
from collections import Counter, defaultdict
from itertools import chain
from pathlib import Path

import utils

In [2]:
# Load the annotated corpus; later cells read each record's "text" and "entities" fields.
raw = utils.load_raw("./data/信息抽取_第一阶段/xxcq_small.json")

## 文本长度统计

In [3]:
# One length per record, taken from the record's "text" field.
text_lengths = [len(record["text"]) for record in raw]

In [4]:
# Smallest and largest len(text) across the corpus.
print(min(text_lengths), max(text_lengths))

7 439


## 标签统计

In [5]:
# Lookup from label code to its Chinese description.
utils.LABEL_MEANING_MAP

{'NHCS': '犯罪嫌疑人',
 'NHVI': '受害人',
 'NCSM': '被盗货币',
 'NCGV': '物品价值',
 'NCSP': '盗窃获利',
 'NASI': '被盗物品',
 'NATS': '作案工具',
 'NT': '时间',
 'NS': '地点',
 'NO': '组织机构'}

In [6]:
# Per-record entity lists, then every entity of the corpus in one flat list.
entities = [record["entities"] for record in raw]
flatten_entities = list(chain.from_iterable(entities))

In [7]:
# The label code is the first item of every entity record.
entity_types = [item[0] for item in flatten_entities]

In [8]:
# Inspect one entity record; index 0 holds the label and the last item the entity text.
flatten_entities[0]

('NHCS', 3, 5, '朱某某')

In [9]:
# Number of annotated entities per label.
Counter(entity_types)

Counter({'NHCS': 2935,
         'NHVI': 1299,
         'NASI': 2555,
         'NT': 1226,
         'NS': 1580,
         'NCGV': 947,
         'NO': 346,
         'NATS': 294,
         'NCSP': 186,
         'NCSM': 418})

In [10]:
# Length of each entity's text, which is the last item of the entity record.
entities_text_lengths = [len(item[-1]) for item in flatten_entities]

In [11]:
# (length, count) pairs in ascending length order; lengths are unique keys,
# so plain tuple ordering already sorts by length.
sorted(Counter(entities_text_lengths).items())

[(1, 22),
 (2, 565),
 (3, 4337),
 (4, 764),
 (5, 725),
 (6, 476),
 (7, 595),
 (8, 651),
 (9, 504),
 (10, 415),
 (11, 397),
 (12, 428),
 (13, 311),
 (14, 311),
 (15, 241),
 (16, 173),
 (17, 162),
 (18, 120),
 (19, 99),
 (20, 95),
 (21, 70),
 (22, 67),
 (23, 70),
 (24, 37),
 (25, 28),
 (26, 29),
 (27, 21),
 (28, 21),
 (29, 10),
 (30, 5),
 (31, 10),
 (32, 6),
 (33, 4),
 (34, 4),
 (35, 4),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 2),
 (44, 1),
 (45, 1),
 (57, 1)]

In [12]:
# Group entity texts by label: label code -> list of every text tagged with it.
label_entities_map = defaultdict(list)
for entity in flatten_entities:
    entity_label, entity_text = entity[0], entity[-1]
    label_entities_map[entity_label].append(entity_text)

In [13]:
# Per-label report: print the entity-length histogram and dump the unique
# entity texts to tmp/<label>_<meaning>.txt for manual inspection.
Path("tmp").mkdir(exist_ok=True)  # a fresh checkout has no tmp/ directory yet
for label, entity_texts in label_entities_map.items():
    # Named `entity_texts` (not `entities`) so the per-record `entities` list
    # built in the earlier cell is not silently overwritten by this loop.
    meaning = utils.LABEL_MEANING_MAP[label]
    print(label, meaning)
    counter = Counter(len(text) for text in entity_texts)
    print(sorted(counter.items(), key=lambda x: x[0]))
    # Sort by (length, text): set iteration order varies between runs, so
    # sorting by length alone would reorder equal-length entries on each run.
    unique_texts = sorted(set(entity_texts), key=lambda text: (len(text), text))
    # The texts are Chinese; pin UTF-8 instead of relying on the platform default.
    with open(f"tmp/{label}_{meaning}.txt", "w", encoding="utf-8") as f:
        f.writelines(text + "\n" for text in unique_texts)

NHCS 犯罪嫌疑人
[(1, 1), (2, 162), (3, 2711), (4, 28), (5, 19), (6, 1), (7, 4), (9, 9)]
NHVI 受害人
[(2, 20), (3, 1274), (4, 4), (8, 1)]
NASI 被盗物品
[(1, 21), (2, 267), (3, 199), (4, 342), (5, 275), (6, 199), (7, 203), (8, 190), (9, 139), (10, 146), (11, 112), (12, 101), (13, 71), (14, 60), (15, 32), (16, 25), (17, 33), (18, 26), (19, 13), (20, 16), (21, 17), (22, 8), (23, 10), (24, 5), (25, 5), (26, 9), (27, 6), (28, 5), (29, 2), (30, 4), (31, 2), (32, 2), (33, 1), (35, 2), (38, 1), (39, 1), (40, 2), (44, 1), (45, 1), (57, 1)]
NT 时间
[(2, 6), (3, 6), (4, 7), (5, 12), (6, 10), (7, 32), (8, 30), (9, 79), (10, 136), (11, 147), (12, 215), (13, 150), (14, 155), (15, 105), (16, 62), (17, 37), (18, 14), (19, 5), (20, 4), (21, 1), (22, 7), (23, 3), (24, 1), (26, 1), (32, 1)]
NS 地点
[(2, 21), (3, 28), (4, 71), (5, 99), (6, 83), (7, 62), (8, 53), (9, 63), (10, 39), (11, 69), (12, 70), (13, 69), (14, 91), (15, 87), (16, 74), (17, 87), (18, 78), (19, 78), (20, 68), (21, 50), (22, 52), (23, 57), (24, 30), (25