In [1]:
from collections import Counter, defaultdict
from itertools import chain
from pathlib import Path

import utils

In [2]:
# Path of the dataset file, named so it is easy to find and change.
RAW_DATA_PATH = "./data/信息抽取_第一阶段/xxcq_small.json"
raw = utils.load_raw(RAW_DATA_PATH)

## 文本长度统计

In [3]:
# Length of the "text" field of every record in the raw data.
text_lengths = [
    len(record["text"])
    for record in raw
]

In [4]:
# Report the shortest and the longest text length, in that order.
shortest_text, longest_text = min(text_lengths), max(text_lengths)
print(shortest_text, longest_text)

7 439


## 标签统计

In [5]:
# Display the mapping from label codes to their meanings.
utils.LABEL_MEANING_MAP

{'NHCS': '犯罪嫌疑人',
 'NHVI': '受害人',
 'NCSM': '被盗货币',
 'NCGV': '物品价值',
 'NCSP': '盗窃获利',
 'NASI': '被盗物品',
 'NATS': '作案工具',
 'NT': '时间',
 'NS': '地点',
 'NO': '组织机构'}

In [6]:
# One entity list per record, then a single flat list of all entities.
entities = [record["entities"] for record in raw]
flatten_entities = list(chain.from_iterable(entities))

In [7]:
# The label is the first field of each entity.
entity_types = [label for label, *_ in flatten_entities]

In [8]:
# Peek at the first flattened entity to see what a single entity looks like.
flatten_entities[0]

('NHCS', 3, 5, '朱某某')

In [9]:
# Number of entities per label.
entity_type_counts = Counter(entity_types)
entity_type_counts

Counter({'NHCS': 2935,
         'NHVI': 1299,
         'NASI': 2555,
         'NT': 1226,
         'NS': 1580,
         'NCGV': 947,
         'NO': 346,
         'NATS': 294,
         'NCSP': 186,
         'NCSM': 418})

In [10]:
# Length of the last field (the entity string) of every entity.
entities_text_lengths = [len(surface) for *_, surface in flatten_entities]

In [11]:
# Histogram of entity string lengths, listed from shortest to longest.
entity_length_histogram = Counter(entities_text_lengths)
sorted(entity_length_histogram.items(), key=lambda pair: pair[0])

[(1, 22),
 (2, 565),
 (3, 4337),
 (4, 764),
 (5, 725),
 (6, 476),
 (7, 595),
 (8, 651),
 (9, 504),
 (10, 415),
 (11, 397),
 (12, 428),
 (13, 311),
 (14, 311),
 (15, 241),
 (16, 173),
 (17, 162),
 (18, 120),
 (19, 99),
 (20, 95),
 (21, 70),
 (22, 67),
 (23, 70),
 (24, 37),
 (25, 28),
 (26, 29),
 (27, 21),
 (28, 21),
 (29, 10),
 (30, 5),
 (31, 10),
 (32, 6),
 (33, 4),
 (34, 4),
 (35, 4),
 (36, 1),
 (37, 1),
 (38, 1),
 (39, 1),
 (40, 2),
 (44, 1),
 (45, 1),
 (57, 1)]

In [12]:
# Group entity strings (last field) under their label (first field).
label_entities_map = defaultdict(list)
for record in flatten_entities:
    label, surface = record[0], record[-1]
    label_entities_map[label].append(surface)

In [13]:
# Per-label report: print the length distribution of each label's entity
# strings, then write the de-duplicated strings to tmp/<label>_<meaning>.txt.
output_dir = Path("tmp")
# open(..., "w") does not create missing directories, so make sure it exists.
output_dir.mkdir(parents=True, exist_ok=True)

# The loop variable is deliberately not called `entities`, so the per-record
# `entities` list built earlier in the notebook is not overwritten.
for label, label_entities in label_entities_map.items():
    meaning = utils.LABEL_MEANING_MAP[label]
    print(label, meaning)
    print(Counter(len(entity) for entity in label_entities))

    # Sort by length, breaking ties lexicographically: iteration order of a
    # set of strings changes between interpreter runs, so sorting on length
    # alone would produce a different file on every run.
    unique_entities = sorted(set(label_entities), key=lambda text: (len(text), text))

    # Explicit UTF-8: the entity strings are Chinese text and the platform
    # default encoding is not guaranteed to be able to represent them.
    with open(output_dir / f"{label}_{meaning}.txt", "w", encoding="utf-8") as f:
        f.writelines(f"{entity}\n" for entity in unique_entities)

NHCS 犯罪嫌疑人
Counter({3: 2711, 2: 162, 4: 28, 5: 19, 9: 9, 7: 4, 1: 1, 6: 1})
NHVI 受害人
Counter({3: 1274, 2: 20, 4: 4, 8: 1})
NASI 被盗物品
Counter({4: 342, 5: 275, 2: 267, 7: 203, 3: 199, 6: 199, 8: 190, 10: 146, 9: 139, 11: 112, 12: 101, 13: 71, 14: 60, 17: 33, 15: 32, 18: 26, 16: 25, 1: 21, 21: 17, 20: 16, 19: 13, 23: 10, 26: 9, 22: 8, 27: 6, 28: 5, 25: 5, 24: 5, 30: 4, 32: 2, 35: 2, 40: 2, 29: 2, 31: 2, 33: 1, 39: 1, 38: 1, 44: 1, 45: 1, 57: 1})
NT 时间
Counter({12: 215, 14: 155, 13: 150, 11: 147, 10: 136, 15: 105, 9: 79, 16: 62, 17: 37, 7: 32, 8: 30, 18: 14, 5: 12, 6: 10, 22: 7, 4: 7, 3: 6, 2: 6, 19: 5, 20: 4, 23: 3, 24: 1, 26: 1, 21: 1, 32: 1})
NS 地点
Counter({5: 99, 14: 91, 17: 87, 15: 87, 6: 83, 19: 78, 18: 78, 16: 74, 4: 71, 12: 70, 13: 69, 11: 69, 20: 68, 9: 63, 7: 62, 23: 57, 8: 53, 22: 52, 21: 50, 10: 39, 24: 30, 3: 28, 25: 23, 2: 21, 26: 18, 28: 16, 27: 15, 31: 8, 29: 8, 34: 4, 32: 3, 33: 3, 30: 1, 37: 1, 36: 1})
NCGV 物品价值
Counter({8: 255, 5: 194, 7: 187, 4: 122, 9: 67, 6: 64, 11: 2