In [14]:
import pandas as pd
import glob
import json
import numpy as np
from  tqdm import tqdm, trange
import jsonlines
import re
import math

In [15]:

def find_subtext_index(text, keywords):
    """

    Args:
        text: 出生后感染性肺炎可出现发热或体温不升
        keywords: keywords = ['发热', '体温不升', '反应差']

    Returns: [[11, 13]]

    """
    index_ls = []
    for keyword in keywords:
        escaped_keyword = re.escape(keyword)
        matches = re.finditer(escaped_keyword, text)
        # matches = re.finditer(keyword, text)
        indices = [match.start() for match in matches]
        for value in indices:
            index_ls.append([value, value+len(keyword)])

    # 使用集合进行去重
    unique_list = [list(x) for x in set(tuple(x) for x in index_ls)]
    return unique_list


def label_BMEO(text,label_index):
    """

    Args:
        text: 列表形式的文字  例如：['腺', '样', '体', '肥', '大', '是', '临', '床']
        label_index: 标签对应的下标位置 [[1,3],[2,5]]
    Returns:

    """
    label  = np.zeros(len(text),dtype=np.int8).tolist()
    for index in range(len(text)):
        for label_index_period in label_index:
            if index in range(label_index_period[0], label_index_period[1]):
                label[index] = 1
    text = pd.Series(text)
    label = pd.Series(label)
    new_pd = pd.DataFrame([text, label]).T
    new_pd.columns = ['text','label']
    new_pd.reset_index(inplace=True,drop=True)
    labels = new_pd['label'].values
    label_BMEO = []
    for i in range(len(labels)):
        three_values  = labels[i-1:i+2].tolist()
        if len(three_values) == 0:
            three_values = [0,labels[i],labels[i+1]]
        if three_values == [0,1,1]:
            label_BMEO.append('s-B')
        elif three_values == [1,1,1]:
            label_BMEO.append('s-M')
        elif three_values == [1,1,0]:
            label_BMEO.append('s-E')
        else:
            label_BMEO.append('o')
    new_pd['label_BMEO'] = pd.Series(label_BMEO)
    new_pd = new_pd[['text','label_BMEO']]
    return new_pd['text'].values, new_pd['label_BMEO']



def save_pandas_file(file_train,pandas_file_name='CMedCausal'):
    file_train.columns = ['text','label_BMEO']
    out_train = file_train[:int(len(file_train)*0.95)]
    out_dev = file_train[int(len(file_train)*0.95):int(len(file_train) * 0.99)]
    out_test = file_train[int(len(file_train)*0.99):]
    out_train.to_csv(f'dataset/data_train_test_dev/train_{pandas_file_name}_bmeo.csv',index=False)
    out_dev.to_csv(f'dataset/data_train_test_dev/dev_{pandas_file_name}_bmeo.csv',index=False)
    out_test.to_csv(f'dataset/data_train_test_dev/test_{pandas_file_name}_bmeo.csv',index=False)


In [38]:
data_ls = []
split_limit = 450
file_path = "dataset/医疗实体识别能用的数据/CMeIE-V2/CMeIE-V2_train.jsonl"
text_ls, label_index_ls = [],[]
for value in ['train', 'dev']:
    temp_path = file_path.split('_')
    temp_path_str = temp_path[0] + '_' + value + '.jsonl'
    print(temp_path_str)
    with open(temp_path_str, "r+", encoding="utf8") as f:
        for i, item in tqdm(enumerate(jsonlines.Reader(f))):
            text = item['text']
            sub_text_ls = []
            for value in item['spo_list']:
                try:
                    if value['object_type']['@value'] == '症状':
                        sub_text_ls.append(value['object']['@value'])
                except:
                    pass
            if len(text)>split_limit:
                # index_ls = find_subtext_index(text, sub_text_ls)
                juhao_ls = find_subtext_index(text, ['。'])
                juhao_ls = [value[1] for value in juhao_ls]
                # 找到比450的倍数小的， 将数据切分为比450小的数据片段,且这个切分线不能在标签的区间内
                split_pices = math.ceil(len(text)/ split_limit)
                split_bias = [0]
                for j in range(split_pices+1): # 找到切分的下标也就是偏置
                    for i, value in enumerate(juhao_ls):
                        if i+1 < len(juhao_ls) and juhao_ls[i] < split_limit*j and juhao_ls[i+1] > split_limit*j:
                            split_bias.append(value) # value 是需要切分的位置
                split_bias.append(len(text))
                for i, vlaue in enumerate(split_bias):
                    if i+1< len(split_bias):
                        temp_text = text[split_bias[i]: split_bias[i+1]]
                        index_ls = find_subtext_index(temp_text, sub_text_ls)
                        temp_text = list(temp_text)
                        temp_text,label_index = label_BMEO(temp_text, index_ls)
                        text_ls.append("".join(temp_text.tolist()))
                        label_index_ls.append("\t".join(label_index.tolist()))
            else:
                index_ls = find_subtext_index(text, sub_text_ls)
                text = list(text)
                text,label_index = label_BMEO(text, index_ls)
                text_ls.append("".join(text.tolist()))
                label_index_ls.append("\t".join(label_index.tolist()))


dataset/医疗实体识别能用的数据/CMeIE-V2/CMeIE-V2_train.jsonl


14339it [00:42, 338.19it/s]


dataset/医疗实体识别能用的数据/CMeIE-V2/CMeIE-V2_dev.jsonl


3585it [00:10, 342.31it/s]


In [39]:
CMeIE_V2_pd_file = pd.DataFrame([text_ls, label_index_ls]).T
CMeIE_V2_pd_file

Unnamed: 0,0,1
0,溶血性贫血@ * 获得性溶血性贫血可分为免疫性和非免疫性： * 自身抗体是免疫介导的溶血性贫...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
1,女孩中约80%的中枢性性早熟患儿为特发性性早熟。 根据性早熟的发病机制和病因，可将之分为中枢...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
2,类癌综合征@类癌综合征患者手术前应该开始输注奥曲肽以防止类癌瘤危象。,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
3,帕金森病@### 轻度帕金森症 左旋多巴被认为是确定性治疗方法，而且研究表明它不会加速疾病进展。,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
4,（二）反流样消化不良 突出的表现是胸骨后痛，胃灼热，反流。,o\to\to\ts-B\ts-E\to\to\to\to\to\to\to\to\to\t...
...,...,...
17919,脑炎@脑膜炎症患者会出现脑膜脑炎体征，如头痛、畏光、颈项强直。,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
17920,4.当慢性再障在病程中病情恶化临床表现、血象及骨髓象与急性再障相同时，称为重型再障Ⅱ型（SA...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
17921,普通感冒@## 监测 多数患者不需要监护。普通感冒@如有明确慢性阻塞性肺疾病病史或既往肺炎病...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
17922,病毒性脑膜炎@腰椎穿刺可确诊病毒性脑膜炎，从而停止抗生素治疗。,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...


In [40]:
CMeIE_V2_pd_file.columns = ['text','label_BMEO']
save_pandas_file(CMeIE_V2_pd_file,'CMeIE_V2')

In [25]:
data_ls = []
file_path = "dataset/医疗实体识别能用的数据/IMCS-V2-DAC/IMCS-V2_train.json"
text_ls, label_index_ls = [],[]
split_limit = 450
for value in ['train', 'dev']:
    temp_path = file_path.split('_')
    temp_path_str = temp_path[0] + '_' + value + '.json'

    with open(temp_path_str, 'r', encoding='utf-8') as f:
        temp_data = json.load(f)
        key_id_ls = list(temp_data.keys())
        for key_id in tqdm(key_id_ls) :
            data_dict = temp_data[key_id]
            sub_text_ls = []
            text = ""
            for value in data_dict['dialogue']:
                text += value['sentence']
                sub_text_ls.extend(value['symptom_norm'])
            text = str(text)
            if len(text)>split_limit:
                # index_ls = find_subtext_index(text, sub_text_ls)
                juhao_ls = find_subtext_index(text, ['。'])
                juhao_ls = [value[1] for value in juhao_ls]
                # 找到比450的倍数小的， 将数据切分为比450小的数据片段,且这个切分线不能在标签的区间内
                split_pices = math.ceil(len(text)/ split_limit)
                split_bias = [0]
                for j in range(split_pices+1): # 找到切分的下标也就是偏置
                    for i, value in enumerate(juhao_ls):
                        if i+1 < len(juhao_ls) and juhao_ls[i] < split_limit*j and juhao_ls[i+1] > split_limit*j:
                            split_bias.append(value) # value 是需要切分的位置
                split_bias.append(len(text))
                for i, vlaue in enumerate(split_bias):
                    if i+1< len(split_bias):
                        temp_text = text[split_bias[i]: split_bias[i+1]]
                        temp_text = str(temp_text)
                        index_ls = find_subtext_index(temp_text, sub_text_ls)
                        temp_text = list(temp_text)
                        temp_text,label_index = label_BMEO(temp_text, index_ls)
                        text_ls.append("".join(temp_text.tolist()))
                        label_index_ls.append("\t".join(label_index.tolist()))
            else:

                index_ls = find_subtext_index(text, sub_text_ls)
                text = list(text)
                text,label_index = label_BMEO(text, index_ls)
                text_ls.append("".join(text.tolist()))
                label_index_ls.append("\t".join(label_index.tolist()))

dataset/医疗实体识别能用的数据/IMCS-V2-DAC/IMCS-V2_train.json


  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = p

dataset/医疗实体识别能用的数据/IMCS-V2-DAC/IMCS-V2_dev.json


  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = pd.Series(text)
  label = pd.Series(label)
  new_pd['label_BMEO'] = pd.Series(label_BMEO)
  text = p

In [35]:
IMCS_V2_pd_file = pd.DataFrame([text_ls, label_index_ls]).T
IMCS_V2_pd_file.columns = ['text','label_BMEO']
IMCS_V2_pd_file = IMCS_V2_pd_file[IMCS_V2_pd_file['text'] != '']

In [36]:
save_pandas_file(IMCS_V2_pd_file,'IMCS_V2')