In [1]:
import pandas as pd
import glob
import numpy as np
from tqdm import tqdm, trange
import re

In [2]:

def find_subtext_index(text, keywords):
    """

    Args:
        text: 出生后感染性肺炎可出现发热或体温不升
        keywords: keywords = ['发热', '体温不升', '反应差']

    Returns: [[11, 13]]

    """
    index_ls = []
    for keyword in keywords:
        escaped_keyword = re.escape(keyword)
        matches = re.finditer(escaped_keyword, text)
        # matches = re.finditer(keyword, text)
        indices = [match.start() for match in matches]
        for value in indices:
            index_ls.append([value, value+len(keyword)])

    # 使用集合进行去重
    unique_list = [list(x) for x in set(tuple(x) for x in index_ls)]
    return unique_list


def label_BMEO(text,label_index):
    """

    Args:
        text: 列表形式的文字  例如：['腺', '样', '体', '肥', '大', '是', '临', '床']
        label_index: 标签对应的下标位置 [[1,3],[2,5]]
    Returns:

    """
    label  = np.zeros(len(text),dtype=np.int8).tolist()
    for index in range(len(text)):
        for label_index_period in label_index:
            if index in range(label_index_period[0], label_index_period[1]+1):
                label[index] = 1
    text = pd.Series(text)
    label = pd.Series(label)
    new_pd = pd.DataFrame([text, label]).T
    new_pd.columns = ['text','label']
    new_pd.reset_index(inplace=True,drop=True)
    labels = new_pd['label'].values
    label_BMEO = []
    for i in range(len(labels)):
        three_values  = labels[i-1:i+2].tolist()
        if len(three_values) == 0:
            three_values = [0,labels[i],labels[i+1]]
        if three_values == [0,1,1]:
            label_BMEO.append('s-B')
        elif three_values == [1,1,1]:
            label_BMEO.append('s-M')
        elif three_values == [1,1,0]:
            label_BMEO.append('s-E')
        else:
            label_BMEO.append('o')
    new_pd['label_BMEO'] = pd.Series(label_BMEO)
    new_pd = new_pd[['text','label_BMEO']]
    return new_pd['text'].values, new_pd['label_BMEO']



def save_pandas_file(file_train,pandas_file_name='CMedCausal'):
    file_train.columns = ['text','label_BMEO']
    out_train = file_train[:int(len(file_train)*0.8)]
    out_dev = file_train[int(len(file_train)*0.8):int(len(file_train) * 0.9)]
    out_test = file_train[int(len(file_train)*0.9):]
    out_train.to_csv(f'dataset/data_train_test_dev/train_{pandas_file_name}_bmeo.csv',index=False)
    out_dev.to_csv(f'dataset/data_train_test_dev/dev_{pandas_file_name}_bmeo.csv',index=False)
    out_test.to_csv(f'dataset/data_train_test_dev/test_{pandas_file_name}_bmeo.csv',index=False)


In [3]:
all_txt_original = glob.glob("dataset/data_1/*/*.txtoriginal.txt")
all_txt_original = list(set(all_txt_original))
text_ls, label_index_ls = [],[]
for path_ in tqdm(all_txt_original):
    try:
        original_txt = pd.read_csv(path_,header=None).values[0,0]
        res = path_.split('.')
        string_res = res[0]+"."+res[-1]
        temp_file = pd.read_csv(string_res,delimiter='\t',header=None)
        temp_file = temp_file.loc[temp_file[3] == '症状和体征']
        label_index = temp_file[[1,2]].values.tolist()
        text = list(original_txt)
        text,label_index = label_BMEO(text, label_index)
        text_ls.append("".join(text.tolist()))
        label_index_ls.append("\t".join(label_index.tolist()))
    except:
        pass


100%|██████████| 1198/1198 [00:10<00:00, 109.87it/s]


In [5]:
# save_pandas_file(CMedCausal_pd_file,'CMedCausal')
hospital_pd_file.columns = ['text','label_BMEO']
save_pandas_file(hospital_pd_file,'hospital')

In [4]:
hospital_pd_file = pd.DataFrame([text_ls, label_index_ls]).T
hospital_pd_file

Unnamed: 0,0,1
0,患儿入院第6天，偶有咳嗽，无发热，食欲、睡眠尚可，二便正常。查体：T: 36.5℃，神志清...,o\to\to\to\to\to\to\to\to\to\ts-B\ts-E\to\to\t...
1,患者诉右侧阴囊疼痛不明显，无腹痛，腹胀，无尿频，尿急，无发热，饮食及二便正常。查体：神清，精...,o\to\to\to\to\to\to\ts-B\ts-E\to\to\to\to\to\t...
2,1.患者老年男性，慢性发病，病史1年。既往患高血压病史6年，前列腺炎病史6年，否认肝炎及结核...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
3,患儿入院第10天，咳嗽、咳痰明显减轻，无发热。食欲、睡眠尚可，二便正常。查体：T: 36....,o\to\to\to\to\to\to\to\to\ts-B\ts-E\to\to\to\t...
4,患者无心悸气短，左侧肢体活动不灵减轻，查体：Bp130/80mmHg,o\to\to\ts-B\ts-M\ts-M\ts-E\to\to\to\to\to\to\...
...,...,...
1161,患儿住院第6天，无发热，未见丘疹，偶有咳嗽。睡眠饮食尚可，二便正常。查体：T: 36.8℃...,o\to\to\to\to\to\to\to\to\ts-B\ts-E\to\to\to\t...
1162,6岁，生于河北省承德市，现住河北省承德市，主因外伤后头痛30分钟于2016-10-14入院。,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
1163,病例特点：1、患者老年女性75岁，既往有脑出血症病史2年，未遗留后遗症。慢性胃炎病史5 年间...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
1164,男，4岁，河北省承德市双滦区御祥园1号楼4单元408人，主因咳嗽、咳痰1个月，加重伴发热3天...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...


12