In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from  tqdm import tqdm
import math

In [165]:

def find_subtext_index(text, keywords):
    """

    Args:
        text: 出生后感染性肺炎可出现发热或体温不升
        keywords: keywords = ['发热', '体温不升', '反应差']

    Returns: [[11, 13]]

    """
    index_ls = []
    for keyword in keywords:
        escaped_keyword = re.escape(keyword)
        matches = re.finditer(escaped_keyword, text)
        # matches = re.finditer(keyword, text)
        indices = [match.start() for match in matches]
        for value in indices:
            index_ls.append([value, value+len(keyword)])

    # 使用集合进行去重
    unique_list = [list(x) for x in set(tuple(x) for x in index_ls)]
    unique_list.sort(key=lambda x: x[0])

    return unique_list


def label_BMEO(text,label_index):
    """

    Args:
        text: 列表形式的文字  例如：['腺', '样', '体', '肥', '大', '是', '临', '床']
        label_index: 标签对应的下标位置 [[1,3],[2,5]]  # 最后一个不要
    Returns:

    """
    label  = np.zeros(len(text),dtype=np.int8).tolist()
    for index in range(len(text)):
        for label_index_period in label_index:
            if index in range(label_index_period[0], label_index_period[1]):
                label[index] = 1
    text = pd.Series(text)
    label = pd.Series(label)
    new_pd = pd.DataFrame([text, label]).T
    new_pd.columns = ['text','label']
    new_pd.reset_index(inplace=True,drop=True)
    labels = new_pd['label'].values
    label_BMEO = []
    for i in range(len(labels)):
        three_values  = labels[i-1:i+2].tolist()
        if len(three_values) == 0:
            three_values = [0,labels[i],labels[i+1]]
        if three_values == [0,1,1]:
            label_BMEO.append('s-B')
        elif three_values == [1,1,1]:
            label_BMEO.append('s-M')
        elif three_values == [1,1,0]:
            label_BMEO.append('s-E')
        else:
            label_BMEO.append('o')
    new_pd['label_BMEO'] = pd.Series(label_BMEO)
    new_pd = new_pd[['text','label_BMEO']]
    return new_pd['text'].values, new_pd['label_BMEO']



def save_pandas_file(file_train,pandas_file_name='CMedCausal'):
    file_train.columns = ['text','label_BMEO']
    out_train = file_train[:int(len(file_train)*0.95)]
    out_dev = file_train[int(len(file_train)*0.95):int(len(file_train) * 0.99)]
    out_test = file_train[int(len(file_train)*0.99):]
    out_train.to_csv(f'dataset/data_train_test_dev/train_{pandas_file_name}_bmeo.csv',index=False)
    out_dev.to_csv(f'dataset/data_train_test_dev/dev_{pandas_file_name}_bmeo.csv',index=False)
    out_test.to_csv(f'dataset/data_train_test_dev/test_{pandas_file_name}_bmeo.csv',index=False)


In [166]:
def func(string):
    string = str(string)
    string = string.replace('，',',')
    string = string.replace('、',',')
    string = string.replace('。',',')
    if string[-1] == ',':
        return string[:-1]
    else:return string


In [167]:
file_excel = pd.read_excel("dataset/标注数据.xlsx")
file_excel['标注_1'] = file_excel['标注'].apply(func)
file_excel['length'] = file_excel['原文'].apply(lambda x: len(str(x)))


In [168]:
file_excel

Unnamed: 0,原文,标注,标注_1,length
0,颈部疼痛5月余5月前患者因工作原因长时间低头伏案工作后出现出现颈部疼痛，无言语及肢体不利等症...,颈部疼痛、高血压、糖尿病、冠心病、肝炎、结核、手术史、输血史、食物过敏史、药物过敏史、颈椎生...,"颈部疼痛,高血压,糖尿病,冠心病,肝炎,结核,手术史,输血史,食物过敏史,药物过敏史,颈椎生...",457
1,头晕伴左侧肢体麻木无力5天余5天前出现头晕昏沉不适，伴左侧肢体麻木无力，走路时向左侧倾斜心房...,头晕、左侧肢体麻木、无力、心房纤颤、冠心病、脑梗塞、神志清、精神尚可、营养中等、浮肿、巴氏征...,"头晕,左侧肢体麻木,无力,心房纤颤,冠心病,脑梗塞,神志清,精神尚可,营养中等,浮肿,巴氏征...",531
2,"头晕伴双下肢水肿半年,加重3天头晕伴双下肢水肿半年余,3天前发现双下肢水肿加重，伴下肢沉重无...",头晕、下肢水肿、高血压、糖尿病、眼睑浮肿、凹性水肿、足背动脉搏动、反射。,"头晕,下肢水肿,高血压,糖尿病,眼睑浮肿,凹性水肿,足背动脉搏动,反射",366
3,头晕、恶心伴左上肢麻木7天7天前颈肩不适，扭头头晕、恶心，颈椎回位头晕恶心减轻，平卧休息左手...,头晕、恶心、左上肢麻木、颈肩不适、颈椎回位、左手小指、无名指麻木、颈椎病、高血压病、冠心病、...,"头晕,恶心,左上肢麻木,颈肩不适,颈椎回位,左手小指,无名指麻木,颈椎病,高血压病,冠心病,...",517
4,全身多处关节疼痛伴双侧上肢麻木1月全身多处关节疼痛伴双侧上肢麻木1月既往有“类风湿性关节炎”...,类风湿性关节炎、颈椎间盘突出症、关节疼痛、麻木、神志、语言流利、精神、皮肤、淋巴结、眼睑、颈...,"类风湿性关节炎,颈椎间盘突出症,关节疼痛,麻木,神志,语言流利,精神,皮肤,淋巴结,眼睑,颈...",424
...,...,...,...,...
20005,发现肉眼血尿伴排尿困难5天。患者儿子代诉1月前患者曾因“左肾占位性病变”在焦作市第二人民医院...,肉眼血尿、排尿困难、左肾占位性病变、高血压、糖尿病、高血脂病史、心脏病、脑血管病、精神病史、...,"肉眼血尿,排尿困难,左肾占位性病变,高血压,糖尿病,高血脂病史,心脏病,脑血管病,精神病史,...",1006
20006,头晕、头懵、胸闷、右上肢无力3天3天前出现发作性头晕、头懵、胸闷、右上肢无力，呈发作性心肌供...,头晕、头懵、胸闷、右上肢无力、心肌供血不足、胃炎、胆囊炎、心律不整、上腹部压疼、右上肢肌力、...,"头晕,头懵,胸闷,右上肢无力,心肌供血不足,胃炎,胆囊炎,心律不整,上腹部压疼,右上肢肌力,...",227
20007,头晕3天，加重伴双上肢乏力、酸困1天头晕加重，伴头疼，伴双上肢乏力、酸困、麻木，伴晕沉感，伴...,头晕、上肢乏力、酸困、头疼、麻木、晕沉感、纳差、高血压、脑梗死。,"头晕,上肢乏力,酸困,头疼,麻木,晕沉感,纳差,高血压,脑梗死",102
20008,间断头晕无明显诱因出现头晕无无随当地社会进行无无无出生于本地22岁结婚无否认有家族遗传病史发...,头晕、家族遗传病史、发育正常、黄染、肿大、胸廓、压痛、叩压痛、心肌缺血。,"头晕,家族遗传病史,发育正常,黄染,肿大,胸廓,压痛,叩压痛,心肌缺血",88


In [169]:
file_excel.describe([.6, .7, .8,.9])  # max——length  800 大于的拆分

Unnamed: 0,length
count,20010.0
mean,458.625137
std,271.302456
min,7.0
50%,418.0
60%,484.0
70%,555.3
80%,642.0
90%,784.0
max,2352.0


In [170]:
text_ls, label_index_ls = [],[]
split_limit = 450
for value in tqdm(file_excel[['原文','标注_1']].values):
    text , sub_text = value[0],value[1]
    # 此处若文本大于500， 按照句号拆分，且句号要求是在
    sub_text_ls = str(sub_text).split(",")

    if len(text)>split_limit:
        # index_ls = find_subtext_index(text, sub_text_ls)
        juhao_ls = find_subtext_index(text, ['。'])
        juhao_ls = [value[1] for value in juhao_ls]
        # 找到比450的倍数小的， 将数据切分为比450小的数据片段,且这个切分线不能在标签的区间内
        split_pices = math.ceil(len(text)/ split_limit)
        split_bias = [0]
        for j in range(split_pices+1): # 找到切分的下标也就是偏置
            for i, value in enumerate(juhao_ls):
                if i+1 < len(juhao_ls) and juhao_ls[i] < split_limit*j and juhao_ls[i+1] > split_limit*j:
                    split_bias.append(value) # value 是需要切分的位置
        split_bias.append(len(text))
        for i, vlaue in enumerate(split_bias):
            if i+1< len(split_bias):
                temp_text = text[split_bias[i]: split_bias[i+1]]
                index_ls = find_subtext_index(temp_text, sub_text_ls)
                temp_text = list(temp_text)
                temp_text,label_index = label_BMEO(temp_text, index_ls)
                text_ls.append("".join(temp_text.tolist()))
                label_index_ls.append("\t".join(label_index.tolist()))
    else:
        index_ls = find_subtext_index(text, sub_text_ls)
        text = list(text)
        text,label_index = label_BMEO(text, index_ls)
        text_ls.append("".join(text.tolist()))
        label_index_ls.append("\t".join(label_index.tolist()))


100%|██████████| 20010/20010 [04:09<00:00, 80.35it/s] 


In [171]:
Row_pd_file = pd.DataFrame([text_ls, label_index_ls]).T
Row_pd_file

Unnamed: 0,0,1
0,颈部疼痛5月余5月前患者因工作原因长时间低头伏案工作后出现出现颈部疼痛，无言语及肢体不利等症...,s-B\ts-M\ts-M\ts-E\to\to\to\to\to\to\to\to\to\...
1,颈椎DR：颈椎生理曲度变直：各颈椎椎体骨质疏松，诸颈椎椎体前后缘上下角骨质增生：诸颈椎椎间隙...,s-B\ts-M\ts-M\ts-E\to\ts-B\ts-M\ts-M\ts-M\ts-M...
2,头晕伴左侧肢体麻木无力5天余5天前出现头晕昏沉不适，伴左侧肢体麻木无力，走路时向左侧倾斜心房...,s-B\ts-E\to\ts-B\ts-M\ts-M\ts-M\ts-M\ts-M\ts-M...
3,四肢关节活动可，左侧肢体肌力3级，右侧肢体肌力4级，双下肢轻度浮肿。左巴氏征阳性，余生理性反...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
4,"头晕伴双下肢水肿半年,加重3天头晕伴双下肢水肿半年余,3天前发现双下肢水肿加重，伴下肢沉重无...",s-B\ts-E\to\to\ts-B\ts-M\ts-M\ts-E\to\to\to\to...
...,...,...
29070,肠鸣音正常，无血管杂音，活动度正常活动度正常病理反射未引出腹部平坦，未见胃肠型及蠕动波，腹壁...,o\to\to\to\to\to\to\to\to\to\to\to\to\to\to\to...
29071,头晕、头懵、胸闷、右上肢无力3天3天前出现发作性头晕、头懵、胸闷、右上肢无力，呈发作性心肌供...,s-B\ts-E\to\ts-B\ts-E\to\ts-B\ts-E\to\ts-B\ts-...
29072,头晕3天，加重伴双上肢乏力、酸困1天头晕加重，伴头疼，伴双上肢乏力、酸困、麻木，伴晕沉感，伴...,s-B\ts-E\to\to\to\to\to\to\to\ts-B\ts-M\ts-M\t...
29073,间断头晕无明显诱因出现头晕无无随当地社会进行无无无出生于本地22岁结婚无否认有家族遗传病史发...,o\to\ts-B\ts-E\to\to\to\to\to\to\to\ts-B\ts-E\...


In [172]:
Row_pd_file.columns = ['text','label_BMEO']
save_pandas_file(Row_pd_file,'Row')