In [1]:
import os
import random
import copy
import re
import json
import jsonlines
import numpy as np
import pandas as pd
from tqdm import tqdm
import numpy as np
from collections import Counter

In [2]:
# Keys carried by each sample dict assembled later in this notebook.
key_columns = [
    "input",
    "target",
    "answer_choices",
    "task_type",
    "task_dataset",
    "sample_id",
]

# Every template ends with one of two explanatory notes followed by the answer cue,
# so the shared tails are spelled out once. Note that "\\n" here is the two-character
# sequence backslash + "n", not a newline character.
_MULTI_VALUE_NOTE = "\\n说明：临床发现事件的主体词包含发生状态，描述词和解剖部位这三种属性，其中描述词和解剖部位可能有多个值\\n答："
_COMPOSITION_NOTE = "说明：临床发现事件由主体词，发生状态，描述词和解剖部位组成\\n答："

# Prompt templates; "[INPUT_TEXT]" marks the slot for the raw sentence.
templates = [
    "临床发现事件抽取：\\n[INPUT_TEXT]" + _MULTI_VALUE_NOTE,
    "找出指定的临床发现事件属性：\\n[INPUT_TEXT]\\n事件抽取" + _COMPOSITION_NOTE,
    "找出句子中的临床发现事件及其属性：\\n [INPUT_TEXT]" + _MULTI_VALUE_NOTE,
    "[INPUT_TEXT]\\n问题：句子中的临床发现事件及其属性是什么？\\n" + _COMPOSITION_NOTE,
    "生成句子中的临床发现事件属性是：\\n[INPUT_TEXT]" + _MULTI_VALUE_NOTE,
    "[INPUT_TEXT]\\n这个句子里面临床发现事件是？\\n" + _COMPOSITION_NOTE,
    "临床发现事件抽取：[INPUT_TEXT]" + _MULTI_VALUE_NOTE,
]

In [3]:
def read_data(data_path):
    '''Load every record of the jsonlines file at ``data_path`` into a list, in file order.'''
    with jsonlines.open(data_path, "r") as reader:
        records = [record for record in reader]
    return records

In [4]:
def save_data(data_path, data):
    '''Write each item of ``data`` as one record of the jsonlines file at ``data_path``.

    The file is opened in "w" mode, so any existing content is replaced.
    '''
    with jsonlines.open(data_path, "w") as writer:
        writer.write_all(data)

In [5]:
def get_re(templates):
    '''Turn prompt templates into regex patterns that capture the raw input text.

    For each template the "[INPUT_TEXT]" slot becomes a capture group matching
    any run of characters (line breaks included), and the trailing answer cue
    ("\\n答：" spelled with a literal backslash) is removed.

    The remaining template text is used as regex source verbatim; it is NOT
    escaped. A backslash followed by "n" in a template is therefore read by
    ``re`` as a newline escape, and any regex metacharacter in a template
    keeps its special meaning.

    Args:
        templates: iterable of template strings.

    Returns:
        list of pattern strings, one per template, in the same order.
    '''
    # Raw string: "\s" / "\S" are not valid Python string escapes, so the
    # non-raw spelling triggers an invalid-escape-sequence warning.
    capture_group = r"([\s\S]*)"

    return [
        temp.replace("[INPUT_TEXT]", capture_group).replace("\\n答：", "")
        for temp in templates
    ]

In [6]:
# Recover the raw sentence behind every templated training input.
train_data = read_data("train.json")
temp_re = get_re(templates)
input_list, answer_list = [], []
anti_input = []  # inputs that match none of the template patterns

for meta_data in train_data:
    for temp in temp_re:
        # Run each pattern once and reuse the match object; the first pattern
        # that matches wins.
        matched = re.match(temp, meta_data["input"])
        if matched:
            input_list.append(matched[1])
            answer_list.append(meta_data["target"])
            break
    else:
        # Loop finished without `break`: no template fits this input.
        anti_input.append(meta_data["input"])

In [7]:
# Sanity check: total vs. distinct counts of the extracted inputs and of their targets.
len(input_list), len(set(input_list)), len(answer_list), len(set(answer_list))

(3000, 1361, 3000, 1339)

In [8]:
# Drop repeated inputs, keeping the first occurrence (and its target) in order.
new_input_list, new_answer_list = [], []
seen_inputs = set()  # constant-time membership test instead of rescanning the list

for text, answer in zip(input_list, answer_list):
    if text not in seen_inputs:
        seen_inputs.add(text)
        new_input_list.append(text)
        new_answer_list.append(answer)

print("New data size: %d" % len(new_input_list))

New data size: 1361


In [9]:
def get_input(raw_input, n_re=2):
    '''Wrap a raw sentence in ``n_re`` distinct, randomly chosen prompt templates.

    Templates are drawn without replacement from the module-level ``templates``
    list via ``random.sample``, so the result depends on the global RNG state.

    Args:
        raw_input: text substituted for the "[INPUT_TEXT]" slot.
        n_re: how many different templates to use.

    Returns:
        list of ``n_re`` prompt strings.

    Raises:
        ValueError: if ``n_re`` exceeds the number of available templates
            (raised by ``random.sample``).
    '''
    # NOTE(review): the templates spell line breaks as a literal backslash + "n",
    # and that two-character sequence is copied into the prompts unchanged, whereas
    # get_re's patterns read it as a regex newline escape. Confirm the generated
    # prompts are meant to carry the literal sequence rather than real newlines.
    # str.replace already returns a new string, so the template needs no copy.
    return [
        template.replace("[INPUT_TEXT]", raw_input)
        for template in random.sample(templates, k=n_re)
    ]

In [10]:
# Build the de-duplicated training set: num_per templated prompts per unique input.
new_data = []
num_per = 2  # prompt variants generated for each unique input
count = 0  # running index used for sample_id
random.seed(42)  # template choice is random; fix the seed so a re-run rebuilds the same file
for text, answer in zip(new_input_list, new_answer_list):
    # Pass num_per through so that changing it actually changes the output.
    for meta_input in get_input(text, n_re=num_per):
        # NOTE(review): "null" is written as a JSON string, not JSON null;
        # confirm this matches the schema of the original train.json.
        new_data.append({"input": meta_input, "target": answer, "answer_choices": "null",
                         "task_type": "event_extraction", "task_dataset": "CHIP-CDEE",
                         "sample_id": "train-" + str(count)})
        count += 1

In [11]:
# Number of generated samples (two prompts per unique input).
len(new_data)

2722

In [12]:
# Persist the re-templated samples built from the de-duplicated inputs.
save_data("train_norep.json", new_data)

In [13]:
# Rebuild (text, target) pairs from the raw CHIP-CDEE annotation file.
# Context manager + explicit UTF-8: the handle is closed deterministically and
# decoding of the Chinese text does not depend on the platform's locale encoding.
with open("CHIP-CDEE_train.json", encoding="utf-8") as raw_file:
    raw_data = json.load(raw_file)

# One line per event; the bracketed placeholders are filled in below.
answer_temp = "\n主体词：[CORE]；发生状态：[TEND]；描述词：[CHARA]；解剖部位：[ANAT]"
aug_input_list, aug_answer_list = [], []

for meta_d in raw_data:
    meta_aug_target = "上述句子中的临床发现事件如下："

    for meta_event in meta_d["event"]:
        # Multi-valued attributes are joined with a full-width comma; an empty
        # list yields an empty string.
        meta_aug_target += (
            answer_temp
            .replace("[CORE]", meta_event["core_name"])
            .replace("[TEND]", meta_event["tendency"])
            .replace("[CHARA]", "，".join(meta_event["character"]))
            .replace("[ANAT]", "，".join(meta_event["anatomy_list"]))
        )

    aug_input_list.append(meta_d["text"])
    aug_answer_list.append(meta_aug_target)
    

In [14]:
# Overlap check: distinct inputs recovered from train.json, distinct raw texts,
# and distinct items in the two lists combined.
len(set(input_list)), len(set(aug_input_list)), len(set(input_list+aug_input_list))

(1361, 1587, 1587)

In [15]:
# Merge the template-derived pairs with the pairs rebuilt from the raw file.
# The template-derived pairs come first, so for a duplicated input their target
# is the one that is kept.
# NOTE: this rebinds input_list / answer_list, so earlier cells re-run after this
# one will see the merged lists.
input_list = new_input_list + aug_input_list
answer_list = new_answer_list + aug_answer_list
new_input_list, new_answer_list = [], []
seen_inputs = set()  # constant-time membership test instead of rescanning the list

for text, answer in zip(input_list, answer_list):
    if text not in seen_inputs:
        seen_inputs.add(text)
        new_input_list.append(text)
        new_answer_list.append(answer)

print("New data size: %d" % len(new_input_list))

New data size: 1587


In [16]:
# Build the augmented training set: num_per templated prompts per unique input.
new_data = []
num_per = 2  # prompt variants generated for each unique input
count = 0  # running index used for sample_id
random.seed(42)  # template choice is random; fix the seed so a re-run rebuilds the same file
for text, answer in zip(new_input_list, new_answer_list):
    # Pass num_per through so that changing it actually changes the output.
    for meta_input in get_input(text, n_re=num_per):
        # NOTE(review): "null" is written as a JSON string, not JSON null;
        # confirm this matches the schema of the original train.json.
        new_data.append({"input": meta_input, "target": answer, "answer_choices": "null",
                         "task_type": "event_extraction", "task_dataset": "CHIP-CDEE",
                         "sample_id": "train-" + str(count)})
        count += 1

In [17]:
# Persist the samples built from the merged (train.json + raw file) inputs.
save_data("train_aug.json", new_data)