## Tutorial: 基于CPM-Bee进行字段匹配


### 1. 数据格式处理 (Process dataset)
训练之前，我们需要定义并处理我们的数据输入格式，我们构造一个数据集的处理类，将数据处理为特定格式。

Before training, we need to define and process our data input format. We construct a processing class for the data set to process the data into a specific format.

在本教程中，我们使用的字段匹配任务的输入格式如下（也可以自行定义其他格式）：

In this tutorial, we use the following input format for the field-matching task (you can also define other formats):
```
数据表1: table_name_cn,table_name_en,table description
字段1: field_name_cn,field_name_en,field description
数据表2: table2
字段2: field2
"options": {
      "<option_0>": "表不匹配, 字段不匹配", 
      "<option_1>": "表匹配，字段不匹配",
      "<option_2>": "表匹配，字段匹配",
    }, 
question: "输入的数据表1与数据表2是否匹配，并且字段1与字段2是否匹配?"
<ans>: <option_0>
```

添加工作路径

Add working path

In [2]:
import random
import sys
import os
import json
# Fix the RNG seed so any use of the `random` module in this notebook is repeatable.
random.seed(123)
# Extend the import search path with the project source directories
# (presumably where the cpm_live package imported below lives — confirm for
# your checkout).
sys.path.append("../../src")
sys.path.append("/data/nlp/llm/CPM/")


In [None]:
!pwd
!pip install bminf
#!wget --content-disposition https://cloud.tsinghua.edu.cn/f/bccfdb243eca404f8bf3/?dl=1
#!tar -zxvf SST-2.tar.gz

In [None]:
from cpm_live.tokenizers import CPMBeeTokenizer
from cpm_live.training_tasks.bee import FinetuneDataset
from cpm_live.models import CPMBeeConfig, CPMBeeTorch
import torch
import torch.nn.functional as F
import bmtrain as bmt
from copy import deepcopy
import os

# Directory holding the CPM-Bee checkpoint and its config; adjust to the
# local setup.
model_path = '/data/nlp/models/OpenBMB/cpm-bee-10b/'
# Build both paths with os.path.join so they are well-formed whether or not
# model_path ends with a separator (plain concatenation produced
# ".../cpm-bee-10b//pytorch_model.bin").
config = CPMBeeConfig.from_json_file(os.path.join(model_path, "config.json"))
ckpt_path = os.path.join(model_path, "pytorch_model.bin")

In [None]:
# Create the tokenizer and build the model from the config loaded above.
# Checkpoint weights are loaded separately, in a later cell.
tokenizer = CPMBeeTokenizer()
model = CPMBeeTorch(config=config)

In [None]:
tokenizer.tokenize('猫头鹰owl')

In [None]:
# Load the checkpoint weights into the model. strict=True makes
# load_state_dict raise if the checkpoint keys do not exactly match the
# model's own state-dict keys.
model.load_state_dict(torch.load(ckpt_path), strict=True)
# Move the model to the first CUDA device.
device = torch.device("cuda:0")
model.to(device)

构建dataloader
build dataloader

构建runner

Build runner

In [None]:
from cpm_live.generation.bee import CPMBeeBeamSearch
data_list = [
    {"input1": "糖尿病该吃什么","input2": "糖尿病人的食谱是什么", "prompt": "input1和input2是否语义一致？","options": {
      "<option_0>": "不一致", 
      "<option_1>": "同义"      
    },  "<ans>": ""},
    {"input": "NGC 6231是一个位于天蝎座的疏散星团，天球座标为赤经16时54分，赤纬-41度48分，视觉观测大小约45角分，亮度约2.6视星等，距地球5900光年。NGC 6231年龄约为三百二十万年，是一个非常年轻的星团，星团内的最亮星是5等的天蝎座 ζ1星。用双筒望远镜或小型望远镜就能看到个别的行星。NGC 6231在1654年被意大利天文学家乔瓦尼·巴蒂斯特·霍迪尔纳（Giovanni Battista Hodierna）以Luminosae的名字首次纪录在星表中，但是未见记载于夏尔·梅西耶的天体列表和威廉·赫歇尔的深空天体目录。这个天体在1678年被爱德蒙·哈雷（I.7）、1745年被夏西亚科斯（Jean-Phillippe Loys de Cheseaux）（9）、1751年被尼可拉·路易·拉卡伊（II.13）分别再次独立发现。", "question": "NGC 6231被哪些人发现过？", "<ans>": ""}
]
# Decode with beam search. Each sample is passed to generate() on its own
# (as a one-element list) and every returned result is printed.
beam_search = CPMBeeBeamSearch(
    model=model,
    tokenizer=tokenizer,
)
for data in data_list:
    inference_results = beam_search.generate([data], max_length=100, repetition_penalty=1.1)
    for res in inference_results:
        print(res)


### 召回和匹配

In [1]:
import sys
sys.path.append("../../src")
sys.path.append("/data/nlp/llm/CPM/")
# Replace sys.argv with the options for the fine-tuned delta checkpoint.
# Presumably text_generation.parse_args() (called in a later cell) reads
# them — confirm against that module.
sys.argv=['ipykernel_launcher.py',
          '--delta','/data/nlp/llm/CPM/CPM-Bee/src/results/cpm_bee_finetune-delta-best.pt',
          '--memory-limit','30',
          '--device','cuda:0',
          "--use-bminf"
         ]
import os
# Expose GPUs 0 and 1 to this process. This is set before torch is imported
# in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'


In [2]:
import json
import re
import random
import pymongo
import csv
import re
import collections
import sys
import torch
import gc

from build_embedding_index import *
import text_generation





In [None]:
# Parse the options and build the beam-search wrapper, then keep direct
# handles to its model and tokenizer for the cells below.
args = text_generation.parse_args()
beam_search = text_generation.load_beam_search(args)
model = beam_search.model
tokenizer=beam_search.tokenizer
# Switch the model to evaluation mode.
model.eval()
# Multi-GPU wrapping with DataParallel is left disabled:
#if torch.cuda.device_count() > 1:
#	beam_search.model= torch.nn.DataParallel(model,device_ids=[0,1])


In [None]:
beam_search.model

In [4]:
# Load everything the matching evaluation needs: the CSV header, the search
# collection, the standard-model fields, the ODS fields and the human-labelled
# ODS -> standard links.
"""
生成匹配结果：

"""


# Column order shared by every result CSV written in this notebook.
header = 'ods_dataset_cn,ods_data_cn,std_dataset_cn,std_data_cn,score,pred_match,human_match'.split(',')

# Data directory. A Windows location, r'C:/TEAM/贵州医药监管平台/', was assigned
# here first and immediately overwritten; only the relative path is in effect.
PATH = r'data/'

# NOTE(review): hard-coded host and port — confirm they match the deployment.
db_uri = 'mongodb://172.16.29.84:2701/graph'
mongo_client = pymongo.MongoClient(db_uri)
db = mongo_client.get_default_database()

collection_name = 'meta_info'
collection = db[collection_name]

# Standard-model rows keyed by their 'full_name_cn', kept in file order.
std_dataset = collections.OrderedDict()
with open(PATH+'标化模型.jsonl','r',encoding='utf-8') as fd:
    for line in fd:
        if not line.strip():
            # Tolerate blank lines instead of failing in json.loads.
            continue
        row = json.loads(line)
        std_dataset[row['full_name_cn']] = row

std_keys = list(std_dataset.keys())

ods_dataset = read_ods_dataset(PATH+'贵州省医药监管平台ods模型.csv')

# Human-labelled matches, used as ground truth. Every key has the form
# '<standard side>:<ODS side>' and maps to 1:
#   matched_dict        full standard field name : ODS 'dataset.field'
#   matched_table_dict  standard table name      : ODS table name
#   matched_field_dict  standard field name      : ODS field name
matched_dict = {}
matched_table_dict = {}
matched_field_dict = {}
# Filled by the evaluation loop in a later cell.
result_dict = {}
with open(PATH+'贵州省医药监管平台模型链路.jsonl','r',encoding='utf-8') as fd:
    for line in fd:
        if not line.strip():
            continue
        row = json.loads(line)
        row['data_name_cn'] = remove_number_around(row['data_name_cn'])
        label = row['dataset_name_cn']+'.'+row['data_name_cn'] # ods
        # One ODS field may link to several standard fields, one per line.
        dwd_links = row['dwd_link'].split('\n')

        for dwd_link in dwd_links:
            dwd_link = remove_brackets(dwd_link).strip(';').strip()
            # Links that do not name a known standard field are ignored.
            if dwd_link in std_dataset:
                row_std = std_dataset[dwd_link]
                matched_dict[dwd_link+':'+label]=1
                matched_table_dict[row_std['dataset_name_cn']+':'+row['dataset_name_cn']]=1
                matched_field_dict[row_std['data_name_cn']+':'+row['data_name_cn']]=1



In [5]:
matched_table_dict

{'卫生事件入口活动信息:门(急)诊挂号表': 1,
 '门(急)诊病历记录:门(急)诊病历': 1,
 '门(急)诊处方明细表:西药处方': 1,
 '门(急)诊处方主表:西药处方': 1,
 '收费单据明细表:门(急)诊结算记录': 1,
 '收费单据主表:门(急)诊结算记录': 1,
 '诊断明细表:入院诊断': 1,
 '医嘱记录明细表:住院医嘱': 1,
 '医嘱记录主表:住院医嘱': 1,
 '收费单据明细表:住院费用明细': 1,
 '收费单据主表:住院费用明细': 1,
 '收费单据主表:住院结算': 1,
 '出院记录:出院小结': 1,
 '住院病案首页基本信息:住院病案首页': 1,
 '住院病案首页诊断信息:住院病案首页': 1,
 '住院病案首页手术信息:住院病案首页': 1,
 '住院病案首页费用信息:住院病案首页': 1,
 '辅助检查报告主表:检查记录': 1,
 '检验报告单主表:检验记录': 1,
 '检验结果明细表:检验记录': 1,
 '入院记录:入院记录': 1}

In [5]:
# Evaluation state for the recall-and-match loop in the next cell.
k=30  # passed as '$limit' in each retrieval query
# NOTE(review): tp starts at 1 rather than 0 — presumably so that
# tp/(tp+fp) and tp/(tp+fn) can never divide by zero; it also inflates the
# reported tp by one. Confirm this is intended.
tp=1
tn=0
fp=0
fn=0
c=0  # number of standard fields processed so far
# NOTE(review): no visible cell appends to this list.
dataset = []
# Rows collected for export, keyed by the ODS field id.
std_matched_dict = collections.defaultdict(list)

In [None]:
# Recall-and-match evaluation over every standard field.
#
# For each standard field the loop (1) retrieves candidate ODS fields with a
# Chinese-text and an English-text query, (2) asks the model to classify each
# (ODS, standard) pair in batches of `b`, and (3) compares the prediction with
# the human label to update tp/fp/fn/tn and to collect rows for export.

# Option treated as the positive class. The sample prompts in this notebook
# label it "表匹配,字段匹配" (table and field both match).
MATCH_OPTION = '<option_3>'


def _recall_candidates(index_field, query_text, std_row, std_label, candidates):
    """Add the ODS candidates retrieved for one standard field to ``candidates``.

    Queries ``collection`` on ``index_field`` with ``query_text`` (limit ``k``).
    For every hit whose search score is below 0.8, the model question is stored
    in ``candidates`` under the ODS id and the human label is stored in
    ``result_dict`` under ``std_label + ':' + ods_id``. An id already present
    from an earlier query keeps its position and gets the new question.
    """
    selector = {index_field: {'$text': query_text, '$limit': k}}
    for doc in collection.find(selector):
        ods_id = doc['_id']
        ods_row = ods_dataset[ods_id]
        if doc['_meta']['searchScore'] < 0.8:
            question = build_question(ods_row, std_row)
            result_dict[std_label+':'+ods_id] = build_ans(
                std_row, ods_row, matched_dict, matched_field_dict, matched_table_dict)
            question["<ans>"] = ''
            candidates[ods_id] = question


def _match_record(std_row, ods_id, score, pred_label, true_label):
    """Build one export row for an (ODS, standard) pair."""
    parts = ods_id.split('.')
    return dict(std_dataset_cn=std_row['dataset_name_cn'], std_data_cn=std_row['data_name_cn'],
                ods_dataset_cn=parts[0], ods_data_cn=parts[1], score=score,
                pred_match=pred_label, human_match=true_label)


b = 4  # number of questions sent to beam_search.generate per call
for row2 in std_dataset.values():
    if c>600:
        # Smaller batches once more than 600 fields have been processed
        # (presumably to limit memory use — confirm).
        b=3
    row2 = build_text(row2)
    label = row2['dataset_name_cn']+'.'+row2['data_name_cn'] # std

    # Candidates from both queries, de-duplicated by ODS id.
    query_results = collections.OrderedDict()
    _recall_candidates('text_embedding', row2['text'], row2, label, query_results)
    _recall_candidates('text_en_embedding', row2['text_en'], row2, label, query_results)

    labels = list(query_results.keys())
    datas = list(query_results.values())
    for index in range(0,len(datas),b):
        inference_results = beam_search.generate(datas[index:index+b], max_length=280, repetition_penalty=1)
        # Pair every result with the ODS id of the question at the same
        # position in this batch.
        for ods_id, res in zip(labels[index:index+b], inference_results):
            pred_label = res['<ans>']
            true_label = result_dict[label+':'+ods_id]
            score = res['<score>']
            if pred_label==MATCH_OPTION:
                if true_label == pred_label:
                    tp+=1
                else:
                    fp+=1
                std_matched_dict[ods_id].append(_match_record(row2, ods_id, score, pred_label, true_label))
            elif true_label==MATCH_OPTION:
                # A labelled match the model missed. It is exported as well so
                # that it can be inspected.
                fn+=1
                std_matched_dict[ods_id].append(_match_record(row2, ods_id, score, pred_label, true_label))
            else:
                tn+=1

    c+=1
    if c%100==0:
        # Progress report every 100 standard fields, then free memory.
        print('#tp='+str(tp))
        print('#tn='+str(tn))
        print('#fp='+str(fp))
        print('#fn='+str(fn))
        print('#total='+str(c))
        print(f'#Precision={tp/(tp+fp)}')
        print(f'#Recall={tp/(tp+fn)}')
        gc.collect()
        torch.cuda.empty_cache()



In [None]:

# Print the final confusion counts, precision and recall, and the number of
# standard fields processed.
# NOTE(review): `lends` is not used anywhere in the visible notebook.
lends = len(dataset)
print('tp='+str(tp))
print('tn='+str(tn))
print('fp='+str(fp))
print('fn='+str(fn))
print(f'Precision={tp/(tp+fp)}')
print(f'Recall={tp/(tp+fn)}')
print('total='+str(c))

In [None]:
# Append every entry of `dataset` to predict_error.jsonl, one JSON object per
# line; ensure_ascii=False writes non-ASCII text as-is instead of \u escapes.
# NOTE(review): no visible cell adds entries to `dataset`, so this may write
# nothing — confirm.
with open(PATH+'predict_error.jsonl','a',encoding='utf-8') as fout:
    for data in dataset:
        json.dump(data,fout,ensure_ascii=False)
        fout.write("\n")

In [None]:
# Mask-filling demo. The first sample contains one <mask_1> placeholder and a
# plain-string answer slot; the second contains two placeholders and an
# answer dict with one slot per mask.
data_list = [       
        {"document":"老鼠凶狠地指着猫说：我现在和蝙蝠结婚了！将来我们的孩子生活在空中！再也不怕你了！猫哈哈大笑，指了指树上的猫头鹰说：看见没，这是俺<mask_1>！",
         "<ans>":""},
        {"document":"老鼠凶狠地指着猫说：我现在和蝙蝠结婚了！将来我们的孩子生活在空中！再也不怕你了！猫哈哈大笑，指了指树上的<mask_0>说：看见没，这是俺<mask_1>！",
         "<ans>":{"<mask_0>": "","<mask_1>": ""}}
    ]
beam_search.generate(data_list, max_length=256, repetition_penalty=1.2)

In [None]:
# Mixed batch: an address-parsing prompt followed by the two-mask fill-in
# sample, generated in a single call.
data_list = [       
        {"address":"南岸窍角沱正街33号","prompt":"将地址里面的省市县区镇村识别出来",
         "<ans>":""},
        {"document":"老鼠凶狠地指着猫说：我现在和蝙蝠结婚了！将来我们的孩子生活在空中！再也不怕你了！猫哈哈大笑，指了指树上的<mask_0>说：看见没，这是俺<mask_1>！",
         "<ans>":{"<mask_0>": "","<mask_1>": ""}}
    ]
beam_search.generate(data_list, max_length=256, repetition_penalty=1.2)

In [None]:
os.environ

In [None]:
torch.cuda.get_device_name(0)

In [None]:
# Run Python garbage collection, then release the CUDA memory that PyTorch
# is caching but no longer using.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Reset the evaluation counters before re-running the matching loop.
# Unlike the first initialisation cell, std_matched_dict is not reset here.
k=30
# NOTE(review): tp starts at 1, presumably to keep the precision/recall
# divisions from hitting zero — confirm this is intended.
tp=1
tn=0
fp=0
fn=0
c=0
dataset = []

In [10]:
len(std_matched_dict)

157

In [8]:


# Print the final metrics, then export the collected rows to two CSV files.
print('tp='+str(tp))
print('tn='+str(tn))
print('fp='+str(fp))
print('fn='+str(fn))
print(f'Precision={tp/(tp+fp)}')
print(f'Recall={tp/(tp+fn)}')
print('total='+str(c))



# File 1: every collected row, grouped in the order the groups were created.
with open(PATH+'std_predict_result.csv','w',encoding='utf-8',newline='') as fout:
    writer = csv.DictWriter(fout,header)
    writer.writeheader()
    for items in std_matched_dict.values():
        writer.writerows(items) 


# File 2: one section per ODS field, in ods_dataset order. Fields without any
# collected row get a placeholder row with empty standard-side columns; the
# 'score' key is omitted there and DictWriter fills it with its default
# restval (an empty string).
with open(PATH+'ods_predict_result.csv', 'w', encoding='utf-8',newline='') as f:
    writer = csv.DictWriter(f,header)
    writer.writeheader()
    for row in ods_dataset.values():
        label = row['dataset_name_cn']+'.'+row['data_name_cn']
        if label in std_matched_dict:
            writer.writerows(std_matched_dict[label])
        else:
            writer.writerow(dict(ods_dataset_cn=row['dataset_name_cn'],ods_data_cn=row['data_name_cn'],
                                 std_dataset_cn='',std_data_cn='',pred_match='',human_match=''))
            

tp=780
tn=169437
fp=1463
fn=64
Precision=0.34774855104770397
Recall=0.9241706161137441
total=2988


In [None]:
list(std_matched_dict.values())

In [None]:
result_dict

In [None]:
# Print the ground-truth match keys that start with '卫生事件'.
f=filter(lambda x: x.startswith('卫生事件'),list(matched_dict.keys()))
for i in f:
    print(i)

In [12]:
tokenizer.encode('SCORE')

([80098, 50085], {})

In [4]:
doc1={"表1": "入院诊断,hospitalized diagnosis", "字段1": "入院记录id,admission id,关联入院记录", "表2": "入院记录,admission record", "字段2": "身份证件类别标准值,id type std,个体身份证件所属类别（如居民身份证、居民户口簿、护照等）在标准编码体系中的名称", "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"}, "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": ""}
doc2={"表1": "住院医嘱,hospitalized order", "字段1": "药物类型编码,type code,药物类型代码cv5301.06", "表2": "医嘱记录主表,order record master", "字段2": "医嘱类别标准码,order type code std,临床医嘱类别（如临时、长期、出院带药等）在标准编码体系中的代码", "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"}, "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": ""}
doc3={"表1": "住院医嘱,hospitalized order", "字段1": "药物类型编码,type code,药物类型代码cv5301.06", "表2": "医嘱记录主表,order record master", "字段2": "医疗机构原始编号,org id,医疗机构按照原始编码体系填写的唯一标识", "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"}, "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": ""}

In [5]:
beam_search.generate([doc1,doc2,doc3], max_length=256, repetition_penalty=1)

[{'表1': '入院诊断,hospitalized diagnosis',
  '字段1': '入院记录id,admission id,关联入院记录',
  '表2': '入院记录,admission record',
  '字段2': '身份证件类别标准值,id type std,个体身份证件所属类别（如居民身份证、居民户口簿、护照等）在标准编码体系中的名称',
  'options': {'<option_0>': '表不匹配,字段不匹配',
   '<option_1>': '表匹配,字段不匹配',
   '<option_2>': '表不匹配,字段匹配',
   '<option_3>': '表匹配,字段匹配'},
  'question': '需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）',
  '<ans>': '<option_0>',
  '<score>': -0.0034593939781188965},
 {'表1': '住院医嘱,hospitalized order',
  '字段1': '药物类型编码,type code,药物类型代码cv5301.06',
  '表2': '医嘱记录主表,order record master',
  '字段2': '医嘱类别标准码,order type code std,临床医嘱类别（如临时、长期、出院带药等）在标准编码体系中的代码',
  'options': {'<option_0>': '表不匹配,字段不匹配',
   '<option_1>': '表匹配,字段不匹配',
   '<option_2>': '表不匹配,字段匹配',
   '<option_3>': '表匹配,字段匹配'},
  'question': '需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）',
  '<ans>': '<option_1>',
  '<score>': -0.0008088350296020508},
 {'表1': '住院医嘱,hospitalized order',
  '字段1': '药物类型编码,type code,药物类型代码cv5301.06',
  '表2

In [37]:
doc1={"表1": "门(急)诊挂号表,registration record", "字段1": "挂号类别代码,reg type code,【规则】ct05.10.004?挂号类别", "表2": "卫生事件入口活动信息,hevent entrance", "字段2": "常住户籍类型原始码,resident type code，个体的常住地址是否为户籍所在地类别在原始编码体系中的代码", "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"}, "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": ""}
doc2={"表1": "门(急)诊挂号表,registration record", "字段1": "挂号类别代码,reg type code,【规则】ct05.10.004?挂号类别", "表2": "卫生事件入口活动信息,hevent entrance", "字段2": "挂号途径原始码,registration path code,患者挂号途径（如现场、预约、特诊等）在原始编码体系中的代码", "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"}, "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": ""}
doc3={"表1": "住院医嘱,hospitalized order", "字段1": "药物类型编码,type code,药物类型代码cv5301.06", "表2": "医嘱记录主表,order record master", "字段2": "医疗机构原始编号,org id,医疗机构按照原始编码体系填写的唯一标识", "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"}, "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": ""}

In [None]:
beam_search.generate([doc1,doc2,doc3], max_length=280, repetition_penalty=1)

In [51]:
model_inputs, others= beam_search._process_list([doc1,doc2,doc3])

In [7]:

c


684

In [18]:
evaluation2(beam_search,[doc1,doc2,doc3])

[tensor(nan, device='cuda:0', dtype=torch.float16),
 tensor(nan, device='cuda:0', dtype=torch.float16),
 tensor(nan, device='cuda:0', dtype=torch.float16)]

In [None]:
tokenizer=beam_search.tokenizer
tokenizer.decode(pred_ids[2].tolist())

In [None]:

tokenizer.decode(model_inputs["input"][2].tolist())

## 第二次检查


In [10]:
import collections
import hashlib
import re
import csv
# Reload the first-pass predictions written to ods_predict_result.csv.
test_dataset = []
input_file = PATH+'ods_predict_result.csv'
# newline='' leaves line-ending handling to the csv module, as the csv
# documentation recommends for files given to a reader (the cells that write
# this file already open it that way).
with open(input_file, 'r',encoding='utf-8',newline='') as f:
    # Each row becomes a dict keyed by the column names in the header line.
    reader = csv.DictReader(f)
    test_dataset.extend(reader)


In [None]:
test_dataset

In [None]:
import math

# Second pass: re-examine the first-pass '<option_3>' predictions with a score
# threshold and recompute the confusion counts.
#
# A prediction is kept when its score is greater than -2 and rejected
# otherwise. Kept predictions are grouped per ODS field in `results`.
#
# NOTE: each processed row's 'score' is overwritten with math.exp(score), so
# reload test_dataset before running this cell a second time.

# tp starts at 1, presumably so the precision/recall divisions below never
# hit zero — confirm this is intended.
tp=1
tn=0
fp=0
fn=0
c=0

results = collections.OrderedDict()
for test_row in test_dataset:
    # Only rows that carry a standard field and a positive first-pass
    # prediction are re-checked.
    if not (test_row['std_dataset_cn'] and test_row['pred_match']=='<option_3>'):
        continue

    # std
    label_id = test_row['std_dataset_cn']+'.'+test_row['std_data_cn']
    row2 = std_dataset[label_id]

    # ods
    ods_id = test_row['ods_dataset_cn']+'.'+test_row['ods_data_cn']
    ods_row = ods_dataset[ods_id]

    # The question is rebuilt only so that the model can be re-queried; the
    # stored first-pass score is used instead of a fresh
    # beam_search.generate(query, max_length=280, repetition_penalty=1) call.
    data = build_question(ods_row,row2)
    true_label = build_ans(row2,ods_row,matched_dict,matched_field_dict,matched_table_dict)
    pred_label = test_row['pred_match']
    data["<ans>"] = pred_label
    query = [data]

    score = float(test_row['score'])
    if '常住户籍类型原始码'==test_row['std_data_cn']:
        # Debug output for one specific standard field.
        print('常住户籍类型原始码:')
        print(test_row)
    test_row['score'] = math.exp(score)

    if score>-2:
        # Prediction kept.
        if true_label == pred_label:
            tp+=1
        else:
            fp+=1
        results.setdefault(ods_id, []).append(test_row)
    elif true_label == pred_label:
        # Prediction rejected although the human label agrees with it: a
        # genuine match was lost. (The original counted this case as tn and
        # the opposite case as fn.)
        fn+=1
        print(test_row)
    else:
        # Prediction rejected and the human label disagrees with it.
        tn+=1

    c+=1
    if c%100==0:
        print('#tp='+str(tp))
        print('#tn='+str(tn))
        print('#fp='+str(fp))
        print('#fn='+str(fn))
        print('#total='+str(c))
        print(f'#Precision={tp/(tp+fp)}')
        print(f'#Recall={tp/(tp+fn)}')

In [63]:
(1-0.001,math.exp(-0.001))

(0.999, 0.999000499833375)

In [16]:

# Export, for every ODS field, at most three kept matches with the highest
# score, considering only rows whose score exceeds 0.98.
with open(PATH+'ods_predict_matched_result_top_3.csv', 'w', encoding='utf-8',newline='') as fout:
    writer = csv.DictWriter(fout,header)
    writer.writeheader()
    for items in results.values():
        # Sort in place, best score first; the list stored in `results`
        # stays sorted afterwards.
        items.sort(key=lambda x: x['score'],reverse=True)
        items = [entry for entry in items if entry['score']>0.98][:3]
        writer.writerows(items)
        


In [9]:
doc1={"表1": "门(急)诊挂号表,registration record", "字段1": "挂号类别代码,reg type code,【规则】ct05.10.004?挂号类别",
      "表2": "卫生事件入口活动信息,hevent entrance", "字段2": "常住户籍类型原始码,resident type code，个体的常住地址是否为户籍所在地类别在原始编码体系中的代码",
      "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"},
      "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": "<option_0>"}
doc2={"表1": "门(急)诊挂号表,registration record", "字段1": "挂号类别代码,reg type code,【规则】ct05.10.004?挂号类别",
      "表2": "卫生事件入口活动信息,hevent entrance", "字段2": "挂号途径原始码,registration path code,患者挂号途径（如现场、预约、特诊等）在原始编码体系中的代码",
      "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"},
      "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": "<option_2>"}
doc3={"表1": "住院医嘱,hospitalized order", "字段1": "药物类型编码,type code,药物类型代码cv5301.06",
      "表2": "医嘱记录主表,order record master", "字段2": "医疗机构原始编号,org id,医疗机构按照原始编码体系填写的唯一标识",
      "options": {"<option_0>": "表不匹配,字段不匹配", "<option_1>": "表匹配,字段不匹配", "<option_2>": "表不匹配,字段匹配", "<option_3>": "表匹配,字段匹配"},
      "question": "需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）", "<ans>": "<option_0>"}
doc4 = {"input": "狂风暴雨，今天天气是真的","prompt":"续写一段话", "<ans>": ""}

# Generate answers for the four samples, then pass the same samples to
# evaluation_props (defined outside this view) and print both outputs.
query=[doc1,doc2,doc3,doc4]
inference_results = beam_search.generate(query, max_length=280, repetition_penalty=1)
print(inference_results)
inference_props = evaluation_props(beam_search,query)
print(inference_props)

[{'表1': '门(急)诊挂号表,registration record', '字段1': '挂号类别代码,reg type code,【规则】ct05.10.004?挂号类别', '表2': '卫生事件入口活动信息,hevent entrance', '字段2': '常住户籍类型原始码,resident type code，个体的常住地址是否为户籍所在地类别在原始编码体系中的代码', 'options': {'<option_0>': '表不匹配,字段不匹配', '<option_1>': '表匹配,字段不匹配', '<option_2>': '表不匹配,字段匹配', '<option_3>': '表匹配,字段匹配'}, 'question': '需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）', '<ans>': '<option_3>'}, {'表1': '门(急)诊挂号表,registration record', '字段1': '挂号类别代码,reg type code,【规则】ct05.10.004?挂号类别', '表2': '卫生事件入口活动信息,hevent entrance', '字段2': '挂号途径原始码,registration path code,患者挂号途径（如现场、预约、特诊等）在原始编码体系中的代码', 'options': {'<option_0>': '表不匹配,字段不匹配', '<option_1>': '表匹配,字段不匹配', '<option_2>': '表不匹配,字段匹配', '<option_3>': '表匹配,字段匹配'}, 'question': '需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）', '<ans>': '<option_3>'}, {'表1': '住院医嘱,hospitalized order', '字段1': '药物类型编码,type code,药物类型代码cv5301.06', '表2': '医嘱记录主表,order record master', '字段2': '医疗机构原始编号,org id,医疗机构按照原始编码体系填写的唯一标识', 'options': 

In [None]:
{'表1': '门(急)诊挂号表,registration record',
  '字段1': '挂号类别代码,reg type code,【规则】ct05.10.004?挂号类别',
  '表2': '卫生事件入口活动信息,hevent entrance',
  '字段2': '常住户籍类型原始码，resident_type_code，个体的常住地址是否为户籍所在地类别在原始编码体系中的代码',
  'options': {'<option_0>': '表不匹配,字段不匹配',
   '<option_1>': '表匹配,字段不匹配',
   '<option_2>': '表不匹配,字段匹配',
   '<option_3>': '表匹配,字段匹配'},
  'question': '需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）',
}
 {'表1': '门(急)诊挂号表,registration record', 
  '字段1': '挂号类别代码,reg type code,【规则】ct05.10.004?挂号类别', 
  '表2': '卫生事件入口活动信息,hevent entrance', 
  '字段2': '常住户籍类型原始码,resident type code,个体的常住地址是否为户籍所在地类别在原始编码体系中的代码', 
  'options': {'<option_0>': '表不匹配,字段不匹配', '<option_1>': '表匹配,字段不匹配', '<option_2>': '表不匹配,字段匹配', '<option_3>': '表匹配,字段匹配'}, 
  'question': '需要把表2的字段2映射到表1的字段1，判断表1和表2,字段1和字段2是否匹配？（语义相似或者有包含关系则认为匹配）', '<ans>': '<option_3>'}