## 任务5：文本多路召回与重排序
## YINKA & CONAN & STELLA

In [17]:
## 导入库
import json
import pdfplumber
from sentence_transformers import SentenceTransformer

## Load data
# Read the question list inside a context manager so the file handle is
# closed as soon as the JSON has been parsed.
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

# The PDF handle is intentionally left open: later code in this cell reads `pdf.pages`.
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")

## Define the text-splitting function
def split_text_fixed_size(text, chunk_size):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` characters.

    Args:
        text: The string to split. ``None`` or an empty string yields no chunks.
        chunk_size: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of substrings in original order. Every chunk has exactly
        ``chunk_size`` characters except possibly the last, which holds the
        remainder.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A zero step makes range() fail and a negative step would silently
        # return no chunks at all, so reject both explicitly.
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        # Covers both "" and None (a page with no extractable text).
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

## Split every page into fixed-size chunks
CHUNK_SIZE = 60  # characters per chunk passed to split_text_fixed_size

pdf_content = []
for page_number, page in enumerate(pdf.pages, start=1):

    # Pull all text out of the current page.
    # NOTE(review): some pdfplumber versions appear to return None for a page
    # without extractable text; fall back to "" so such pages yield no chunks
    # instead of crashing the splitter.
    text = page.extract_text() or ""

    # Each chunk of CHUNK_SIZE characters becomes one embedding unit; the
    # 1-based page label is stored with it so a chunk can be mapped back to its page.
    for chunk_text in split_text_fixed_size(text, CHUNK_SIZE):
        pdf_content.append({
            'page': 'page_' + str(page_number),
            'content': chunk_text
        })
# Preview expression; it is not the last statement of this cell, so nothing is displayed.
pdf_content[:10]
# Embedding models to evaluate, plus the short tag each one uses in its output file name.
model_names = ["TencentBAC/Conan-embedding-v1","Classical/Yinka","infgrad/stella-base-zh-v3-1792d"]
model_tags = ["conan", "yinka", "stella"]

# Number of best-scoring chunks kept per question.
# NOTE(review): the output file names say "top10" but only TOP_K = 3 pages are
# stored; the names are kept because the re-ranking cell reads these exact paths.
TOP_K = 3

# The sentence lists do not depend on the model, so build them once.
questions_sentences = [x['question'] for x in questions]

# Each entry of pdf_content is one fixed-size chunk together with its page
# label, so several chunks (and therefore several indices) map to the same page.
pdf_content_sentences = [x["content"] for x in pdf_content]

for model_name, tag in zip(model_names, model_tags):
    ## Load the model
    model = SentenceTransformer(model_name, device='cuda')

    # Encode questions and chunks; with normalized embeddings the dot product
    # below is the cosine similarity.
    question_embeddings = model.encode(questions_sentences, normalize_embeddings=True)
    pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

    ## Recall scoring
    for query_idx, feat in enumerate(question_embeddings):
        score = feat @ pdf_embeddings.T
        # argsort() is ascending, so the tail holds the best chunks in
        # worst-to-best order. Reverse it so that position 0 is the best hit:
        # the re-ranking cell uses the list position as the rank.
        max_score_page_idx = score.argsort()[-TOP_K:][::-1]
        # Map chunk indices back to page labels (the same page may appear more than once).
        result = [pdf_content[x]['page'] for x in max_score_page_idx]
        questions[query_idx]['reference'] = result

    ## Save this model's recall result (overwrites 'reference' from the previous model)
    with open(f'../data/Coggle比赛数据/汽车知识问答/submit_{tag}_top10.json', 'w', encoding='utf8') as up:
        json.dump(questions, up, ensure_ascii=False, indent=4)

Some weights of BertModel were not initialized from the model checkpoint at infgrad/stella-base-zh-v3-1792d and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Re-ranking: fuse the recall lists of the three models with reciprocal rank fusion
import json


def _load_submission(path):
    """Load one model's saved recall result and close the file afterwards."""
    with open(path, encoding='utf8') as submission_file:
        return json.load(submission_file)


yinka = _load_submission("../data/Coggle比赛数据/汽车知识问答/submit_yinka_top10.json")
conan = _load_submission("../data/Coggle比赛数据/汽车知识问答/submit_conan_top10.json")
stella = _load_submission("../data/Coggle比赛数据/汽车知识问答/submit_stella_top10.json")

fusion_result = []
k = 60  # rank-smoothing constant of the fusion formula
for q1, q2, q3 in zip(yinka, conan, stella):
    fusion_score = {}
    # Every model contributes 1 / (position + k) for each page it recalled.
    # All three lists are used (yinka, conan, stella) and every occurrence,
    # first or repeated, is weighted with the same formula, so a page
    # recalled by several models accumulates a higher score.
    for ranked_pages in (q1['reference'], q2['reference'], q3['reference']):
        for idx, page in enumerate(ranked_pages):
            fusion_score[page] = fusion_score.get(page, 0.0) + 1 / (idx + k)

    # Keep only the single best page; on ties max() returns the page that
    # was scored first, the same page a stable descending sort would put first.
    best_page = max(fusion_score, key=fusion_score.get)
    q1['reference'] = best_page
    fusion_result.append(q1)

with open('../data/Coggle比赛数据/汽车知识问答/submit_fusion_yinka_stella_conan.json', 'w', encoding='utf8') as up:
    json.dump(fusion_result, up, ensure_ascii=False, indent=4)
