## 任务4：文本多路召回与重排序
## BM25+BGE

In [1]:
# Import libraries
import jieba,json,pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the competition data: the question list and the knowledge-base PDF.

# Read the questions inside a context manager so the file handle is closed as soon as
# parsing finishes instead of being leaked until garbage collection.
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

# Open the knowledge PDF; it stays open because the page text is extracted just below.
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")

# One record per PDF page: a 1-based "page_N" label plus the full text of that page.
# List position i always holds page i + 1, so a position maps directly to a label.
pdf_content = [
    {
        "page": "page_" + str(page_number),
        # extract_text() may return None for a page without text in some pdfplumber
        # versions; normalise to "" so the tokenizer and encoder always receive a str.
        "content": page.extract_text() or "",
    }
    for page_number, page in enumerate(pdf.pages, start=1)
]

# Spot-check one record: the footer number at the end of its text should match its label.
pdf_content[-99]

{'page': 'page_256',
 'content': '空调\n空调除霜/除雾\n前除霜/除雾\n您可以通过空调系统面板启用前除霜/除雾功能，可迅速除去挡风玻\n璃上的结冰或雾气。\n01点击开启/关闭前除霜/除雾功能。\n警告！\n■ 为安全起见，驾驶前请确保前挡风玻璃无冰渣、积雪或冷凝水。\n说明！\n操作这个按钮可迅速除去挡风玻璃和车窗上的结冰或雾气。\n□ 前排智能空调触控屏中有些按键属于联动按键（例如：如果启用\n按下该按钮，启动最大除霜功能。该功能启用后，按钮上的指示灯点 了前除霜/除雾功能，则A/C会自动激活且自动关闭空调内循\n亮。 环）。\n□ 启用前除霜/除雾功能时，空调自动模式（AUTO）会开启最大风\n您也可以通过中央显示屏空调控制面板启用前除霜/除雾功能。 量。\n在中央显示屏上唤起空调控制界面。\n256'}

In [3]:
# Build the BM25 index over the page texts.

# Tokenise every page with jieba; the result is one token list per page.
pdf_content_words = []
for page_record in pdf_content:
    pdf_content_words.append(jieba.lcut(page_record["content"]))

# BM25Okapi is fed the tokenised corpus (a list of token lists), not raw strings.
bm25 = BM25Okapi(pdf_content_words)

# Small demo: jieba.lcut returns the segmented words as a plain Python list.
test = "你好我是你的爸爸我热爱打你"
result = jieba.lcut(test)
# result : ['你好', '我', '是', '你', '的', '爸爸', '我', '热爱', '打', '你']

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\yx140\AppData\Local\Temp\jieba.cache
Loading model cost 0.325 seconds.
Prefix dict has been built successfully.


In [4]:
# Encode the questions and the page texts with the BGE embedding model.

# Run on the GPU when one is available and fall back to the CPU otherwise, so this cell
# does not fail on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("BAAI/bge-small-zh-v1.5", device=device)

# Flat lists: one string per question and one string per PDF page.
questions_sentences = [x['question'] for x in questions]
pdf_content_sentences = [x["content"] for x in pdf_content]

# Embeddings are requested normalised; the recall step scores question/page pairs with
# a plain dot product of these vectors.
question_embeddings = model.encode(questions_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

# Preview the first few page texts.
pdf_content_sentences[:5]

  attn_output = torch.nn.functional.scaled_dot_product_attention(


['欢迎\n感谢您选择了具有优良安全性、舒适性、动力性和经济性的Lynk&Co领克汽车。\n首次使用前请仔细、完整地阅读本手册内容，将有助于您更好地了解和使用车辆。\n本手册中的所有资料均为出版时的最新资料，但本公司将对产品进行不断的改进和优化，您所购的车辆可能与本手册中的描述有所不同，请以实际\n接收的车辆为准。\n如您有任何问题，或需要预约服务，请拨打电话4006-010101联系我们。您也可以开车前往Lynk&Co领克中心。\n在抵达之前，请您注意驾车安全。\n©领克汽车销售有限公司',
 '',
 '目录\n前言 设置尾门开启角度.....................................................34\n上车和下车\n本手册相关的重要信息.................................................11\n敬告用户.................................................................11\n联系Lynk&Co领克.....................................................12 车辆锁止/解锁状态....................................................39\n事件数据记录系统......................................................12 使用遥控钥匙解锁和闭锁.............................................40\n远程监控系统............................................................12 使用Lynk&CoApp解锁和闭锁.......................................42\n原厂精装附件、选装装备和改装......................................13 无钥匙进入系统........................................................42\n无线电设备...............

In [5]:
# Two-way recall: fuse the dense (BGE) and sparse (BM25) rankings for every question.
from scipy.stats import rankdata  # turns raw scores into ranks (larger score -> larger rank)

for query_idx in range(len(question_embeddings)):
    record = questions[query_idx]

    # Dense signal: dot product between the question vector and every page vector.
    dense_scores = question_embeddings[query_idx] @ pdf_embeddings.T
    # Sparse signal: BM25 score of every page for the jieba-tokenised question.
    sparse_scores = bm25.get_scores(jieba.lcut(record['question']))

    # Summing the per-retriever ranks puts both signals on a common scale.
    fused_ranks = rankdata(dense_scores) + rankdata(sparse_scores)
    best_idx = fused_ranks.argsort()[-1]
    # Page labels are 1-based while array positions are 0-based, hence the + 1.
    record["reference"] = 'page_' + str(best_idx + 1)
    


In [6]:
# Write the two-way recall predictions to the submission file.
with open(
    '../data/Coggle比赛数据/汽车知识问答/submit_bm25_bgesmall.json',
    mode='w',
    encoding='utf8',
) as submit_file:
    # ensure_ascii=False keeps the Chinese text as-is instead of \u escapes.
    json.dump(questions, submit_file, indent=4, ensure_ascii=False)

## BGE/Text Segment(语义模型配合文本分词)

In [7]:
# 导入库
import jieba, json, pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from sentence_transformers import SentenceTransformer

# Load the question list; the context manager closes the file handle right after parsing
# instead of leaving it open until garbage collection.
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

# Open the knowledge-base PDF; it is left open because the chunking step reads `pdf.pages`.
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")


In [8]:
# 切分文本
def split_text_fixed_size(text, chunk_size):
    """Split ``text`` into consecutive, non-overlapping pieces of ``chunk_size`` characters.

    Args:
        text: String to split. ``None`` is treated like an empty string.
        chunk_size: Maximum number of characters per piece; must be positive.

    Returns:
        The pieces in their original order; the last one may be shorter than
        ``chunk_size``. An empty list when ``text`` is empty or ``None``.

    Raises:
        ValueError: If ``chunk_size`` is not positive (a negative size would otherwise
            silently produce no pieces at all).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Rebuild pdf_content at chunk granularity: every page's text is cut into pieces of
# 40 characters (characters, not sentences), and each piece keeps the 1-based label of
# the page it came from.
pdf_content = []
for page_number, page in enumerate(pdf.pages, start=1):
    # extract_text() may return None for a page without text in some pdfplumber
    # versions; fall back to "" so such a page simply contributes no chunks.
    text = page.extract_text() or ""

    for chunk_text in split_text_fixed_size(text, 40):
        pdf_content.append({
            'page': 'page_' + str(page_number),
            'content': chunk_text
        })


In [9]:
# Print the page label of every chunk to inspect how chunks map onto pages.
for chunk_record in pdf_content:
    print(chunk_record["page"])

page_1
page_1
page_1
page_1
page_1
page_1
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_3
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4
page_4

In [10]:
# Encode the questions and the 40-character chunks with the BGE embedding model.

# Use the GPU when present and fall back to the CPU otherwise, so this cell also runs
# on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("BAAI/bge-small-zh-v1.5", device=device)

# One string per question.
questions_sentences = [x['question'] for x in questions]

# One string per chunk. pdf_content holds 40-character chunks here rather than whole
# pages, so several entries can carry the same page label.
pdf_content_sentences = [x["content"] for x in pdf_content]

question_embeddings = model.encode(questions_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

# Sanity check: exactly one embedding per chunk. (Only the last expression of a cell is
# displayed, so the invariant is asserted rather than shown as a second bare len().)
assert len(pdf_content_sentences) == len(pdf_embeddings)
len(pdf_embeddings)
# e.g. 3743 chunks -> 3743 vectors

3743

In [11]:
# Dense-only recall: pick the best-matching chunk and report the page it belongs to.
for query_idx in range(len(question_embeddings)):
    chunk_scores = question_embeddings[query_idx] @ pdf_embeddings.T
    best_chunk_idx = chunk_scores.argsort()[-1]
    # Positions index chunks, not pages (several chunks share one page), so the page
    # label is read from the chunk record instead of being derived from the position.
    questions[query_idx]['reference'] = pdf_content[best_chunk_idx]['page']

# Write the predictions to the submission file.
with open(
    '../data/Coggle比赛数据/汽车知识问答/submit_bgesmall_Text_Segment.json',
    mode='w',
    encoding='utf8',
) as submit_file:
    json.dump(questions, submit_file, indent=4, ensure_ascii=False)

## BM25 + BGE/Text Segment

In [12]:
import jieba, json, pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from sentence_transformers import SentenceTransformer

# Load the question list; the context manager closes the file handle right after parsing
# instead of leaving it open until garbage collection.
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

# Open the knowledge-base PDF (kept open: its pages are read when the chunks are built)
# and start with an empty chunk list.
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")
pdf_content = []

def split_text_fixed_size(text, chunk_size):
    """Split ``text`` into consecutive, non-overlapping pieces of ``chunk_size`` characters.

    Args:
        text: String to split. ``None`` is treated like an empty string.
        chunk_size: Maximum number of characters per piece; must be positive.

    Returns:
        The pieces in their original order; the last one may be shorter than
        ``chunk_size``. An empty list when ``text`` is empty or ``None``.

    Raises:
        ValueError: If ``chunk_size`` is not positive (a negative size would otherwise
            silently produce no pieces at all).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Cut every page into 40-character chunks, each tagged with its 1-based page label.
for page_number, page in enumerate(pdf.pages, start=1):
    # extract_text() may return None for a page without text in some pdfplumber
    # versions; fall back to "" so such a page simply contributes no chunks.
    text = page.extract_text() or ""
    for chunk_text in split_text_fixed_size(text, 40):
        pdf_content.append({
            'page': 'page_' + str(page_number),
            'content': chunk_text
        })

# Sparse retriever: BM25 over the jieba-tokenised chunks.
pdf_content_words = [jieba.lcut(x['content']) for x in pdf_content]
bm25 = BM25Okapi(pdf_content_words)

# Dense retriever: BGE embeddings. Use the GPU when present and fall back to the CPU
# otherwise, so this cell also runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer('BAAI/bge-small-zh-v1.5', device=device)
question_sentences = [x['question'] for x in questions]
pdf_content_sentences = [x['content'] for x in pdf_content]

question_embeddings = model.encode(question_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

from scipy.stats import rankdata
for query_idx, feat in enumerate(question_embeddings):
    # Dense score of every chunk (dot product) and BM25 score of every chunk.
    score1 = feat @ pdf_embeddings.T
    score2 = bm25.get_scores(jieba.lcut(questions[query_idx]["question"]))

    # Fuse by summing the per-retriever ranks, then keep the single best chunk.
    score = rankdata(score1) + rankdata(score2)
    max_score_page_idx = score.argsort()[-1]
    # Positions index chunks, so the page label comes from the chunk record.
    questions[query_idx]['reference'] = pdf_content[max_score_page_idx]['page']

# Write the predictions to the submission file.
with open('../data/Coggle比赛数据/汽车知识问答/submit_bm25_bgesmall_Text_Segment.json', 'w', encoding='utf8') as up:
    json.dump(questions, up, ensure_ascii=False, indent=4)

## BM25 with ReRank

In [13]:
import jieba, json, pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the question list; the context manager closes the file handle right after parsing
# instead of leaving it open until garbage collection.
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")

# One record per PDF page; list position i holds page i + 1.
pdf_content = [
    {
        'page': 'page_' + str(page_number),
        # extract_text() may return None for a page without text in some pdfplumber
        # versions; normalise to "" so the tokenizers always receive a str.
        'content': page.extract_text() or ""
    }
    for page_number, page in enumerate(pdf.pages, start=1)
]

# Load the cross-encoder reranker and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base')
rerank_model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base')
# Use the GPU when present and fall back to the CPU otherwise, so this cell also runs
# on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
rerank_model.to(device)
rerank_model.eval()  # inference mode

# First-stage retriever: BM25 over the jieba-tokenised pages.
pdf_content_words = [jieba.lcut(x['content']) for x in pdf_content]
bm25 = BM25Okapi(pdf_content_words)

for query_idx in range(len(questions)):
    question_text = questions[query_idx]["question"]

    # Stage 1: positions of the three pages with the highest BM25 score.
    doc_scores = bm25.get_scores(jieba.lcut(question_text))
    max_score_page_idxs = doc_scores.argsort()[-3:]

    # Stage 2 input: one [question, page text] pair per candidate, i.e.
    # [["question", "content1"], ["question", "content2"], ["question", "content3"]]
    pairs = [[question_text, pdf_content[idx]['content']] for idx in max_score_page_idxs]

    # Pairs longer than 512 tokens are truncated by the tokenizer.
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():  # no gradients needed for inference
        # Move every tensor returned by the tokenizer to the model's device.
        inputs = {key: inputs[key].to(device) for key in inputs.keys()}

        # One relevance logit per pair, flattened to a 1-D float tensor.
        scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()

    # Keep the candidate the reranker scores highest; page labels are 1-based.
    max_score_page_idx = max_score_page_idxs[scores.cpu().numpy().argmax()]
    questions[query_idx]['reference'] = 'page_' + str(max_score_page_idx + 1)

# NOTE: the file name spelling ("rerangk") is kept unchanged so that anything reading
# this path keeps working.
with open('../data/Coggle比赛数据/汽车知识问答/submit_bm25_rerangk.json', 'w', encoding='utf8') as up:
    json.dump(questions, up, ensure_ascii=False, indent=4)

## 任务5：文本多路召回与重排序
## YINKA & CONAN

In [14]:
## 导入库
import json
import pdfplumber
from sentence_transformers import SentenceTransformer

## 加载数据
# Load the question list; the context manager closes the file handle right after parsing
# instead of leaving it open until garbage collection.
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

# Open the knowledge-base PDF; it is left open because the chunking step reads `pdf.pages`.
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")

## Define the text-splitting function
def split_text_fixed_size(text, chunk_size):
    """Split ``text`` into consecutive, non-overlapping pieces of ``chunk_size`` characters.

    Args:
        text: String to split. ``None`` is treated like an empty string.
        chunk_size: Maximum number of characters per piece; must be positive.

    Returns:
        The pieces in their original order; the last one may be shorter than
        ``chunk_size``. An empty list when ``text`` is empty or ``None``.

    Raises:
        ValueError: If ``chunk_size`` is not positive (a negative size would otherwise
            silently produce no pieces at all).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

## 切分文本
import torch  # imported here because this cell's own import list does not include torch

# Rebuild pdf_content as 60-character chunks, each tagged with its 1-based page label.
pdf_content = []
for page_number, page in enumerate(pdf.pages, start=1):
    # extract_text() may return None for a page without text in some pdfplumber
    # versions; fall back to "" so such a page simply contributes no chunks.
    text = page.extract_text() or ""
    for chunk_text in split_text_fixed_size(text, 60):
        pdf_content.append({
            'page': 'page_' + str(page_number),
            'content': chunk_text
        })

# Number of best chunks kept per question and per model.
# NOTE(review): the output files are named "*_top10" but only 3 candidates are kept;
# the value is left at 3 here — confirm which of the two is intended.
TOP_K = 3

# Embedding models to run, paired position-by-position with the short tag used in
# each model's output file name.
model_names = ["TencentBAC/Conan-embedding-v1","Classical/Yinka","infgrad/stella-base-zh-v3-1792d"]
model_tags = ["conan", "yinka", "stella"]

# Use the GPU when present and fall back to the CPU otherwise, so this cell also runs
# on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The texts to encode do not depend on the model, so build them once.
questions_sentences = [x['question'] for x in questions]
# One string per 60-character chunk; several chunks can share the same page label.
pdf_content_sentences = [x["content"] for x in pdf_content]

for model_tag, model_name in zip(model_tags, model_names):
    model = SentenceTransformer(model_name, device=device)

    question_embeddings = model.encode(questions_sentences, normalize_embeddings=True)
    pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

    # Recall: score every chunk against every question.
    for query_idx, feat in enumerate(question_embeddings):
        score = feat @ pdf_embeddings.T
        # argsort() is ascending, so the last TOP_K positions are the best chunks in
        # worst-to-best order; reverse them so the saved list is best-first. The
        # rank-fusion cell treats a candidate's list position as its rank.
        top_chunk_idxs = score.argsort()[-TOP_K:][::-1]
        # Positions index chunks, so page labels come from the chunk records.
        questions[query_idx]['reference'] = [pdf_content[x]['page'] for x in top_chunk_idxs]

    # Save this model's candidate lists (`questions` is overwritten on the next model).
    with open(f'../data/Coggle比赛数据/汽车知识问答/submit_{model_tag}_top10.json', 'w', encoding='utf8') as up:
        json.dump(questions, up, ensure_ascii=False, indent=4)

Some weights of BertModel were not initialized from the model checkpoint at infgrad/stella-base-zh-v3-1792d and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Rerank: fuse the candidate lists of the three embedding models with reciprocal rank
# fusion and keep the best page per question.
import json


def _load_json(path):
    """Read and parse a UTF-8 JSON file, closing the file handle afterwards."""
    with open(path, encoding='utf8') as handle:
        return json.load(handle)


def _rrf_scores(ranked_lists, k):
    """Return {page: fused score}; an entry at 0-based position idx adds 1 / (idx + k)."""
    fused = {}
    for ranked in ranked_lists:
        for idx, page in enumerate(ranked):
            # The same formula is used for the first and for every repeated occurrence.
            fused[page] = fused.get(page, 0.0) + 1 / (idx + k)
    return fused


yinka = _load_json("../data/Coggle比赛数据/汽车知识问答/submit_yinka_top10.json")
conan = _load_json("../data/Coggle比赛数据/汽车知识问答/submit_conan_top10.json")
stella = _load_json("../data/Coggle比赛数据/汽车知识问答/submit_stella_top10.json")

# zip() silently stops at the shortest input; fail loudly if the files are misaligned.
assert len(yinka) == len(conan) == len(stella), "candidate files have different lengths"

fusion_result = []
k = 60  # smoothing constant of the reciprocal-rank formula
for q1, q2, q3 in zip(yinka, conan, stella):
    # All three models contribute: q1 = yinka, q2 = conan, q3 = stella.
    # Assumes each 'reference' list is ordered best-first — verify against the cell
    # that writes the submit_*_top10.json files.
    fusion_score = _rrf_scores([q1['reference'], q2['reference'], q3['reference']], k)

    # Highest fused score first; the winning page becomes the final answer.
    sorted_dict = sorted(fusion_score.items(), key=lambda item: item[1], reverse=True)
    q1['reference'] = sorted_dict[0][0]
    fusion_result.append(q1)

with open('../data/Coggle比赛数据/汽车知识问答/submit_fusion_yinka_stella_conan.json', 'w', encoding='utf8') as up:
    json.dump(fusion_result, up, ensure_ascii=False, indent=4)
