## 任务4：文本多路召回与重排序
## BM25+BGE

In [1]:
# 导入数据库
import jieba,json,pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [2]:
# Load the competition data: the question list and the knowledge-base PDF.

## Load the questions; the context manager closes the file handle once it is parsed
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

## Load the knowledge PDF
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")

# Preprocessing: one record per PDF page, labelled with its 1-based page number
pdf_content = []
for page_idx, page in enumerate(pdf.pages):
    pdf_content.append({
        "page": "page_" + str(page_idx + 1),
        # `or ""` keeps the content a string even if extract_text() yields None
        # (NOTE(review): some pdfplumber releases appear to do so for pages
        # without text); the tokenising / encoding steps need a str.
        "content": page.extract_text() or ""
    })

pdf_content[-1]  # the extracted text of a page ends with its printed page number

{'page': 'page_354',
 'content': '技术资料\n公制术语\n术语 说明\n术语 说明\nN 牛\n% 百分比 Nm 牛米\nX:1 比值 V 伏特\n℃ 摄氏温度 W 瓦特\nAh 安时 kPa 千帕\nm 米 kW 千瓦\ncm 厘米\nmm 毫米\ng 克\nkg 千克\nh 小时\nmin 分钟\ns 秒\nrpm 每分钟转数\nkm/h 千米每小时\nL 升\nmL 毫升\n354'}

In [3]:
# Prepare the corpus for BM25

## Segment the text of every page into words with jieba
pdf_content_words = list(
    map(lambda page_record: jieba.lcut(page_record["content"]), pdf_content)
)

# Fit BM25 on the corpus; what it is fed are the word lists, not the raw strings
bm25 = BM25Okapi(pdf_content_words)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\yx140\AppData\Local\Temp\jieba.cache
Loading model cost 0.311 seconds.
Prefix dict has been built successfully.


In [19]:
# Encode the questions and the PDF texts with the embedding model

## Load the model; run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer("BAAI/bge-small-zh-v1.5", device=device)

## Collect the raw strings to encode
questions_sentences = [x['question'] for x in questions]
pdf_content_sentences = [x["content"] for x in pdf_content]

# Compute the embeddings; normalize_embeddings=True is requested so that the
# dot product used for retrieval behaves as a cosine similarity
question_embeddings = model.encode(questions_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

# Preview one entry. The index is clamped to the last element because the
# corpus may hold fewer than 790 texts (e.g. one entry per page), in which
# case a fixed index of 789 would raise IndexError.
pdf_content_sentences[min(789, len(pdf_content_sentences) - 1)]

'感器。请勿使用玻璃清洁剂清洁内后视镜。\n胎压监测系统\n您的车辆配备主动式胎压监测'

In [5]:
# Two-way recall: fuse the embedding ranking with the BM25 ranking
from scipy.stats import rankdata  # assigns a rank to every element of an array

for query_idx, feat in enumerate(question_embeddings):

    # Dense score: dot product of the question embedding with every corpus embedding
    score1 = feat @ pdf_embeddings.T
    # Sparse score: BM25 over the jieba-tokenised question
    score2 = bm25.get_scores(jieba.lcut(questions[query_idx]['question']))

    # The two scores live on different scales, so their ranks are summed
    # instead of the raw values; the entry with the largest rank sum wins.
    score = rankdata(score1) + rankdata(score2)
    max_score_idx = score.argsort()[-1]

    # Take the page label from the corpus record itself rather than assuming
    # "index + 1 == page number"; that assumption only holds while pdf_content
    # has exactly one entry per page.
    questions[query_idx]["reference"] = pdf_content[max_score_idx]["page"]
    


In [6]:
# Write the two-way recall predictions to the submission file
with open(
    '../data/Coggle比赛数据/汽车知识问答/submit_bm25_bgesmall.json',
    mode='w',
    encoding='utf8',
) as submit_file:
    # ensure_ascii=False stores the Chinese text as-is instead of \u escapes
    json.dump(questions, fp=submit_file, ensure_ascii=False, indent=4)

## BGE/Text Segment

In [8]:
# 导入库
import jieba, json, pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from sentence_transformers import SentenceTransformer

## Load the questions; the context manager closes the file handle once it is parsed
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as questions_file:
    questions = json.load(questions_file)

## Load the knowledge PDF (kept open: the chunking cell reads pdf.pages)
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")


In [10]:
# Text splitting
def split_text_fixed_size(text, chunk_size):
    """Cut `text` into consecutive, non-overlapping slices of `chunk_size` items.

    The slices are taken left to right; the last one is shorter when
    len(text) is not a multiple of `chunk_size`. An empty `text` gives an
    empty list.

    Args:
        text: Sliceable sequence to split (a string in this notebook).
        chunk_size: Length of every slice except possibly the last one.

    Returns:
        List of slices in their original order.
    """
    pieces = []
    for start in range(0, len(text), chunk_size):
        pieces.append(text[start:start + chunk_size])
    return pieces

# Build the retrieval corpus: every page is cut into fixed-size chunks and
# each chunk keeps the label of the page it came from.
pdf_content = []
for page_idx, page in enumerate(pdf.pages):

    # Full text of the page; `or ""` keeps it a string even if extract_text()
    # yields None (NOTE(review): some pdfplumber releases appear to do so for
    # pages without text), which would otherwise break len() in the splitter.
    text = page.extract_text() or ""

    # Every run of 40 characters (not sentences) becomes one embedding unit
    for chunk_text in split_text_fixed_size(text, 40):
        pdf_content.append({
            'page': 'page_' + str(page_idx + 1),
            'content': chunk_text
        })

In [18]:
# Load the model; run on the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer("BAAI/bge-small-zh-v1.5", device=device)

## Collect the raw strings to encode
questions_sentences = [x['question'] for x in questions]
pdf_content_sentences = [x["content"] for x in pdf_content]  # entries of pdf_content are now text chunks, not whole pages

# Compute the embeddings; normalize_embeddings=True is requested so that the
# dot product used for retrieval behaves as a cosine similarity
question_embeddings = model.encode(questions_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

# Preview one chunk; the index is clamped so a corpus with fewer than 790
# chunks does not raise IndexError
pdf_content_sentences[min(789, len(pdf_content_sentences) - 1)]

'感器。请勿使用玻璃清洁剂清洁内后视镜。\n胎压监测系统\n您的车辆配备主动式胎压监测'

In [None]:
# Recall scoring: for every question pick the best-matching chunk and report its page
for query_idx, feat in enumerate(question_embeddings):
    # Dot product of the question embedding with every chunk embedding
    score = feat @ pdf_embeddings.T

    # argmax() on this 1-D score vector already returns a single integer index;
    # subscripting that scalar (the former `argmax()[-1]`) raises IndexError.
    max_score_page_idx = score.argmax()

    # Several chunks share one page, so the page label is read from the chunk record
    questions[query_idx]['reference'] = pdf_content[max_score_page_idx]['page']