## 任务4：文本多路召回与重排序
## BM25+BGE

In [35]:
# 导入库
import jieba,json,pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from sentence_transformers import SentenceTransformer

In [None]:
# Load the training data

## Load the questions; the context manager closes the file once it is parsed
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as question_file:
    questions = json.load(question_file)

## Open the knowledge-base PDF
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")

# Preprocess: build one dict per PDF page holding a 1-based page label and the
# text extracted from that page, appended in document order.
pdf_content = []
for page_idx, page in enumerate(pdf.pages):
    pdf_content.append({
        # List index i always carries the label "page_{i+1}".
        "page": "page_" + str(page_idx + 1),
        # Fall back to "" so that later tokenising/encoding never receives
        # None for a page that yields no text.
        "content": page.extract_text() or ""
    })

# Spot-check a single entry (the 99th from the end) to confirm that the label
# and the page content line up.
pdf_content[-99]

In [37]:
# Prepare the data for BM25

## Tokenise every page with jieba. The result is one outer list holding one
## token list per page, and that token corpus is what BM25Okapi is built from.
pdf_content_words = list(map(jieba.lcut, (page["content"] for page in pdf_content)))
bm25 = BM25Okapi(pdf_content_words)


# Small demo: jieba.lcut returns the segmented words as a plain list.
test = "你好我是你的爸爸我热爱打你"
result = jieba.lcut(test)
# result : ['你好', '我', '是', '你', '的', '爸爸', '我', '热爱', '打', '你']

In [None]:
# Encode questions and pages with the embedding model

## Load the model. Prefer the GPU, but fall back to the CPU so the cell still
## runs on hosts without CUDA instead of raising at construction time.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer("BAAI/bge-small-zh-v1.5", device=device)

## Flatten the inputs: one list of question strings, one list of page texts
questions_sentences = [x['question'] for x in questions]  # one item per question
pdf_content_sentences = [x["content"] for x in pdf_content]  # one item per page

# Embed both sides; normalize_embeddings=True is requested so that the dot
# product used later for scoring compares unit-length vectors.
question_embeddings = model.encode(questions_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

pdf_content_sentences[:5]

In [39]:
# Two-way recall: fuse the dense ranking with the BM25 ranking
from scipy.stats import rankdata  # assigns a rank to every element of an array

for query_idx, feat in enumerate(question_embeddings):
    query_text = questions[query_idx]['question']

    # Dense score: dot product of the question vector with every page vector.
    score1 = feat @ pdf_embeddings.T
    # Lexical score: BM25 over the jieba tokens of the same question.
    score2 = bm25.get_scores(jieba.lcut(query_text))

    # Sum the two rank vectors so neither raw score scale dominates, then take
    # the index that sorts last (highest fused rank).
    score = rankdata(score1) + rankdata(score2)
    best_idx = score.argsort()[-1]

    # Page labels are 1-based while list indices are 0-based.
    questions[query_idx]["reference"] = 'page_' + str(best_idx + 1)
    


In [40]:
# Write the two-way recall results to the submission file
submit_path = '../data/Coggle比赛数据/汽车知识问答/submit_bm25_bgesmall.json'
with open(submit_path, 'w', encoding='utf8') as up:
    # ensure_ascii=False keeps the Chinese text readable in the output file.
    json.dump(questions, up, ensure_ascii=False, indent=4)

## BGE/Text Segment(语义模型配合文本切分)

In [41]:
# 导入库
import jieba, json, pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from sentence_transformers import SentenceTransformer

## Load the questions; the context manager closes the file once it is parsed
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as question_file:
    questions = json.load(question_file)

## Open the knowledge-base PDF (kept open: the next cell reads its pages)
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")


In [42]:
# Split text into fixed-size chunks
def split_text_fixed_size(text, chunk_size):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Args:
        text: String to split. ``None`` and ``""`` both produce an empty list,
            so a page with no extracted text simply contributes no chunks.
        chunk_size: Maximum number of characters per piece; must be positive.

    Returns:
        The pieces in their original order. Only the last piece may be
        shorter than *chunk_size*.

    Raises:
        ValueError: If *chunk_size* is zero or negative.
    """
    # A negative step would make range() yield nothing and silently drop the
    # whole text, so reject it explicitly (zero already raised ValueError).
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

pdf_content = []
for page_no, page in enumerate(pdf.pages, start=1):

    # Pull the full text of the current page.
    text = page.extract_text()
    page_label = 'page_' + str(page_no)

    # Cut the page into pieces of 40 characters; every piece becomes its own
    # record to embed, tagged with the page it came from.
    for chunk_text in split_text_fixed_size(text, 40):
        pdf_content.append({'page': page_label, 'content': chunk_text})


In [None]:
# Print the page label of every chunk, one per line, to inspect the mapping.
for chunk in pdf_content:
    page_label = chunk["page"]
    print(page_label)

In [None]:
# Load the model

# Prefer the GPU, but fall back to the CPU so the cell still runs on hosts
# without CUDA instead of raising at construction time.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer("BAAI/bge-small-zh-v1.5", device=device)

## Collect the raw question strings
questions_sentences = [x['question'] for x in questions]

# x["content"] used to be a whole page; here it is a 40-character chunk that
# still carries its page label, so one page maps to several chunks.
pdf_content_sentences = [x["content"] for x in pdf_content]

# Embed both sides; normalize_embeddings=True is requested so that the dot
# product used later for scoring compares unit-length vectors.
question_embeddings = model.encode(questions_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

len(pdf_content_sentences)
len(pdf_embeddings)
# One vector per chunk (the original run noted 3743 chunks -> 3743 vectors).

In [45]:
# Score the recall: choose the best-matching chunk for every question
for question, feat in zip(questions, question_embeddings):
    score = feat @ pdf_embeddings.T
    # Several chunk indices now share one page, so the page label is read from
    # the winning chunk's record rather than derived from the index.
    best_chunk_idx = score.argsort()[-1]
    question['reference'] = pdf_content[best_chunk_idx]['page']

submit_path = '../data/Coggle比赛数据/汽车知识问答/submit_bgesmall_Text_Segment.json'
with open(submit_path, 'w', encoding='utf8') as up:
    json.dump(questions, up, ensure_ascii=False, indent=4)

## BM25 + BGE/Text Segment

In [47]:
import jieba, json, pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from sentence_transformers import SentenceTransformer

# Load the questions; the context manager closes the file once it is parsed.
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as question_file:
    questions = json.load(question_file)
# The PDF stays open because the chunking loop below reads its pages.
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")
# Filled below with one {'page', 'content'} record per text chunk.
pdf_content = []

def split_text_fixed_size(text, chunk_size):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    Args:
        text: String to split. ``None`` and ``""`` both produce an empty list,
            so a page with no extracted text simply contributes no chunks.
        chunk_size: Maximum number of characters per piece; must be positive.

    Returns:
        The pieces in their original order. Only the last piece may be
        shorter than *chunk_size*.

    Raises:
        ValueError: If *chunk_size* is zero or negative.
    """
    # A negative step would make range() yield nothing and silently drop the
    # whole text, so reject it explicitly (zero already raised ValueError).
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        return []
    return [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]

# Cut every page into 40-character chunks, each tagged with its 1-based page
# label, and append them to pdf_content in document order.
for page_no, page in enumerate(pdf.pages, start=1):
    text = page.extract_text()
    page_label = 'page_' + str(page_no)
    for chunk_text in split_text_fixed_size(text, 40):
        pdf_content.append({'page': page_label, 'content': chunk_text})

# Tokenise each chunk with jieba and build the BM25 index over the token lists.
pdf_content_words = []
for chunk in pdf_content:
    pdf_content_words.append(jieba.lcut(chunk['content']))
bm25 = BM25Okapi(pdf_content_words)

# Prefer the GPU, but fall back to the CPU so the cell still runs on hosts
# without CUDA instead of raising at construction time.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('BAAI/bge-small-zh-v1.5', device=device)

# Raw strings to embed: one per question and one per text chunk.
question_sentences = [x['question'] for x in questions]
pdf_content_sentences = [x['content'] for x in pdf_content]

# normalize_embeddings=True is requested so that the dot product used for
# scoring below compares unit-length vectors.
question_embeddings = model.encode(question_sentences, normalize_embeddings=True)
pdf_embeddings = model.encode(pdf_content_sentences, normalize_embeddings=True)

from scipy.stats import rankdata

# Fuse the dense and BM25 rankings per question and keep the page of the
# chunk with the highest combined rank.
for question, feat in zip(questions, question_embeddings):
    dense_scores = feat @ pdf_embeddings.T
    bm25_scores = bm25.get_scores(jieba.lcut(question["question"]))

    # Adding ranks rather than raw scores keeps the two scales comparable.
    fused = rankdata(dense_scores) + rankdata(bm25_scores)
    best_chunk_idx = fused.argsort()[-1]
    question['reference'] = pdf_content[best_chunk_idx]['page']

submit_path = '../data/Coggle比赛数据/汽车知识问答/submit_bm25_bgesmall_Text_Segment.json'
with open(submit_path, 'w', encoding='utf8') as up:
    json.dump(questions, up, ensure_ascii=False, indent=4)

## BM25 with ReRank

In [28]:
import jieba, json, pdfplumber
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from rank_bm25 import BM25Okapi
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load the questions; the context manager closes the file once it is parsed.
with open("../data/Coggle比赛数据/汽车知识问答/questions.json", encoding="utf8") as question_file:
    questions = json.load(question_file)
pdf = pdfplumber.open("../data/Coggle比赛数据/汽车知识问答/初赛训练数据集.pdf")

# One record per page: a 1-based page label plus the page's extracted text.
pdf_content = []
for page_idx, page in enumerate(pdf.pages):
    pdf_content.append({
        'page': 'page_' + str(page_idx + 1),
        # Fall back to "" so that later tokenising never receives None for a
        # page that yields no text.
        'content': page.extract_text() or ""
    })

# Load the reranker (a sequence-classification model) and its tokenizer.
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-base')
rerank_model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-base')
# Prefer the GPU, but fall back to the CPU so the cell still runs on hosts
# without CUDA; the same device is reused for the inputs below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
rerank_model.to(device)
rerank_model.eval()  # inference mode

# BM25 index over the jieba tokens of every page.
pdf_content_words = [jieba.lcut(x['content']) for x in pdf_content]
bm25 = BM25Okapi(pdf_content_words)

for query_idx in range(len(questions)):
    question_text = questions[query_idx]["question"]

    # First stage: BM25 picks the indices of the 3 highest-scoring pages.
    doc_scores = bm25.get_scores(jieba.lcut(question_text))
    max_score_page_idxs = doc_scores.argsort()[-3:]

    # Second stage input: one [question, page text] pair per candidate, e.g.
    # [["question", "content1"], ["question", "content2"], ["question", "content3"]]
    pairs = [[question_text, pdf_content[idx]['content']] for idx in max_score_page_idxs]

    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():  # no gradients needed for inference; saves memory
        # Move every tokenizer output tensor to the device the model lives on.
        inputs = {key: inputs[key].to(device) for key in inputs.keys()}

        # One relevance logit per pair, flattened to a 1-D tensor,
        # e.g. tensor([-6.4908, -5.9194, -2.6678]).
        scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()

    # Keep the candidate with the highest reranker score; page labels are
    # 1-based while list indices are 0-based.
    max_score_page_idx = max_score_page_idxs[scores.cpu().numpy().argmax()]
    questions[query_idx]['reference'] = 'page_' + str(max_score_page_idx + 1)

# Write the reranked results to the submission file.
# NOTE(review): the file name spells "rerangk"; kept as is because downstream
# consumers may rely on this exact path.
submit_path = '../data/Coggle比赛数据/汽车知识问答/submit_bm25_rerangk.json'
with open(submit_path, 'w', encoding='utf8') as up:
    json.dump(questions, up, ensure_ascii=False, indent=4)