# 加载数据集

In [26]:
from datasets import load_dataset
import jieba
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score

In [27]:
data = load_dataset('C-MTEB/DuRetrieval')
qrels = load_dataset('C-MTEB/DuRetrieval-qrels')

In [28]:
data

DatasetDict({
    corpus: Dataset({
        features: ['id', 'text'],
        num_rows: 100001
    })
    queries: Dataset({
        features: ['id', 'text'],
        num_rows: 2000
    })
})

In [29]:
qrels

DatasetDict({
    dev: Dataset({
        features: ['qid', 'pid', 'score'],
        num_rows: 9839
    })
})

In [30]:
qrels_pd = pd.DataFrame(qrels['dev'])
qrels_dict = qrels_pd.groupby('qid')['pid'].apply(list).to_dict()
qrels_dict

{'0014452c4edd2dee9b40b0f78bfaf8e4': ['a2635150b429b5f2e7f6a19161e5cd48',
  '6b1ab3ee329e26780b85c6b9d7a6802c',
  '94575ac389d80de88128c3f1788d395b',
  '1c37837640973fbe28c1035fb7dcac61',
  '2e23a09ebd69974932d653b3d39b31c7',
  '373bbf909da9704d2973f2b0411cedf1',
  '2d9f2b19a60b45ef24e4b265bbb05847'],
 '001a1445571eb22b00b899f7a431988d': ['43cf0e73e8f92c2ffa3dbaf2cd718176',
  '30d02ef813942ae81da43dbaf6edd6ed',
  '03712065d5cc89855c6d23ed78163336',
  '966a3ea48c601714703ed795115f3961',
  '4b187e8405c95fb421999286ab7122c9',
  'a4dd3b8b9b3549010bcb595af06de3fd',
  'dec02b84a98cea8f8c3c032816c6ddd1'],
 '002da0e5525d4841658b3b57a1c04b9b': ['1b4d333e09d3265b2e7fc94c10be1c55',
  '9441f87127c3d70449e6d4a87cb6eec1',
  '56f3cb86e5dece094fedcbdfba7b79ea',
  '7f09c13d7ca6e7e0d4a63e5add5e7388'],
 '002dd675c4bdacea74137639dd4146b8': ['f543979ffff9df4c017b7c9f64fac57a',
  '995adfe0f5bcd3ba84451f0ad5f3a642',
  '4359a7cb8654246f0ac016ffd8d8c33c',
  '142d6b6595be6795024ca6a512daf31a',
  '1d95ae029b0a9e

# BM25 (gensim)

In [31]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, OkapiBM25Model
from gensim.similarities import SparseMatrixSimilarity
import numpy as np
corpus = [
    "今天 很开心",
    "明天 也 很开心",
    "Python is a programming language"
]
corpus = [doc.lower().split() for doc in corpus]#[["Hello", "world"], ["bar", "bar"], ["foo", "bar"]]
dictionary = Dictionary(corpus)
bm25_model = OkapiBM25Model(dictionary=dictionary)
bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]
bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=len(corpus), num_terms=len(dictionary),
                                   normalize_queries=False, normalize_documents=False)
query = ["learn", '今天']
tfidf_model = TfidfModel(dictionary=dictionary, smartirs='bnn')  # Enforce binary weighting of queries
tfidf_query = tfidf_model[dictionary.doc2bow(query)]
similarities = bm25_index[tfidf_query]
print(similarities)
best_document = corpus[np.argmax(similarities)]
" ".join(best_document)

[0.62295806 0.         0.        ]


'今天 很开心'

In [32]:
corpus = [
    jieba.lcut(x['text']) for x in data['corpus']
]
dictionary = Dictionary(corpus)
bm25_model = OkapiBM25Model(dictionary=dictionary)
bm25_corpus = bm25_model[list(map(dictionary.doc2bow, corpus))]
bm25_index = SparseMatrixSimilarity(bm25_corpus, num_docs=len(corpus), num_terms=len(dictionary),
                                   normalize_queries=False, normalize_documents=False)

In [33]:
# https://radimrehurek.com/gensim/models/tfidfmodel.html
tfidf_model = TfidfModel(dictionary=dictionary, smartirs='afc')

In [34]:
%%time
query = '请注意识别,谨防上当受骗!'
query = jieba.lcut(query)
tfidf_query = tfidf_model[dictionary.doc2bow(query)]
similarities = bm25_index[tfidf_query]

best_document = corpus[np.argmax(similarities)]
" ".join(best_document)

CPU times: total: 0 ns
Wall time: 18.4 ms


'该 经验 图片 、 文字 中 可能 存在 外站 链接 或 电话号码 等 , 请 注意 识别 , 谨防 上当受骗 ! 百度 经验 : jingyan . baidu . com 在 之前 win7 的 设置 中 大部分 操作 都 是 在 控制面板 中 完成 的 , 可是 在 新 的 win10 系统 中 却 发现 控制面板 很难 找 了 , 那么 在 哪里 能 找到 呢 ? 让 我 来 告诉 你 吧 ~ 百度 经验 : jingyan . baidu . com 百度 经验 : jingyan . baidu . com1   打开 开始菜单 , 点击 所有 应用   步骤 阅读   2   找到 windows 系统 , 点击 打开 下级菜单   步骤 阅读   3   在 当中 找到 控制面板 点击 打开 即可   步骤 阅读   END 百度 经验 : jingyan . baidu . com'

In [37]:
qrels_dict = {x['qid']: x['pid'] for x in qrels["dev"]}
pid_array = np.array([x['id'] for x in data['corpus']])


In [43]:
# 2. 直接导入rank_bm25模型
from rank_bm25 import BM25Okapi
tokenized_corpus = [jieba.lcut(x["text"]) for x in data["corpus"]]
bm25 = BM25Okapi(tokenized_corpus)

# tfidf_model=TfidfModel(dictionary=dictionary,smartirs='afc')

In [44]:
topn = 30
query_ndcg_score = []
for query_data in tqdm(data['queries']):
    query = jieba.lcut(query_data['text'])
    query_qid = query_data['id']
    query_pids = qrels_dict[query_qid]
    
    # tfidf_query = tfidf_model[dictionary.doc2bow(query)]
    similarities = np.array(bm25.get_scores(query))
    # similarities=bm25_index[tfidf_query]
    top_results = similarities.argsort()[::-1][:topn]
    top_results = [data['corpus'][int(x)]['id'] for x in top_results]
    true_relevance  = [[x in query_pids for x in top_results]]
    scores = [list(similarities[similarities.argsort()[::-1][:topn]])]

    query_ndcg_score.append(ndcg_score(true_relevance, scores))

100%|██████████| 2000/2000 [03:48<00:00,  8.75it/s]


In [45]:
np.mean(query_ndcg_score), np.std(query_ndcg_score)

(0.3449331376092457, 0.3423558128037693)

# BM25 (rank_bm25)

In [22]:
from rank_bm25 import BM25Okapi

corpus = [
    "Hello there good man!",
    "It is quite windy in London",
    "How is the weather today?"
]

tokenized_corpus = [doc.split(" ") for doc in corpus]

bm25 = BM25Okapi(tokenized_corpus)

query = "windy London"
tokenized_query = query.split(" ")
bm25.get_scores(tokenized_query)

array([0.        , 0.93729472, 0.        ])

In [23]:
corpus = [jieba.lcut(x['text']) for x in data['corpus']]
bm25 = BM25Okapi(corpus)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.539 seconds.
Prefix dict has been built successfully.


In [53]:
topn = 30
query_ndcg_score = []
for query_data in tqdm(data['queries']):
    query = jieba.lcut(query_data['text'])
    query_qid = query_data['id']
    query_pids = qrels[query_qid]
    
    similarities = np.array(bm25.get_scores(query))
    top_results = similarities.argsort()[::-1][:topn]
    top_results = [data['corpus'][int(x)]['id'] for x in top_results]
    true_relevance  = [[x in query_pids for x in top_results]]
    scores = [list(similarities[similarities.argsort()[::-1][:topn]])]

    query_ndcg_score.append(ndcg_score(true_relevance, scores))

100%|██████████| 2000/2000 [06:26<00:00,  5.17it/s]


In [54]:
np.mean(query_ndcg_score), np.std(query_ndcg_score)

(0.7077681714845733, 0.3301973263950749)

# M3E

In [7]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('./hugging-face-model/moka-ai/m3e-small/')

#Our sentences we like to encode
sentences = [
    '* Moka 此文本嵌入模型由 MokaAI 训练并开源，训练脚本使用 uniem',
    '* Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练',
    '* Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算，异质文本检索等功能，未来还会支持代码检索，ALL in one'
]

#Sentences are encoded by calling model.encode()
embeddings = model.encode(sentences)

#Print the embeddings
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding.shape)
    print("")

Sentence: * Moka 此文本嵌入模型由 MokaAI 训练并开源，训练脚本使用 uniem
Embedding: (512,)

Sentence: * Massive 此文本嵌入模型通过**千万级**的中文句对数据集进行训练
Embedding: (512,)

Sentence: * Mixed 此文本嵌入模型支持中英双语的同质文本相似度计算，异质文本检索等功能，未来还会支持代码检索，ALL in one
Embedding: (512,)



In [8]:
corpus = [x['text'] for x in data['corpus']][:]
embeddings = model.encode(corpus, normalize_embeddings=True)

In [16]:
topn = 30
query_ndcg_score = []
for query_data in tqdm(data['queries']):
    query_feat = model.encode(query_data['text'], normalize_embeddings=True)
    query_qid = query_data['id']
    query_pids = qrels[query_qid]
    
    similarities = query_feat.dot(embeddings.T)
    top_results = similarities.argsort()[::-1][:topn]
    top_results = [data['corpus'][int(x)]['id'] for x in top_results]
    true_relevance  = [[x in query_pids for x in top_results]]
    scores = [list(similarities[similarities.argsort()[::-1][:topn]])]

    query_ndcg_score.append(ndcg_score(true_relevance, scores))

100%|██████████| 2000/2000 [01:32<00:00, 21.68it/s]


In [17]:
np.mean(query_ndcg_score), np.std(query_ndcg_score)

(0.7614511933399406, 0.27839499502211373)

# BGE

In [18]:
from sentence_transformers import SentenceTransformer
sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]
model = SentenceTransformer('./hugging-face-model/BAAI/bge-small-zh-v1.5/')
embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)
embeddings_2 = model.encode(sentences_2, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.86921406 0.8665448 ]
 [0.899487   0.8784326 ]]


In [19]:
corpus = [x['text'] for x in data['corpus']][:]
embeddings = model.encode(corpus, normalize_embeddings=True)

In [20]:
topn = 30
query_ndcg_score = []
for query_data in tqdm(data['queries']):
    query_feat = model.encode(query_data['text'], normalize_embeddings=True)
    query_qid = query_data['id']
    query_pids = qrels[query_qid]
    
    similarities = query_feat.dot(embeddings.T)
    top_results = similarities.argsort()[::-1][:topn]
    top_results = [data['corpus'][int(x)]['id'] for x in top_results]
    true_relevance  = [[x in query_pids for x in top_results]]
    scores = [list(similarities[similarities.argsort()[::-1][:topn]])]

    query_ndcg_score.append(ndcg_score(true_relevance, scores))

100%|██████████| 2000/2000 [01:33<00:00, 21.35it/s]


In [21]:
np.mean(query_ndcg_score), np.std(query_ndcg_score)

(0.8629532887126161, 0.221727574464936)

# BGE with ReRank

In [22]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('./hugging-face-model/BAAI/bge-reranker-base/')
rerank_model = AutoModelForSequenceClassification.from_pretrained('./hugging-face-model/BAAI/bge-reranker-base/')
rerank_model.cuda()
rerank_model.eval()

pairs = [['我很开心', '我很沮丧'], ['我很开心', '我很快乐']]
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    inputs = {key: inputs[key].cuda() for key in inputs.keys()}
    scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()
    print(scores)

tensor([-6.0319,  8.9878], device='cuda:0')


In [29]:
topn = 30
query_ndcg_score = []
for query_data in tqdm(data['queries']):
    query_feat = model.encode(query_data['text'], normalize_embeddings=True)
    query_qid = query_data['id']
    query_pids = qrels[query_qid]
    
    similarities = query_feat.dot(embeddings.T)
    top_results = similarities.argsort()[::-1][:topn]

    pairs = []
    for xi in top_results:
        pairs.append([query_data['text'], corpus[xi]])

    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    with torch.no_grad():
        inputs = {key: inputs[key].cuda() for key in inputs.keys()}
        scores = rerank_model(**inputs, return_dict=True).logits.view(-1, ).float()

    top_results = np.array(top_results)
    rerank_result = top_results[scores.cpu().data.numpy().argsort()[::-1]]
    
    top_results = [data['corpus'][int(x)]['id'] for x in rerank_result]
    true_relevance  = [[x in query_pids for x in top_results]]
    scores = [list(similarities[similarities.argsort()[::-1][:topn]])]

    query_ndcg_score.append(ndcg_score(true_relevance, scores))

100%|██████████| 2000/2000 [27:28<00:00,  1.21it/s]


In [30]:
np.mean(query_ndcg_score), np.std(query_ndcg_score)

(0.9160629142553329, 0.19732814717552222)

# BCEmbedding（有道）

In [31]:
from sentence_transformers import SentenceTransformer

# list of sentences
sentences = ['sentence_0', 'sentence_1', ...]

# init embedding model
model = SentenceTransformer("./hugging-face-model/maidalun1020/bce-embedding-base_v1")

# set max_length to 512 to avoid an error.
model.max_seq_length = 512

# extract embeddings
embeddings = model.encode(sentences, normalize_embeddings=True)

In [32]:
corpus = [x['text'] for x in data['corpus']][:]
embeddings = model.encode(corpus, normalize_embeddings=True)

In [33]:
topn = 30
query_ndcg_score = []
for query_data in tqdm(data['queries']):
    query_feat = model.encode(query_data['text'], normalize_embeddings=True)
    query_qid = query_data['id']
    query_pids = qrels[query_qid]
    
    similarities = query_feat.dot(embeddings.T)
    top_results = similarities.argsort()[::-1][:topn]
    top_results = [data['corpus'][int(x)]['id'] for x in top_results]
    true_relevance  = [[x in query_pids for x in top_results]]
    scores = [list(similarities[similarities.argsort()[::-1][:topn]])]

    query_ndcg_score.append(ndcg_score(true_relevance, scores))

100%|██████████| 2000/2000 [02:06<00:00, 15.76it/s]


In [34]:
np.mean(query_ndcg_score), np.std(query_ndcg_score)

(0.8436841563775636, 0.22071295950607342)

# BCEmbedding with ReRank

In [35]:
from sentence_transformers import CrossEncoder

# init reranker model
rerank_model = CrossEncoder('./hugging-face-model/maidalun1020/bce-reranker-base_v1', max_length=512)

pairs = [['我很开心', '我很沮丧'], ['我很开心', '我很快乐']]
scores = rerank_model.predict(pairs)

In [36]:
query_feats = model.encode([x['text'] for x in data['queries']], normalize_embeddings=True)

In [37]:
query_feat.dot(embeddings.T)

array([0.17447105, 0.19908555, 0.15706639, ..., 0.25002584, 0.2793538 ,
       0.28668842], dtype=float32)

In [40]:
topn = 30
query_ndcg_score = []
for query_data in tqdm(data['queries']):
    query_feat = model.encode(query_data['text'], normalize_embeddings=True)
    query_qid = query_data['id']
    query_pids = qrels[query_qid]
    
    similarities = query_feat.dot(embeddings.T)
    top_results = similarities.argsort()[::-1][:topn]

    pairs = []
    for xi in top_results:
        pairs.append([query_data['text'], corpus[xi]])

    with torch.no_grad():
        scores = rerank_model.predict(pairs)

    top_results = np.array(top_results)
    rerank_result = top_results[scores.argsort()[::-1]]
    
    top_results = [data['corpus'][int(x)]['id'] for x in rerank_result]
    true_relevance  = [[x in query_pids for x in top_results]]
    scores = [list(similarities[similarities.argsort()[::-1][:topn]])]

    query_ndcg_score.append(ndcg_score(true_relevance, scores))

100%|██████████| 2000/2000 [27:58<00:00,  1.19it/s]


In [57]:
np.mean(query_ndcg_score), np.std(query_ndcg_score)

(0.46059702380952383, 0.3968857660785824)

# GTE

In [58]:
from sentence_transformers import SentenceTransformer
sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]
model = SentenceTransformer('./hugging-face-model/thenlper/gte-small-zh')
embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)
embeddings_2 = model.encode(sentences_2, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

[[0.962006   0.95747596]
 [0.95021254 0.94485617]]


In [59]:
corpus = [x['text'] for x in data['corpus']][:]
embeddings = model.encode(corpus, normalize_embeddings=True)

In [61]:
topn = 30
query_ndcg_score = []
for query_data in tqdm(data['queries']):
    query_feat = model.encode(query_data['text'], normalize_embeddings=True)
    query_qid = query_data['id']
    query_pids = qrels[query_qid]
    
    similarities = query_feat.dot(embeddings.T)
    top_results = similarities.argsort()[::-1][:topn]
    top_results = [data['corpus'][int(x)]['id'] for x in top_results]
    true_relevance  = [[x in query_pids for x in top_results]]
    scores = [list(similarities[similarities.argsort()[::-1][:topn]])]

    query_ndcg_score.append(ndcg_score(true_relevance, scores))

100%|██████████| 2000/2000 [01:58<00:00, 16.90it/s]


In [41]:
np.mean(query_ndcg_score), np.std(query_ndcg_score)

(0.8563148126485296, 0.21721973465173638)