<a href="https://colab.research.google.com/github/jankovicsandras/bm25opt/blob/main/bm25opt_comparative_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BM25opt comparative test
More info:

https://github.com/jankovicsandras/bm25opt

https://github.com/dorianbrown/rank_bm25

https://en.wikipedia.org/wiki/Okapi_BM25


## Test dataset: WordPress-related QA from Hugging Face

In [1]:

! wget https://huggingface.co/datasets/mteb/cqadupstack-wordpress/resolve/main/corpus.jsonl
! wget https://huggingface.co/datasets/mteb/cqadupstack-wordpress/resolve/main/queries.jsonl
! ls -la

import random, json


# Builds the test data from corpus.jsonl: a random sample of documents
# ( corpus ), their titles used as questions in shuffled order ( questions )
# and, for each question, the 1-based index in corpus of the document
# it belongs to ( questionsolutions ).

# Fixed seed so the sampled documents and the question order are
# reproducible across runs; set to None to get a different sample each run.
SAMPLE_SEED = 42
# Upper bound on the number of documents used in the test.
SAMPLE_SIZE = 1000
rng = random.Random(SAMPLE_SEED)

# load from jsonl: one JSON document per line
wcorpus = []
with open('corpus.jsonl', encoding='utf-8') as f:
  wcstr = f.read()
  # skip blank lines (e.g. the empty string produced by a trailing newline);
  # json.loads('') would raise JSONDecodeError if such a line got sampled
  wcorpus = [ line for line in wcstr.split('\n') if line.strip() ]
print('len(wcorpus)',len(wcorpus))

# create sampledwcorpus, corpus, questions
# never ask for more documents than available: sample() raises ValueError otherwise
sampledwcorpus = rng.sample(wcorpus,min(SAMPLE_SIZE,len(wcorpus)))
corpus = []
qqs = []
for i,wline in enumerate(sampledwcorpus) :
  wjs = json.loads(wline)
  corpus.append( wjs['text'] )
  # remember which corpus index this title belongs to before shuffling
  qqs.append( [wjs['title'],i] )

# questions and solutions
rng.shuffle(qqs)
questions = [ q[0] for q in qqs ]
# +1 : solution IDs are 1-based
questionsolutions = [ q[1]+1 for q in qqs ]


--2024-10-24 09:31:50--  https://huggingface.co/datasets/mteb/cqadupstack-wordpress/resolve/main/corpus.jsonl
Resolving huggingface.co (huggingface.co)... 18.154.227.7, 18.154.227.87, 18.154.227.67, ...
Connecting to huggingface.co (huggingface.co)|18.154.227.7|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.hf.co/repos/e3/a1/e3a12a6c68820b63bcd7ae09aa898026eba332004f38239070204b3b146060bc/089b11077372513eca8cc16653485aff1f232f8be18d8c6263be4b3b2bda0078?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27corpus.jsonl%3B+filename%3D%22corpus.jsonl%22%3B&Expires=1730021511&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczMDAyMTUxMX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2UzL2ExL2UzYTEyYTZjNjg4MjBiNjNiY2Q3YWUwOWFhODk4MDI2ZWJhMzMyMDA0ZjM4MjM5MDcwMjA0YjNiMTQ2MDYwYmMvMDg5YjExMDc3MzcyNTEzZWNhOGNjMTY2NTM0ODVhZmYxZjIzMmY4YmUxOGQ4YzYyNjNiZTRiM2IyYmRhMDA3OD9yZXNwb25zZS1jb

## Comparative testing of BM25opt and rank_bm25
#### The order of the results and the BM25 scores should be equal for both libraries

In [4]:



! wget https://raw.githubusercontent.com/jankovicsandras/bm25opt/refs/heads/main/BM25opt.py
! pip install rank_bm25

from BM25opt import BM25opt, tokenizer_default
from rank_bm25 import BM25Okapi, BM25L, BM25Plus
import json, time, math

# Runs every question against both indexes, accumulates the time spent in
# each library's get_scores() call and prints the top-k results of every
# 100th question side by side for visual comparison.

# top results number
k = 10

# preparing tokenized corpus ( rank_bm25 is built from pre-tokenized documents )
tokenized_corpus = [ tokenizer_default(document) for document in corpus ]

# index building ( BM25opt is built from the raw document strings )
rank_bm25_index = BM25Okapi( tokenized_corpus )
bm25opt_index = BM25opt( corpus, algo='okapi' )

# timers: accumulated seconds spent in get_scores() of each library
t1 = 0
t2 = 0

# Running the questions
runquestions = True # TODO
if runquestions :
  for qi,q in enumerate(questions) :

    # tokenize and print question
    tokenizedquestion = tokenizer_default(q)
    if qi % 100 == 0 :
      print('\n----Question',qi,':',q,' | Tokenized: ',tokenizedquestion)
      if questionsolutions and qi<len(questionsolutions) :
        print('Solution ID:',questionsolutions[qi])

    # rank_bm25 BM25 search ( takes the tokenized question )
    # perf_counter() is the monotonic, high resolution clock meant for measuring short durations
    dt1 = time.perf_counter()
    doc_scores = rank_bm25_index.get_scores( tokenizedquestion )
    t1 += (time.perf_counter()-dt1)
    bres = [ [i,s] for i,s in enumerate(doc_scores) ]
    bres.sort(key=lambda x:x[1],reverse=True)
    bres = bres[:k]

    # BM25opt BM25 search ( takes the raw question string )
    dt2 = time.perf_counter()
    doc_scores2 = bm25opt_index.get_scores( q )
    t2 += (time.perf_counter()-dt2)
    bres2 = [ [i,s] for i,s in enumerate(doc_scores2) ]
    bres2.sort(key=lambda x:x[1],reverse=True)
    bres2 = bres2[:k]

    # Print rank_bm25, BM25opt results
    if qi % 100 == 0 :
      # a corpus smaller than k yields fewer than k results: don't index past the end
      for i in range(min(k,len(bres),len(bres2))):
        # document IDs are printed 1-based, scores are truncated to 6 decimals
        print( '|rank_bm25|', bres[i][0]+1, '|',  math.floor(bres[i][1]*1e6 )/1e6, '|', corpus[bres[i][0] ] )
        print( '|BM25opt  |', bres2[i][0]+1, '|', math.floor(bres2[i][1]*1e6)/1e6, '|', corpus[bres2[i][0]],'\n' )

# t2 stays 0 when no question was run: report nan instead of raising ZeroDivisionError
speedup = t1/t2 if t2 > 0 else float('nan')
print('rank_bm25 time:',t1,' | BM25opt time:',t2,' | BM25opt is',speedup,'times faster.')

--2024-10-24 09:33:33--  https://raw.githubusercontent.com/jankovicsandras/bm25opt/refs/heads/main/BM25opt.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6279 (6.1K) [text/plain]
Saving to: ‘BM25opt.py.2’


2024-10-24 09:33:34 (70.1 MB/s) - ‘BM25opt.py.2’ saved [6279/6279]


----Question 0 : is_home doesn't affect content  | Tokenized:  ['is_home', "doesn't", 'affect', 'content']
Solution ID: 68
|rank_bm25| 68 | 8.79234 | I have a bunch of html i need to show only if it is a homepage. My theme pulls content from AJAX and then changes the URL to reflect the new content change. in the meantime, the homepage content (the entirety of which is contained in the footer) still remains below, what i need to happen is the footer content should still be there but t