<a href="https://colab.research.google.com/github/jankovicsandras/plpgsql_bm25/blob/main/plpgsql_bm25_comparison_with_paradedb_pg_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## installing PostgreSQL

In [None]:
! sudo apt install gnupg2 wget nano
! sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
! curl -fsSL https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo gpg --dearmor -o /etc/apt/trusted.gpg.d/postgresql.gpg
! sudo apt update
! sudo apt install postgresql-16 postgresql-contrib-16 postgresql-server-dev-16
! service postgresql start
! sudo -u postgres psql -c "CREATE USER root WITH SUPERUSER"

## installing ParadeDB pg_search

In [None]:
! export PARADEDB_TELEMETRY=false
! lsb_release -a
! sudo apt-get install -y libicu70

# https://github.com/paradedb/paradedb/releases
! wget https://github.com/paradedb/paradedb/releases/download/v0.10.2/postgresql-16-pg-search_0.10.2-1PARADEDB-jammy_amd64.deb
! dpkg -i postgresql-16-pg-search_0.10.2-1PARADEDB-jammy_amd64.deb

# https://docs.paradedb.com/deploy/pg_search
# https://github.com/paradedb/paradedb/tree/dev/pg_search

! echo "shared_preload_libraries = 'pg_search'" >> /etc/postgresql/16/main/postgresql.conf
! service postgresql stop
! sleep 10
! service postgresql start
! sudo -u postgres psql -c "CREATE EXTENSION pg_search;"

## init psycopg2 helper

In [3]:
! pip install psycopg2

import psycopg2


def msq( t, verbose=False ) :
  """
  Run one SQL statement against the local "postgres" database and return its rows.

  Parameters:
    t       : SQL text, executed as-is
    verbose : when True, print the column names and every row, cells separated by '|'

  Returns:
    list of row tuples; an empty list for statements without a result set
    (DROP / CREATE / COPY ...) and also when the statement failed.

  Errors raised while executing or fetching are printed and swallowed, so a failing
  query does not stop the notebook. A failing connect() still raises.
  """
  res = []
  conn = psycopg2.connect("dbname=postgres user=root")
  try :
    # "with conn" only commits / rolls back the transaction, it does not close the connection
    with conn :
      with conn.cursor() as cur :
        try :
          cur.execute(t)
          # description is None when the statement produced no result set;
          # calling fetchall() then would raise "no results to fetch"
          if cur.description is not None :
            res = cur.fetchall()
            if verbose :
              for cdesc in cur.description :
                print(cdesc[0],'|',end='')
              print('')
              for r in res:
                for c in r:
                  print(c,'|',end='')
                print('')
        except Exception as ex :
          print('!!!! msq() ERROR ',ex,'|',t,'|')
  finally :
    conn.close()
  return res


def iscorrectintopk(correct, res) :
  """
  Grade a result list against the expected document text.

  Every cell of every row is compared as a string, with double quotes turned into
  single quotes on both sides before comparing.

  Returns:
    2 when the expected text is found in the first row,
    1 when it is found in any later row,
    0 when it is not found (or res is empty).
  """
  def normalized(value) :
    return str(value).replace('"','\'')

  wanted = correct.replace('"','\'')
  for rank, row in enumerate(res) :
    if any( normalized(cell) == wanted for cell in row ) :
      return 2 if rank == 0 else 1
  return 0




## Test dataset: WordPress-related QA from Hugging Face

In [None]:

! wget https://huggingface.co/datasets/mteb/cqadupstack-wordpress/resolve/main/corpus.jsonl
! wget https://huggingface.co/datasets/mteb/cqadupstack-wordpress/resolve/main/queries.jsonl
! ls -la

import random, json

samplenum = 100

# load from jsonl
wcorpus = []
with open('corpus.jsonl') as f:
  wcstr = f.read()
  wcorpus = wcstr.split('\n')
print('len(wcorpus)',len(wcorpus))

# create sampled corpus, items, questions
sampledwcorpus = random.sample( wcorpus, samplenum )
items = []
qqs = []
for i in range(0,len(sampledwcorpus)) :
  wjs = json.loads(sampledwcorpus[i])
  #print(i,'---------------',wjs['_id'])
  #print(len(wjs['title']),wjs['title'])
  #print(len(wjs['text']),wjs['text'])
  items.append( { 'doctext': wjs['text'] } )
  qqs.append( [ wjs['title'], items[-1]['doctext'] ] )

# questions and solutions
random.shuffle(qqs)
questions = [ q[0] for q in qqs ]
questionsolutions = [ q[1] for q in qqs ]

# export to CSV for Postgres
import os

csvfilename = 'items.csv'
with open(csvfilename,'w+') as f:
  f.write('id;doctext\n')
  for i,item in enumerate(items, start=1) :
    # double quotes are replaced because they delimit the CSV field;
    # iscorrectintopk() applies the same replacement before comparing
    f.write(str(i)+';"'+item['doctext'].replace('"','\'')+'"\n')


# Postgres creating items table by importing from CSV
msq('DROP TABLE IF EXISTS items CASCADE;')
msq('CREATE TABLE items (id SERIAL, doctext TEXT);')
# COPY ... FROM is read by the server and needs an absolute path; derive it from the file that was
# just written instead of assuming the working directory is /content
csvabspath = os.path.abspath(csvfilename).replace("'","''")
msq('COPY items FROM \''+csvabspath+'\' DELIMITER \';\' CSV HEADER;')
#msq('SELECT * from items;')


## mybm25okapi: a refactored variant of rank_bm25 Okapi

In [5]:
"""
This is a refactored variant of rank_bm25 Okapi. https://github.com/dorianbrown/rank_bm25
This is not required for plpgsql_bm25, just here for comparison / testing.

Usage:
  - corpus and query must be tokenized already, e.g. corpus = [ ['one','two','three'], ['bla','two','two'] ]  ; query = [ 'Is', 'this', 'a', 'question?' ] ; see also mytokenize()
  - __init__(corpus) will initialize the bm25Okapi components, where self.wsmap is the most important
  - No update is possible, so if the documents change in the corpus, then __init__(corpus) must be called again (recreating all the components).
  - search with topk() or get_scores()
"""
import math


class mybm25okapi:
  """
  BM25 Okapi index over an already tokenized corpus (refactored variant of rank_bm25).

  The index is built once in __init__; it cannot be updated, so a changed corpus
  needs a new instance.

  Attributes set by __init__:
    k1, b, epsilon : BM25 parameters
    corpus_len     : number of documents
    doc_lens       : token count of every document
    avg_doc_len    : mean token count (0 for an empty corpus)
    word_freqs     : per document, dict word -> occurrences in that document
    idf            : dict word -> idf (negative idfs replaced by epsilon * average_idf)
    average_idf    : mean of the raw idf values (0 when the corpus has no words)
    hds            : per document, k1 * (1 - b + b * doc_len / avg_doc_len)
    wsmap          : dict word -> list with that word's score in every document
  """

  def __init__(self, corpus, k1=1.5, b=0.75, epsilon=0.25, debugmode=False):
    """
    Build all BM25 components for corpus.

    Parameters:
      corpus    : list of documents, each document a list of tokens,
                  e.g. [ ['one','two','three'], ['bla','two','two'] ]
      k1, b     : BM25 term-frequency saturation and length-normalization parameters
      epsilon   : factor applied to the average idf to get the value used for negative idfs
      debugmode : when True, print the intermediate results while building
    """
    # parameters
    self.debugmode = debugmode
    self.k1 = k1
    self.b = b
    self.epsilon = epsilon

    self.corpus_len = len(corpus)
    self.avg_doc_len = 0
    self.word_freqs = []
    self.idf = {}
    self.doc_lens = []
    word_docs_count = {}  # word -> number of documents with word
    total_word_count = 0

    for document in corpus:
      # doc lengths and total word count
      self.doc_lens.append(len(document))
      total_word_count += len(document)

      # word frequencies in this document
      frequencies = {}
      for word in document:
        frequencies[word] = frequencies.get(word, 0) + 1
      self.word_freqs.append(frequencies)

      # every distinct word of this document counts once towards its document count
      for word in frequencies:
        word_docs_count[word] = word_docs_count.get(word, 0) + 1

    # average document length; stays 0 for an empty corpus instead of dividing by zero
    if self.corpus_len :
      self.avg_doc_len = total_word_count / self.corpus_len

    self._debug('self.corpus_len',self.corpus_len,'\nself.doc_lens',self.doc_lens,'\ntotal_word_count',total_word_count,'\nself.word_freqs',self.word_freqs,'\nself.avg_doc_len',self.avg_doc_len,'\nword_docs_count',word_docs_count)

    # precalc "half of divisor" + self.k1 * (1 - self.b + self.b * doc_lens / self.avg_doc_len)
    if self.avg_doc_len :
      self.hds = [ self.k1 * ( 1-self.b + self.b*doc_len/self.avg_doc_len) for doc_len in self.doc_lens ]
    else :
      # every document is empty: the length ratio is undefined and no word will ever use it
      self.hds = [ self.k1 * ( 1-self.b ) for doc_len in self.doc_lens ]
    self._debug('self.hds',self.hds)

    # idf of every word, plus the sum needed for the average idf.
    # idf is negative when a word is contained in more than half of the documents;
    # those words are collected and get epsilon * average_idf instead.
    idf_sum = 0
    negative_idfs = []
    for word, freq in word_docs_count.items():
      idf = math.log(self.corpus_len - freq + 0.5) - math.log(freq + 0.5)
      self.idf[word] = idf
      idf_sum += idf
      if idf < 0:
        negative_idfs.append(word)
      self._debug('word',word,'self.idf[word]',self.idf[word])
    # no words at all (empty corpus or only empty documents): nothing to average
    self.average_idf = idf_sum / len(self.idf) if self.idf else 0
    self._debug('idf_sum',idf_sum,'len(self.idf)',len(self.idf),'self.average_idf',self.average_idf)
    # assign epsilon
    eps = self.epsilon * self.average_idf
    self._debug('eps',eps)
    for word in negative_idfs:
      self.idf[word] = eps
      self._debug('word',word,'got eps',eps)
    self._debug('self.idf',self.idf)

    # words * documents score map
    self.wsmap = {}
    for word in self.idf :
      word_freqs = [ (word_freq.get(word) or 0) for word_freq in self.word_freqs ]
      thiswordidf = (self.idf.get(word) or 0)
      self._debug('word in self.idf',word,'thiswordidf',thiswordidf,'word_freqs',word_freqs)
      wordscores = [0] * self.corpus_len
      for i in range(0,self.corpus_len) :
        divisor = word_freqs[i] + self.hds[i]
        # divisor is only 0 when the word is absent and hds[i] is 0 (e.g. k1=0); the score is 0 then
        wordscores[i] = thiswordidf * ( word_freqs[i] * (self.k1 + 1) / divisor ) if divisor else 0.0
      self.wsmap[word] = wordscores
    self._debug('self.wsmap',self.wsmap)


  # print only in debug mode
  def _debug(self, *parts):
    if self.debugmode : print(*parts)


  # get a list of scores for every document
  def get_scores(self, tokenizedquery):
    """
    Score every document against tokenizedquery (a list of tokens).

    Returns a list with one score per document, in corpus order (not sorted).
    Query words that are not in the index contribute nothing.
    """
    # zeroes list of scores
    scores = [0] * self.corpus_len
    # for each word in tokenizedquery, if word is in wsmap, lookup and add word score for every documents' scores
    for word in tokenizedquery:
      wordscores = self.wsmap.get(word)
      if wordscores is not None :
        for i in range(0,self.corpus_len) :
          scores[i] += wordscores[i]
    # return scores list (not sorted)
    return scores


  def topk(self,tokenizedquery,k=None):
    """
    Return [document index, score] pairs sorted by descending score.

    Only the first k pairs are returned when k is given; k=None (or 0) returns all.
    Documents with equal scores keep their corpus order.
    """
    docscores = self.get_scores( tokenizedquery )
    sisc = [ [i,s] for i,s in enumerate(docscores) ]
    sisc.sort(key=lambda x:x[1],reverse=True)
    if k :
      sisc = sisc[:k]
    return sisc


  # save the words*documents score map as csv for import to Postgres: COPY tablename_bm25wsmap FROM '/path-to/tablename_bm25wsmap.csv' DELIMITER ';' CSV HEADER;
  def exportwsmap(self, csvfilename) :
    """
    Write wsmap to csvfilename: a 'word;vl' header, then one line per word in the form
    "word";{score, score, ...}. Double quotes inside a word are replaced by single quotes.
    """
    with open(csvfilename,'w+') as f:
      f.write('word;vl\n')
      for word in self.wsmap :
        f.write('"'+word.replace('"','\'')+'";{'+str(self.wsmap[word]).strip()[1:-1]+'}\n')




# tokenization function
def mytokenize(s) :
  """
  Tokenize a text for BM25: lower-case it, split it on whitespace, then peel opening
  brackets / quotes off the front and punctuation, closing brackets / quotes off the
  back of every token. Tokens that end up empty are dropped.

  Anything that is not exactly a str yields an empty list.
  """
  if type(s) != str : return []
  leading = '([{<\'"'
  trailing = '.?!,:;)]}>\'"'
  peeled = ( token.lstrip(leading).rstrip(trailing) for token in s.lower().split() )
  return [ token for token in peeled if token ]



## plpgsql_bm25

In [None]:
# loading https://github.com/jankovicsandras/plpgsql_bm25

! wget https://raw.githubusercontent.com/jankovicsandras/plpgsql_bm25/refs/heads/main/plpgsql_bm25.sql

# execute to load the functions
! psql postgresql://@/postgres -f /content/plpgsql_bm25.sql


## running the comparison of plpgsql_bm25, ParadeDB pg_search, rank_bm25, mybm25okapi

In [14]:

# comparative testing of plpgsql_bm25.sql, mybm25okapi and rank_bm25 BM25Okapi
# the order of results and the bm25 scores should be equal

! pip install rank_bm25

from rank_bm25 import BM25Okapi
import json, subprocess, time
import pandas as pd

# table and file names
tablename = 'items'
columnname = 'doctext'
tablename_bm25wsmap = tablename+'_bm25wsmap'
csvfilepath = '/content/'+tablename_bm25wsmap+'.csv'
k = 5
stats = {
  'plpgsql_bm25_correct':[],
  'paradedbbm25_correct':[],
  'rank_bm25_correct':[],
  'mybm25_correct':[],
  'plpgsql_bm25_speed':[],
  'paradedbbm25_speed':[],
  'rank_bm25_speed':[],
  'mybm25_speed':[],
}
verbose = False

# preparing tokenized corpus
tokenized_corpus = [ mytokenize(item[columnname]) for item in items ]

# rank_bm25 and mybm25okapi
rank_bm25_index = BM25Okapi(tokenized_corpus)
mybm25_index = mybm25okapi(tokenized_corpus)

# Postgres has already "items" table, tokenization is built-in
msq( 'SELECT bm25createindex(\''+tablename+'\',\''+columnname+'\');', verbose )

# ParadeDB pg_search
! psql postgresql://@/postgres -c "DROP SCHEMA IF EXISTS paradedbbm25 CASCADE;"
! psql postgresql://@/postgres -c "CALL paradedb.create_bm25( index_name => 'paradedbbm25', schema_name => 'public', table_name => 'items', key_field => 'id', text_fields => paradedb.field('doctext') );"
paradedbcharfilter = '\'"():-'

# Running the questions
for qi,q in enumerate(questions) :
  # Every question is sent to all four engines. Per engine, the elapsed time and the
  # iscorrectintopk() grade (2 = expected document first, 1 = within the top k, 0 = missing)
  # are appended to stats.

  # tokenize and print question
  tokenizedquestion = mytokenize(q)
  if verbose :
    print('\n----Question',qi,':',q,' | Tokenized: ',tokenizedquestion)
    if questionsolutions and qi<len(questionsolutions) : print('Solution:',questionsolutions[qi])

  # plpgsql_bm25 BM25 search
  if verbose : print('plpgsql_bm25|')
  starttime = time.time()
  # single quotes in the question are doubled so that it stays one SQL string literal
  mres = msq( 'SELECT * FROM bm25topk( \''+tablename+'\', \''+columnname+'\', \''+q.replace("'","''")+'\', '+str(k)+' );', verbose )
  stats['plpgsql_bm25_speed'].append( (time.time()-starttime) )
  stats['plpgsql_bm25_correct'].append( iscorrectintopk(questionsolutions[qi], mres) )


  # ParadeDB BM25 search
  if verbose : print('paradedbbm25|')
  # preparing query TODO
  # each question word is stripped of the filtered characters; the surviving words are OR-ed
  pwords = [ ''.join( c for c in w if c not in paradedbcharfilter ) for w in tokenizedquestion ]
  ptokq = ' OR '.join( 'doctext:'+w2 for w2 in pwords if len(w2) > 0 )
  # limit_rows follows k, so that all engines are graded on the same number of results
  pcmdstr = 'SELECT id, doctext FROM paradedbbm25.search( \''+ptokq+'\', limit_rows => '+str(k)+' );'
  # executing query
  starttime = time.time()
  mres = msq( pcmdstr, verbose )
  stats['paradedbbm25_speed'].append( (time.time()-starttime) )
  stats['paradedbbm25_correct'].append( iscorrectintopk(questionsolutions[qi], mres) )

  # rank_bm25 BM25 search
  starttime = time.time()
  doc_scores = rank_bm25_index.get_scores( tokenizedquestion )
  stats['rank_bm25_speed'].append( (time.time()-starttime) )
  bres = [ [i,s] for i,s in enumerate(doc_scores) ]
  bres.sort(key=lambda x:x[1],reverse=True)
  bres = bres[:k]
  for i in range(0,len(bres)) :
    # attach the document text, then turn the 0-based corpus index into the 1-based table id
    bres[i].append( items[ bres[i][0] ][columnname] )
    bres[i][0] += 1
  stats['rank_bm25_correct'].append( iscorrectintopk(questionsolutions[qi], bres) )
  if verbose :
    print('rank_bm25  |')
    for br in bres  : print( br[0], '|', br[1], '|', br[2] )

  # mybm25okapi BM25 search
  starttime = time.time()
  bres2 = mybm25_index.topk( tokenizedquestion, k )
  stats['mybm25_speed'].append( (time.time()-starttime) )
  for i in range(0,len(bres2)) :
    bres2[i].append( items[ bres2[i][0] ][columnname] )
    bres2[i][0] += 1
  stats['mybm25_correct'].append( iscorrectintopk(questionsolutions[qi], bres2) )
  if verbose :
    print('mybm25okapi|')
    for br in bres2 : print( br[0], '|', br[1], '|', br[2] )

  # print stats
  print(qi,'corrects',stats['plpgsql_bm25_correct'][-1],stats['paradedbbm25_correct'][-1],stats['rank_bm25_correct'][-1],stats['mybm25_correct'][-1],'speeds',stats['plpgsql_bm25_speed'][-1],stats['paradedbbm25_speed'][-1],stats['rank_bm25_speed'][-1],stats['mybm25_speed'][-1])


# totals per engine: summed correctness grades and summed query times over all questions
stats2 = { statname+' total' : [ sum(stats[statname]) ] for statname in stats }

if verbose :
  # per-question table; only the last expression of a cell is rendered automatically,
  # so inside the if-block it has to be displayed explicitly
  display( pd.DataFrame(stats).style )

df = pd.DataFrame(stats2)
df.style


NOTICE:  schema "paradedbbm25" does not exist, skipping
DROP SCHEMA
CALL
0 corrects 2 2 2 2 speeds 0.05577516555786133 0.042365074157714844 0.0008399486541748047 0.00026702880859375
1 corrects 2 2 2 2 speeds 0.06937026977539062 0.045545339584350586 0.001058816909790039 0.0003635883331298828
2 corrects 0 1 0 0 speeds 0.06086874008178711 0.04171395301818848 0.0005342960357666016 0.0001575946807861328
3 corrects 2 2 2 2 speeds 0.06114816665649414 0.05587053298950195 0.0007109642028808594 0.0002200603485107422
4 corrects 1 1 1 1 speeds 0.06101727485656738 0.04585742950439453 0.0007686614990234375 0.00021958351135253906
5 corrects 2 2 2 2 speeds 0.07250547409057617 0.05018949508666992 0.0006940364837646484 0.0002536773681640625
6 corrects 0 2 0 0 speeds 0.06806111335754395 0.043276071548461914 0.0004277229309082031 0.0001125335693359375
7 corrects 2 2 2 2 speeds 0.0596013069152832 0.04267692565917969 0.0008728504180908203 0.00025010108947753906
8 corrects 2 2 2 2 speeds 0.06173419952392578 

Unnamed: 0,plpgsql_bm25_correct total,paradedbbm25_correct total,rank_bm25_correct total,mybm25_correct total,plpgsql_bm25_speed total,paradedbbm25_speed total,rank_bm25_speed total,mybm25_speed total
0,175,184,175,175,8.02774,6.151216,0.082607,0.03326
