# BM25

In [14]:
import os
import sys
import math
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [2]:
def word_cloud(text: list):
  """Draw a word cloud for a collection of documents.

  text: list of text documents/sentences. A single string is also accepted
        and treated as one document.

  Each document is converted with str(), lower-cased, and the documents are
  joined with single spaces before being handed to WordCloud. The figure is
  shown with matplotlib; nothing is returned.
  """
  # A bare string would otherwise be iterated character by character.
  if isinstance(text, str):
    text = [text]
  # Join the documents themselves. Calling str() on the whole list and joining
  # the result would interleave spaces between every single character.
  corpus = ' '.join(str(doc).lower() for doc in text)
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(corpus)
  plt.figure(figsize=(10, 5))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis('off')
  plt.show()


In [3]:
# Toy corpus: five English sentences used as the documents in the cells below.
data = [
    "The sun set over the horizon, casting a warm glow across the fields.",
    "She carefully placed the delicate vase on the table, admiring its intricate design.",
    "The sound of the rain tapping on the window created a soothing atmosphere.",
    "He opened the old book and was immediately transported to a different time and place.",
    "The children laughed and played in the park, their joy infectious to everyone around.",
]

# Echo the corpus as the cell output.
data

['The sun set over the horizon, casting a warm glow across the fields.',
 'She carefully placed the delicate vase on the table, admiring its intricate design.',
 'The sound of the rain tapping on the window created a soothing atmosphere.',
 'He opened the old book and was immediately transported to a different time and place.',
 'The children laughed and played in the park, their joy infectious to everyone around.']

In [65]:
# Token count of every document (split on single spaces), computed once.
doc_lengths = [len(doc.split(' ')) for doc in data]

mi = min(doc_lengths)                        # shortest document
mx = max(doc_lengths)                        # longest document
avg = round(sum(doc_lengths) / len(data))    # mean length, rounded to an int
avg_max = math.ceil((avg + mx) / 2)          # midpoint of avg and max, rounded up
mi, mx, avg, avg_max

(13, 15, 14, 15)

In [57]:
# Whitespace-token count of each raw document.
print([len(doc.split()) for doc in data])
tl = []
tlen = avg
print("tl", tlen)
sp = "aaaa"  # filler token appended to documents shorter than tlen
for d in data:
  # Collapse whitespace runs, lower-case, then split into tokens.
  d = ' '.join(d.split()).lower()
  d = d.split(' ')
  n_tokens = len(d)
  if n_tokens >= tlen:
    # Long enough: keep only the first tlen tokens.
    d = d[:tlen]
    print("g", len(d))
  else:
    # Too short: pad with the filler token up to tlen.
    print("l", n_tokens)
    d += [sp] * (tlen - n_tokens)
  tl.append(d)

# print(tl)
list(map(len, tl))

[13, 13, 13, 15, 14]
tl 14
l 13
l 13
l 13
g 14
g 14


[14, 14, 14, 14, 14]

1.1920929e-07

In [85]:
class BM25():
  """
  Per-token BM25-style weighting of a list of documents.

  data: list of documents; each one is converted with str() and lower-cased
  freq_sat - frequency saturation : float
      (takes the place usually written k1 in the BM25 term-frequency formula)
  len_norm - length normalisation : float
      (takes the place usually written b; BM25 descriptions normally keep it
      in [0, 1] - the default of 2 is kept only for backward compatibility)
  tokens_len - avg | min | max | avg_max [mean of avg and max]
      number of weights kept per document so that every document has the same
      length; any other value falls back to 'avg'

  Documents are tokenised by splitting on single spaces, so punctuation stays
  attached to words and consecutive spaces produce empty tokens.
  """
  def __init__(self, data: list, freq_sat=0.2, len_norm=2, tokens_len='avg'):
    self.data = data
    self.freq_sat = freq_sat
    self.len_norm = len_norm
    self.tokens_len = tokens_len.lower()
    # used for filling short documents to make documents have equal lengths
    self.special_token: float = np.finfo(np.float32).eps
    self.normalise_data()

  def normalise_data(self):
    """Replace self.data with the lower-cased str() of every document."""
    self.data = [str(doc).lower() for doc in self.data]

  @staticmethod
  def _tokens(d):
    """Split a document into tokens on single spaces (the one tokeniser used everywhere)."""
    return d.split(" ")

  def _tf(self, f, doc_len, avg_len):
    """Saturated, length-normalised term frequency from precomputed counts."""
    if f == 0:
      # An absent term scores 0; this also avoids 0/0 when the length
      # normalisation term happens to cancel out.
      return 0.0
    n = f * self.freq_saturation()
    dd = f + (self.freq_sat * (1 - self.len_norm + self.len_norm * (doc_len / avg_len)))
    return n / dd

  def _idf(self, df):
    """Inverse document frequency for a term that occurs in `df` documents."""
    n = len(self) - df + .5
    d = df + .5
    return np.log((n / d) + 1)

  def TF(self, t, d):
    """Term-frequency component for term `t` in document string `d`."""
    return self._tf(self.freq_term_document(t, d), self.len_doc(d), self.avg_doc_length())

  def freq_saturation(self,):
    """Return freq_sat + 1, the numerator factor of the TF formula."""
    return self.freq_sat + 1

  def len_doc(self, d):
    """Number of tokens in document string `d`."""
    return len(self._tokens(d))

  def avg_doc_length(self,):
    """Mean number of tokens per document (raises ZeroDivisionError on an empty corpus)."""
    return sum(len(self._tokens(d)) for d in self.data) / len(self)

  def freq_term_document(self, t, d):
    """How many times token `t` occurs in document string `d`."""
    return self._tokens(d).count(t)

  def IDF(self, t):
    """Inverse document frequency of term `t` over the corpus."""
    return self._idf(self.numb_doc_with_term(t))

  def transform(self):
    """ variant, modified to fit my use case

    Returns one list per document holding TF * IDF for each token in order,
    padded with self.special_token or truncated to the length selected by
    tokens_len. An empty corpus yields an empty list.
    """
    if not self.data:
      return []

    docs = [self._tokens(doc) for doc in self.data]
    # Corpus-wide quantities are computed once instead of once per token.
    avg_len = sum(len(toks) for toks in docs) / len(docs)
    doc_freq = {}
    for toks in docs:
      for t in set(toks):
        doc_freq[t] = doc_freq.get(t, 0) + 1
    idf = {t: self._idf(df) for t, df in doc_freq.items()}

    res = []
    for toks in docs:
      counts = {}
      for t in toks:
        counts[t] = counts.get(t, 0) + 1
      res.append([self._tf(counts[t], len(toks), avg_len) * idf[t] for t in toks])
    return self._token_length(res)

  def numb_doc_with_term(self, t):
    """Number of documents that contain `t` as a whole token."""
    # Compare against tokens, not the raw string: a substring test would count
    # "a" in every document that merely contains the letter "a".
    return sum(1 for doc in self.data if t in self._tokens(doc))

  def len_document(self, d):
    """Number of tokens in document string `d` (same as len_doc)."""
    return len(self._tokens(d))

  def __len__(self):
    """Number of documents in the corpus."""
    return len(self.data)

  def _token_len_min(self):
    """Token count of the shortest document."""
    return min(len(self._tokens(d)) for d in self.data)

  def _token_len_max(self):
    """Token count of the longest document."""
    return max(len(self._tokens(d)) for d in self.data)

  def _token_len_avg(self):
    """Mean token count per document, rounded to an int with round()."""
    return round(sum(len(self._tokens(d)) for d in self.data) / len(self.data))

  def _vocabulary(self):
    '''
    Not implemented yet. Intended to:
    - store unique words from the entire corpus
    - store the vocabulary keyed by BM25 value, so a BM25 number can be mapped back to its word
    '''
    pass

  def _vocabulary_length(self):
    '''get the number of unique whitespace-separated words in the data corpus'''
    x = ' '.join(self.data)
    x = ' '.join(x.split()).lower().split(' ')
    return len(set(x))

  def _token_length(self, embs):
    """Pad or truncate every row of `embs` to the length selected by tokens_len.

    Short rows are filled with self.special_token. The input rows are left
    untouched; new lists are returned.
    """
    if self.tokens_len == 'min': tlen = self._token_len_min()
    elif self.tokens_len == 'max': tlen = self._token_len_max()
    elif self.tokens_len == 'avg_max': tlen = math.ceil((self._token_len_max() + self._token_len_avg()) / 2)
    else: tlen = self._token_len_avg()

    tl = []
    for row in embs:
      fixed = list(row[:tlen])
      fixed.extend([self.special_token] * (tlen - len(fixed)))
      tl.append(fixed)
    return tl

In [86]:
# Weight the toy corpus with the default settings.
bm23 = BM25(data)
x = bm23.transform()

# for i in x: print(i)
# Every row should come back with the same number of weights.
list(map(len, x))

[14, 14, 14, 14, 14]