# BM25

In [39]:
import os
import sys
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [42]:
def word_cloud(text: list):
  """text: list of text documents/sentences"""
  text = ' '.join(str(text).lower())
  wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
  plt.figure(figsize=(10, 5))
  plt.imshow(wordcloud, interpolation='bilinear')
  plt.axis('off')
  plt.show()


In [5]:
data = [
    "The sun set over the horizon, casting a warm glow across the fields.",
    "She carefully placed the delicate vase on the table, admiring its intricate design.",
    "The sound of the rain tapping on the window created a soothing atmosphere.",
    "He opened the old book and was immediately transported to a different time and place.",
    "The children laughed and played in the park, their joy infectious to everyone around.",
]
# 
data

['The sun set over the horizon, casting a warm glow across the fields.',
 'She carefully placed the delicate vase on the table, admiring its intricate design.',
 'The sound of the rain tapping on the window created a soothing atmosphere.',
 'He opened the old book and was immediately transported to a different time and place.',
 'The children laughed and played in the park, their joy infectious to everyone around.']

In [41]:
' '.join(data)

'The sun set over the horizon, casting a warm glow across the fields. She carefully placed the delicate vase on the table, admiring its intricate design. The sound of the rain tapping on the window created a soothing atmosphere. He opened the old book and was immediately transported to a different time and place. The children laughed and played in the park, their joy infectious to everyone around.'

In [8]:
min([len(i.split(' ')) for i in data])

13

In [9]:
max([len(i.split(' ')) for i in data])

15

In [20]:
class BM25():
  def __init__(self, data, freq_sat=0.2, len_norm=2) -> None:
    self.data = data
    self.freq_sat = freq_sat
    self.len_norm = len_norm
    self.normalise_data()
    
  def normalise_data(self): self.data = [ str(doc).lower() for doc in self.data ]

  def TF(self, t, d):
    f = self.freq_term_document(t, d)
    n = f * self.freq_saturation()
    dd = f + (self.freq_sat * ( 1 - self.len_norm + self.len_norm * ( self.len_doc(d) / self.avg_doc_length() )))
    return n / dd
  
  def freq_saturation(self,): return self.freq_sat + 1
  
  def len_doc(self, d): return len( d.split(" ") )
  
  def avg_doc_length(self,): 
    return sum( [ len(d.split(" ")) for d in self.data ] ) / self.__len__()
  
  def freq_term_document(self, t, d): return d.split(" ").count(t)
  
  def IDF(self, t):
    n = self.__len__() - self.numb_doc_with_term(t) + .5
    d = self.numb_doc_with_term(t) + .5
    y = np.log((n/d)+1)
    return y

  def bm25(self):
    """ variant, modified to fit my use case """
    res = []
    for doc in self.data: 
      x = [ self.TF(t, doc) * self.IDF(t) for t in doc.split(" ") ]
      res.append(x)
    return res

  def numb_doc_with_term(self,t):
    c = 0;
    for doc in self.data: 
      if t in doc: c += 1
    return c
  
  def len_document(self, document): 
    return len(document.split(" "))
  
  def __len__(self): return len(self.data)

In [38]:
bm23 = BM25(data)
x = bm23.bm25()

for i in x:
  print(i)

[0.09843061500490456, 1.4069853217336203, 1.4069853217336203, 1.4069853217336203, 0.09843061500490456, 1.4069853217336203, 1.4069853217336203, 0.08831005425813164, 1.4069853217336203, 1.4069853217336203, 1.4069853217336203, 0.09843061500490456, 1.4069853217336203]
[1.4069853217336203, 1.4069853217336203, 1.4069853217336203, 0.09568906148724775, 1.4069853217336203, 1.4069853217336203, 0.2919758347271806, 0.09568906148724775, 1.4069853217336203, 1.4069853217336203, 1.4069853217336203, 1.4069853217336203, 1.4069853217336203]
[0.09843061500490456, 1.4069853217336203, 1.4069853217336203, 0.09843061500490456, 1.4069853217336203, 1.4069853217336203, 0.2919758347271806, 0.09843061500490456, 1.4069853217336203, 1.4069853217336203, 0.08831005425813164, 1.4069853217336203, 1.4069853217336203]
[0.08412474362978416, 1.340303552931079, 0.08412474362978416, 1.340303552931079, 1.340303552931079, 0.9375098289774044, 1.340303552931079, 1.340303552931079, 1.340303552931079, 0.8464247508066141, 0.08412474