<a href="https://colab.research.google.com/github/juliano-soares/2017-primeiro-semestre/blob/master/BM25.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import math
import os
import string

from collections import Counter
from typing import List, Tuple


class BM25:
    """Okapi BM25 ranking over an in-memory list of documents.

    Documents and queries are tokenised by whitespace (``str.split``); any
    normalisation such as lowercasing or punctuation removal must be applied
    by the caller beforehand.

    Attributes:
        documents: the raw document strings, in the order they were given.
        total_docs: number of documents in the corpus.
        avg_doc_len: mean document length measured in tokens (0.0 for an
            empty corpus).
        k1: term-frequency saturation parameter.
        b: document-length normalisation parameter.
        idf: mapping of term -> inverse document frequency.
        document_scores: ``(index, score)`` pairs from the latest ``rank``
            call, sorted by descending score.
    """

    def __init__(self, documents: List[str], k1: float = 1.2, b: float = 0.75):
        """Index ``documents`` and precompute the corpus statistics.

        Args:
            documents: one string per document.
            k1: term-frequency saturation parameter (default 1.2).
            b: length-normalisation parameter (default 0.75).
        """
        self.documents = documents
        self.total_docs = len(documents)
        # Tokenise once: term frequencies and lengths must be counted on
        # whole tokens, not on characters or substrings of the raw text.
        self._term_counts = [Counter(doc.split()) for doc in documents]
        self._doc_lens = [sum(counts.values()) for counts in self._term_counts]
        # Guard the empty corpus so construction does not divide by zero.
        self.avg_doc_len = (
            sum(self._doc_lens) / self.total_docs if self.total_docs else 0.0
        )
        self.k1 = k1
        self.b = b
        self.idf = self.compute_idf()
        self.document_scores = []

    def compute_idf(self) -> dict:
        """Return ``{term: idf}`` for every term that occurs in the corpus.

        Uses ``log((N - n + 0.5) / (n + 0.5))`` where ``N`` is the number of
        documents and ``n`` the number of documents containing the term.
        The value is negative for terms present in more than half of the
        documents.
        """
        doc_freq = Counter()
        for counts in self._term_counts:
            # Iterating a Counter yields each distinct term exactly once,
            # so every document contributes at most 1 per term.
            doc_freq.update(counts.keys())
        return {
            word: math.log((self.total_docs - freq + 0.5) / (freq + 0.5))
            for word, freq in doc_freq.items()
        }

    def score(self, query: str, document_index: int) -> float:
        """Return the BM25 score of one document for ``query``.

        Each distinct query term contributes once, however many times it is
        repeated in the query. Terms absent from the corpus or from the
        document contribute nothing.

        Args:
            query: whitespace-separated query terms.
            document_index: position of the document in ``self.documents``.

        Raises:
            IndexError: if ``document_index`` is out of range.
        """
        term_counts = self._term_counts[document_index]
        if self.avg_doc_len:
            length_ratio = self._doc_lens[document_index] / self.avg_doc_len
        else:
            length_ratio = 0.0
        # The length normalisation is the same for every term of a document.
        length_norm = self.k1 * (1 - self.b + self.b * length_ratio)

        score = 0.0
        # dict.fromkeys de-duplicates while keeping the query's term order.
        for word in dict.fromkeys(query.split()):
            idf = self.idf.get(word)
            term_frequency = term_counts.get(word, 0)
            if idf is None or term_frequency == 0:
                continue
            numerator = idf * term_frequency * (self.k1 + 1)
            score += numerator / (term_frequency + length_norm)
        return score

    def rank(self, query: str, n_results: int = 10) -> List[Tuple[int, float]]:
        """Score every document and return the best ``n_results``.

        The full sorted list is also stored in ``self.document_scores``.

        Returns:
            ``(document_index, score)`` pairs ordered by descending score;
            documents with equal scores keep their corpus order.
        """
        self.document_scores = [
            (index, self.score(query, index)) for index in range(self.total_docs)
        ]
        self.document_scores.sort(key=lambda pair: pair[1], reverse=True)
        return self.document_scores[:n_results]


In [None]:
def preprocess_document(doc: str) -> str:
    """
    Strips every ASCII punctuation character (string.punctuation) from the
    given document, then returns the result in lowercase
    """
    without_punctuation = "".join(
        char for char in doc if char not in string.punctuation
    )
    return without_punctuation.lower()

In [None]:
def load_documents(file_path: str, encoding: str = "utf-8") -> List[str]:
    """
    Loads the documents from the given file path and preprocesses them.

    Every line of the file becomes one document (blank lines included, so
    that list indices keep matching line positions) and is passed through
    preprocess_document.

    Args:
        file_path: path of the text file to read.
        encoding: text encoding used to decode the file. It is set
            explicitly so the result does not depend on the platform's
            locale default.

    Returns:
        The preprocessed lines, in file order.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as f:
        # Iterate the file lazily instead of materialising readlines() first.
        return [preprocess_document(line) for line in f]

Main script: loads the documents from the file and instantiates the BM25 class.
The `queries` variable is a list of strings containing the searches you want to run.

For each query, the output is the ID and score of the top 5 documents.

In [None]:
if __name__ == '__main__':
    # Read the corpus: each line of the file is treated as one document.
    file_path = 'cisi/CISI.ALL'
    documents = load_documents(file_path)

    # Build the ranking model over the loaded corpus.
    bm25 = BM25(documents)

    # Run a couple of sample queries and show the five best matches for each.
    queries = ['circuit design', 'information retrieval']
    for query in queries:
        results = bm25.rank(query, n_results=5)
        print(f"Top 5 results for query '{query}':")
        # Each result is a (document index, BM25 score) pair.
        for doc_id, doc_score in results:
            print(f"\tDocument {doc_id} with score {doc_score}")