In [2]:
# Importando as bibliotecas
import os
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [3]:
# Funções

def load_cisi(corpus_path, query_path):
    """Carrega o corpus e as consultas da coleção CISI"""

    with open(corpus_path, 'r') as f:
        corpus = f.read()

    queries = pd.read_csv(query_path, sep='\n', header=None)
    queries = queries[queries.index % 2 == 0].reset_index(drop=True)
    queries.columns = ['query']
    queries['query'] = queries['query'].str.replace('^ *(\d+)', '')
    queries = queries.reset_index(drop=True)
 
      
    return corpus, queries


def preprocess_text(text):
    """Realiza preprocessamento de um documento"""

    return text.lower()


def compute_idf(corpus):
    """Calcula a frequência inversa do documento (IDF) de cada termo no corpus"""

    count_vectorizer = CountVectorizer(preprocessor=preprocess_text, token_pattern=r'\b\w+\b')
    term_counts = count_vectorizer.fit_transform([corpus])
    idf = np.log(len(term_counts.toarray()) / np.sum(term_counts.toarray() > 0, axis=0))
    idf = np.asarray(idf).squeeze()

    return idf


def compute_bm25_weights(corpus, queries, k1=1.2, b=0.75):
    """Calcula os pesos BM25 para o corpus e as consultas"""

    # Transforma o corpus em uma matriz termo-documento
    count_vectorizer = CountVectorizer(preprocessor=preprocess_text, token_pattern=r'\b\w+\b')
    term_counts = count_vectorizer.fit_transform([corpus])

    # Calcula a frequência inversa do documento (IDF) de cada termo no corpus
    idf = compute_idf(corpus)

    # Transforma a matriz termo-documento em uma matriz TF-IDF
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(term_counts)

    # Calcula os pesos BM25 para cada consulta
    weights = []
    for query in queries:
        # Transforma a consulta em um vetor de termos
        query_vec = count_vectorizer.transform([query])
        query_vec = np.squeeze(np.asarray(query_vec.sum(axis=0)))

        # Calcula o denominador da fórmula BM25
        K = k1 * ((1 - b) + b * (len(term_counts.toarray()[0]) / np.mean(np.sum(term_counts.toarray(), axis=1))))

        # Calcula o numerador da fórmula BM25 para cada documento
        scores = ((k1 + 1) * tfidf).multiply(query_vec * idf)
        scores = np.asarray(scores.sum(axis=1)).reshape(-1)
        scores /= K + scores


        # Adiciona os pesos BM25 para a consulta atual à lista de resultados
        weights.append(scores)

    return weights




In [5]:
# Carrega o corpus e as consultas
corpus_path = 'CISI.ALL'
query_path = 'CISI.QRY'
corpus, queries = load_cisi(corpus_path, query_path)

# Calcula os pesos BM25 para o corpus e as consultas
bm25_weights = compute_bm25_weights(corpus, queries['query'].tolist())

# Exibe os resultados
for i, query in enumerate(queries['query']):
    print(f"Query {i + 1}: {query}")


  queries['query'] = queries['query'].str.replace('^ *(\d+)', '')


Query 1: .I 1
Query 2: What problems and concerns are there in making up descriptive titles?
Query 3: approximate titles?
Query 4: .I 2
Query 5: How can actually pertinent data, as opposed to references or entire articles
Query 6: .I 3
Query 7: What is information science?  Give definitions where possible.
Query 8: .W
Query 9: transforming printed text into computer-ready form.
Query 10: .W
Query 11: information management and unobstructed use of information retrieval systems?
Query 12: .I 6
Query 13: What possibilities are there for verbal communication between computers and
Query 14: .I 7
Query 15: Describe presently working and planned systems for publishing and printing
Query 16: data-processing form, for further use in retrieval.
Query 17: .W
Query 18: What bearing does it have on the science in general?
Query 19: .W
Query 20: of articles for inclusion in an information retrieval system?
Query 21: .W
Query 22: .I 11
Query 23: What is the need for information consolidation, evaluat