In [1]:
!pip install faiss-cpu
!pip install -U bitsandbytes
!pip install sentence-transformers
!pip install rank-bm25



In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
import faiss
import json
import numpy as np
from sentence_transformers import SentenceTransformer

from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import CrossEncoder
from rank_bm25 import BM25Okapi
import torch

In [None]:
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

chunks = json.load(open("/content/drive/MyDrive/prawnik_pracy/meta.json", "r", encoding="utf-8"))
index = faiss.read_index("/content/drive/MyDrive/prawnik_pracy/index.faiss")
eval_data = json.load(open("/content/drive/MyDrive/prawnik_pracy/questions.json","r",encoding="utf-8"))

In [None]:
bm25_corpus = [c["text"].lower().split() for c in chunks]
bm25 = BM25Okapi(bm25_corpus)

In [None]:
def bm25_search(query, k=20):
    scores = bm25.get_scores(query.lower().split())
    ranked_ids = np.argsort(scores)[::-1][:k]
    return [chunks[i] for i in ranked_ids if chunks[i].get("artykul")]

In [None]:
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [None]:
def rerank(query, docs, top_k=5):
    pairs = [(query, d["text"]) for d in docs]
    scores = reranker.predict(pairs)

    ranked = sorted(
        zip(docs, scores),
        key=lambda x: x[1],
        reverse=True
    )

    return [d for d, s in ranked[:top_k]]

In [None]:
def search(query, k=5):
    emb = embedder.encode([query])
    D, I = index.search(np.array(emb), 20)

    candidates = [
        chunks[i] for i in I[0]
        if chunks[i].get("artykul")
    ]

    return rerank(query, candidates, top_k=k)

In [None]:
def hybrid_search(query, k=5):
    faiss_results = search(query, k=20)
    bm25_results = bm25_search(query, k=20)

    seen = set()
    merged = []

    for r in faiss_results + bm25_results:
        key = r["id"]
        if key not in seen:
            seen.add(key)
            merged.append(r)

    return rerank(query, merged, top_k=k)

In [None]:
hits_at_5 = 0
mrr_scores = []

In [None]:
import re

hits_at_5 = 0
mrr_scores = []

for item in eval_data:
    results = hybrid_search(item["question"], k=5)

    expected = item["expected_article"]
    print(f"Expected: {repr(expected)}")

    ranks = [r["artykul"] for r in results]
    print(f"Ranks found: {[repr(x) for x in ranks]}")

    query_hit = False
    query_mrr = 0.0

    for i, r_text in enumerate(ranks):
        if r_text is not None:
            # Print for debugging to see if the expected article is found within the chunk text
            print(f"  Checking if {repr(expected)} in {repr(r_text)}: {expected in r_text}")
            if expected in r_text:
                query_hit = True
                query_mrr = 1 / (i + 1)
                break  # Found the first occurrence, no need to check further for this query

    if query_hit:
        hits_at_5 += 1  # Increment total queries with a hit

    mrr_scores.append(query_mrr)  # Append one MRR score for this query

In [None]:
print("Hit@5:", hits_at_5 / len(eval_data))
print("MRR:", sum(mrr_scores)/len(mrr_scores))