In [None]:
!pip install faiss-cpu
!pip install -U bitsandbytes
!pip install sentence-transformers
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
# Mount Google Drive inside the Colab runtime; later cells read their data
# files from paths under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import faiss
import json
import numpy as np
from sentence_transformers import SentenceTransformer

from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import CrossEncoder
from rank_bm25 import BM25Okapi
import torch

In [None]:
# Sentence encoder used to embed queries at search time.
# NOTE(review): presumably the same model that produced index.faiss — confirm,
# otherwise query vectors and indexed vectors are not comparable.
embedder = SentenceTransformer("sentence-transformers/paraphrase-multilingual-mpnet-base-v2")

# Every artefact lives in the same Drive folder; keep the location in one place.
DATA_DIR = "/content/drive/MyDrive/prawnik_pracy"

# Context managers close the file handles deterministically instead of leaving
# them open until garbage collection.
with open(f"{DATA_DIR}/meta.json", "r", encoding="utf-8") as meta_file:
    # Chunk records; later cells read their "text", "artykul" and "id" keys.
    chunks = json.load(meta_file)

# FAISS index whose row ids are used as positions into `chunks`.
index = faiss.read_index(f"{DATA_DIR}/index.faiss")

with open(f"{DATA_DIR}/questions.json", "r", encoding="utf-8") as questions_file:
    # Evaluation items; later cells read "question" and "expected_article".
    eval_data = json.load(questions_file)

In [None]:
# Tokenise every chunk the same way queries are tokenised in bm25_search():
# lower-case, then split on whitespace.
bm25_corpus = [chunk["text"].lower().split() for chunk in chunks]

# Lexical index over the tokenised chunks; row i corresponds to chunks[i].
bm25 = BM25Okapi(bm25_corpus)

In [None]:
def bm25_search(query, k=20):
    """Return the best BM25 matches for ``query`` that carry an article label.

    Args:
        query: Free-text question; lower-cased and whitespace-tokenised.
        k: Number of top-scoring chunks to inspect.

    Returns:
        Chunks from the ``k`` highest-scoring positions, best first, keeping
        only those with a truthy ``"artykul"`` value. Because the filter runs
        after the cut-off, fewer than ``k`` chunks may be returned.
    """
    query_tokens = query.lower().split()
    relevance = bm25.get_scores(query_tokens)

    # argsort is ascending, so reverse it to walk from the highest score down.
    best_first = np.argsort(relevance)[::-1]

    hits = []
    for position in best_first[:k]:
        chunk = chunks[position]
        if chunk.get("artykul"):
            hits.append(chunk)
    return hits

In [None]:
# Cross-encoder that rerank() uses to score (query, chunk text) pairs.
# NOTE(review): the model name points at MS MARCO training data, while the
# sample outputs in this notebook are Polish — confirm this reranker is
# adequate for Polish text or evaluate a multilingual alternative.
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [None]:
def rerank(query, docs, top_k=5):
    """Order ``docs`` by cross-encoder relevance to ``query`` and keep the best.

    Args:
        query: The user question.
        docs: Candidate chunks; each must expose its passage under ``"text"``.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to ``top_k`` chunks, highest score first. An empty candidate list
        yields an empty list.
    """
    # Upstream filtering can leave no candidates at all; return early rather
    # than handing the cross-encoder an empty batch.
    if not docs:
        return []

    pairs = [(query, doc["text"]) for doc in docs]
    scores = reranker.predict(pairs)

    # sorted() is stable, so equally scored chunks keep their incoming order.
    ranked = sorted(zip(docs, scores), key=lambda pair: pair[1], reverse=True)

    return [doc for doc, _score in ranked[:top_k]]

In [None]:
def search(query, k=5):
    """Dense retrieval over the FAISS index followed by cross-encoder reranking.

    Args:
        query: The user question.
        k: Number of chunks to return after reranking.

    Returns:
        Up to ``k`` chunks with a truthy ``"artykul"`` value, ordered by
        rerank().
    """
    emb = embedder.encode([query])
    query_matrix = np.asarray(emb, dtype=np.float32)

    # Fetch at least 20 neighbours (the original pool size), and more when the
    # caller asks for more than 20 results.
    pool_size = max(k, 20)
    _distances, neighbour_ids = index.search(query_matrix, pool_size)

    candidates = []
    for chunk_id in neighbour_ids[0]:
        # FAISS pads the result with -1 when it finds fewer than `pool_size`
        # neighbours; chunks[-1] would silently return the last chunk.
        if chunk_id < 0:
            continue
        chunk = chunks[chunk_id]
        if chunk.get("artykul"):
            candidates.append(chunk)

    return rerank(query, candidates, top_k=k)

In [None]:
def hybrid_search(query, k=5):
    """Combine dense and BM25 retrieval, then rerank the union.

    Args:
        query: The user question.
        k: Number of chunks to return after the final rerank.

    Returns:
        Up to ``k`` chunks, de-duplicated by their ``"id"`` and ordered by
        rerank().
    """
    # NOTE(review): search() already reranks its hits, so the dense candidates
    # go through the cross-encoder twice — consider a non-reranking variant.
    dense_hits = search(query, k=20)
    lexical_hits = bm25_search(query, k=20)

    # A dict keeps insertion order and setdefault keeps the first chunk seen
    # for each id, so dense hits take precedence over BM25 duplicates.
    unique_by_id = {}
    for hit in dense_hits + lexical_hits:
        unique_by_id.setdefault(hit["id"], hit)

    return rerank(query, list(unique_by_id.values()), top_k=k)

In [None]:
# Metric accumulators: number of queries with a hit, and one reciprocal-rank
# value per query.
# NOTE(review): the evaluation cell re-initialises both names before its loop,
# so this cell looks redundant — consider deleting it.
hits_at_5 = 0
mrr_scores = []

In [None]:
import re


def article_matches(expected, article_text):
    """Tell whether ``article_text`` refers to exactly the ``expected`` article.

    A plain substring test treats 'Art. 1' as present in 'Art. 10. ...' or
    'Art. 187. ...', which counts wrong articles as hits. The label must not be
    preceded or followed by a letter or digit.

    Args:
        expected: Article label from the evaluation set, e.g. ``'Art. 1'``.
        article_text: The ``"artykul"`` value of a retrieved chunk.

    Returns:
        True when ``expected`` occurs in ``article_text`` as a whole label.
    """
    if not expected or not article_text:
        return False
    pattern = rf"(?<!\w){re.escape(expected)}(?!\w)"
    return re.search(pattern, article_text) is not None


# Reset the accumulators so re-running this cell does not double count.
hits_at_5 = 0
mrr_scores = []

for item in eval_data:
    results = hybrid_search(item["question"], k=5)

    expected = item["expected_article"]
    print(f"Expected: {repr(expected)}")

    ranks = [r["artykul"] for r in results]
    print(f"Ranks found: {[repr(x) for x in ranks]}")

    query_hit = False
    query_mrr = 0.0

    for position, r_text in enumerate(ranks, start=1):
        if r_text is None:
            continue
        is_match = article_matches(expected, r_text)
        # Debug trace: shows each comparison and its outcome.
        print(f"  Checking if {repr(expected)} in {repr(r_text)}: {is_match}")
        if is_match:
            query_hit = True
            # Reciprocal rank of the first correct result only.
            query_mrr = 1 / position
            break

    if query_hit:
        hits_at_5 += 1  # one hit per query at most

    mrr_scores.append(query_mrr)  # 0.0 when the article is not in the top 5

Expected: 'Art. 1'
Ranks found: ["'Art. 1. Kodeks pracy określa prawa i obowiązki pracowników i pracodawców.'", "'Art. 5. Jeżeli stosunek pracy określonej kategorii pracowników regulują przepisy szczególne, przepisy kodeksu stosuje się'", "'Art. 9. § 1. Ilekroć w Kodeksie pracy jest mowa o prawie pracy, rozumie się przez to przepisy Kodeksu pracy oraz'", "'Art. 6732. W przypadku wykonywania pracy zdalnej wnioski pracownika, dla których przepisy kodeksu lub innych'", "'Art. 297. Minister Pracy i Polityki Socjalnej10) określi w drodze rozporządzenia:'"]
  Checking if 'Art. 1' in 'Art. 1. Kodeks pracy określa prawa i obowiązki pracowników i pracodawców.': True
Expected: 'Art. 9'
Ranks found: ["'Art. 10. § 1. Każdy ma prawo do swobodnie wybranej pracy. Nikomu, z wyjątkiem przypadków określonych w ustawie,'", "'Art. 187. § 1. Pracownica karmiąca dziecko piersią ma prawo do dwóch półgodzinnych przerw w pracy wliczanych'", "'Art. 77. § 1. Stosunek pracy między spółdzielnią pracy a jej członki

In [None]:
# Hit@5: share of evaluation questions whose expected article was retrieved.
hit_rate = hits_at_5 / len(eval_data)
print("Hit@5:", hit_rate)

# MRR: mean of the per-question reciprocal ranks (0.0 for a miss).
mean_reciprocal_rank = sum(mrr_scores) / len(mrr_scores)
print("MRR:", mean_reciprocal_rank)

Hit@5: 0.55
MRR: 0.41833333333333333
