# Task 1-4
Configure a document store based on Faiss supported by multilingual E5 model.

Load the documents (passages) from the FiQA corpus.

In [1]:
import pandas as pd

from haystack.document_stores.faiss import FAISSDocumentStore
from haystack.nodes import DensePassageRetriever
from haystack.pipelines import DocumentSearchPipeline

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the FiQA passages (JSON Lines) and index them by their sorted "_id".
corpus_df = (
    pd.read_json("./data/corpus.jsonl", lines=True)
    .set_index("_id")
    .sort_index()
)
corpus_df.head()

Unnamed: 0_level_0,title,text,metadata
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,,"Nie mówię, że nie podoba mi się też pomysł szk...",{}
31,,Tak więc nic nie zapobiega fałszywym ocenom po...,{}
56,,Nigdy nie możesz korzystać z FSA dla indywidua...,{}
59,,Samsung stworzył LCD i inne technologie płaski...,{}
63,,Oto wymagania SEC: Federalne przepisy dotycząc...,{}


In [3]:
# Load the queries (JSON Lines) and index them by their sorted "_id".
queries_df = (
    pd.read_json("./data/queries.jsonl", lines=True)
    .set_index("_id")
    .sort_index()
)
queries_df.head()

Unnamed: 0_level_0,text,metadata
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Co jest uważane za wydatek służbowy w podróży ...,{}
1,Zgłaszanie wydatków biznesowych dla firmy bez ...,{}
2,Przekazywanie pieniędzy z jednej kontroli bizn...,{}
3,Posiadanie oddzielnego konta bankowego do prow...,{}
4,Wydatki służbowe - ubezpieczenie samochodu pod...,{}


In [4]:
# Test-split relevance judgements: a tab-separated file with one row per
# (query-id, corpus-id) pair.
qa_test_df = pd.read_csv("./data/test.tsv", sep="\t")
print("Number of test positive examples: {}".format(len(qa_test_df)))

qa_test_df.head()

Number of test positive examples: 1706


Unnamed: 0,query-id,corpus-id,score
0,8,566392,1
1,8,65404,1
2,15,325273,1
3,18,88124,1
4,26,285255,1


In [5]:
# Keep only the passages referenced by the test relevance judgements.
# The ids are read straight from the column (no row-by-row `iterrows()` pass) and
# sorted before selection, so the passages come out in a well-defined order
# instead of the set's arbitrary iteration order.
test_doc_ids = set(qa_test_df["corpus-id"])
test_corpus_df = corpus_df.loc[sorted(test_doc_ids)]

In [6]:
import os


# Directory that holds the saved FAISS indexes and their SQLite document stores.
faiss_path = "faiss"

# `makedirs(..., exist_ok=True)` creates the directory only when it is missing,
# without the check-then-create race of `exists()` followed by `mkdir()`.
os.makedirs(faiss_path, exist_ok=True)

# Encoder checkpoints compared in this notebook.
silver_encoder_model = "ipipan/silver-retriever-base-v1"
e5_large_encoder_model = "intfloat/multilingual-e5-large"


def get_retriever(document_store: FAISSDocumentStore, model: str) -> DensePassageRetriever:
    """Build a dense passage retriever that uses one checkpoint for both encoders.

    Args:
        document_store: Store the retriever is attached to.
        model: Checkpoint used as both the query and the passage embedding model.

    Returns:
        A `DensePassageRetriever` with title embedding enabled; GPU use is
        requested whenever CUDA is available.

    NOTE(review): this notebook's run logged "Using a model of type 'xlm-roberta'
    which might be incompatible with DPR encoders" — confirm that a DPR retriever
    is appropriate for every checkpoint passed here.
    """
    encoder_options = {
        "query_embedding_model": model,
        "passage_embedding_model": model,
        "embed_title": True,
    }
    cuda_available = torch.cuda.is_available()

    return DensePassageRetriever(
        document_store=document_store,
        use_gpu=cuda_available,
        **encoder_options,
    )


def prepare_retriever(index_name: str, model: str, embedding_dim: int) -> DensePassageRetriever:
    """Return a retriever backed by a FAISS document store, building the index only when needed.

    When both the saved FAISS index and its SQLite database already exist under
    `faiss_path`, the store is loaded from disk. Otherwise a new store is created,
    filled with the passages of the module-level `test_corpus_df`, embedded with
    `model`, and saved so that later runs can load it.

    Args:
        index_name: File name (inside `faiss_path`) under which the index is saved.
        model: Encoder checkpoint forwarded to `get_retriever`.
        embedding_dim: Embedding dimension the document store is configured with.

    Returns:
        The retriever attached to the loaded or newly built document store.
    """
    index_path = f"{faiss_path}/{index_name}"
    sql_path = f"{index_path}_faiss_document_store.d"

    # Choose explicitly between "load" and "build". The previous version wrapped the
    # whole build in a bare `except:` and fell back to `load`, which hid genuine
    # failures (model download errors, out-of-memory, even KeyboardInterrupt)
    # behind an unrelated error raised by the fallback.
    if os.path.exists(index_path) and os.path.exists(sql_path):
        document_store = FAISSDocumentStore.load(index_path)
        return get_retriever(document_store, model)

    document_store = FAISSDocumentStore(
        sql_url=f"sqlite:///{sql_path}",
        faiss_index_factory_str="Flat",
        return_embedding=True,
        embedding_dim=embedding_dim,
    )
    retriever = get_retriever(document_store, model)

    # The FiQA id is kept in the metadata so search hits can be matched against
    # the relevance judgements.
    passages_json = [
        {"content": text, "meta": {"fiqa_id": idx}}
        for idx, text in test_corpus_df["text"].items()
    ]

    document_store.write_documents(passages_json)
    document_store.update_embeddings(retriever=retriever)
    document_store.save(index_path)

    return retriever

In [7]:
# One retriever per encoder, each with its own index name and embedding dimension.
silver_retriever = prepare_retriever(
    index_name="ds_silver",
    model=silver_encoder_model,
    embedding_dim=768,
)
e5_large_retriever = prepare_retriever(
    index_name="ds_e5_large",
    model=e5_large_encoder_model,
    embedding_dim=1024,
)

  return self.fget.__get__(instance, owner)()
Using a model of type 'xlm-roberta' which might be incompatible with DPR encoders. Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors.
Using a model of type 'xlm-roberta' which might be incompatible with DPR encoders. Only Bert-based encoders are supported. They need input_ids, token_type_ids, attention_mask as input tensors.


# Task 5-6 and 8
Use the set of questions and the scorings defined in this corpus, to compute NDCG@5 for the dense retriever.

Use a different dense encoder, e.g. E5 large or Polish Roberta Base and compute NDCG@5.

In [8]:
# Wrap each retriever in its own document search pipeline.
silver_pipe, e5_large_pipe = (
    DocumentSearchPipeline(retriever=silver_retriever),
    DocumentSearchPipeline(retriever=e5_large_retriever),
)

In [9]:
from abc import ABC, abstractmethod
import pandas as pd


class SearchEngine(ABC):
    """Common interface of the search backends compared in this notebook."""

    @abstractmethod
    def get_top_searches(self, query: str, limit: int) -> pd.DataFrame:
        """Search for `query` and return up to `limit` results.

        Implementations in this notebook return a frame indexed by document id
        with a `text` column; the benchmark treats the row order as the ranking.
        """

In [10]:
class DPRSearchEngine(SearchEngine):
    """`SearchEngine` adapter around a `DocumentSearchPipeline`."""

    def __init__(self, pipe: DocumentSearchPipeline) -> None:
        """Store the pipeline that answers the queries."""
        self._pipe = pipe

    def get_top_searches(self, query: str, limit: int) -> pd.DataFrame:
        """Run the pipeline for `query` and return its documents as a frame.

        The "Retriever" node is asked for `limit` documents. The result keeps the
        pipeline's document order, is indexed by each document's `fiqa_id`
        metadata value (index name `id`) and has a single `text` column.
        """
        retriever_params = {"Retriever": {"top_k": limit}}
        documents = self._pipe.run(query=query, params=retriever_params)["documents"]

        fiqa_ids, contents = [], []
        for document in documents:
            fiqa_ids.append(document.meta["fiqa_id"])
            contents.append(document.content)

        frame = pd.DataFrame({"id": fiqa_ids, "text": contents})
        return frame.set_index("id")

In [11]:
# Expose both pipelines through the common `SearchEngine` interface.
silver_dpr_search_engine = DPRSearchEngine(pipe=silver_pipe)
e5_large_dpr_search_engine = DPRSearchEngine(pipe=e5_large_pipe)

In [12]:
# Sanity check: show the top 10 silver-retriever hits for one sample query
# (the query at position 8 of the sorted queries frame).
_query = queries_df["text"].iloc[8]

print("Query: {}".format(_query))

silver_dpr_search_engine.get_top_searches(_query, 10)

Query: Jak zdeponować czek wystawiony na współpracownika w mojej firmie na moje konto firmowe?


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
65404,Po prostu poproś współpracownika o podpisanie ...
342212,"Byłem właścicielem, a także najemcą. Mogłem wp..."
29372,"„Powiedzmy, że jesteś mi winien 123,00 USD i c..."
213331,„Twój przyjaciel prawdopodobnie nie może wpłac...
566392,Poproś o ponowne wystawienie czeku właściwemu ...
73427,Środki zarobione i wydane przed otwarciem dedy...
64138,„Wypisałbym im czek lub wręczyłbym im gotówkę....
555486,„1.Dlaczego nie ma adnotacji „„Skarbu Stanów Z...
296769,Zwykle otrzymuję czek kasjerski na pokrycie ok...
108739,Możesz zapłacić czekiem kasjerskim lub czekiem...


In [13]:
# Same sample query, answered by the E5 large engine for comparison.
e5_large_dpr_search_engine.get_top_searches(query=_query, limit=10)

Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
65404,Po prostu poproś współpracownika o podpisanie ...
525200,"Nie zrobiłbym tego. Istnieje ryzyko, że Twój c..."
566392,Poproś o ponowne wystawienie czeku właściwemu ...
590102,Kiedy firma prosi mnie o wystawienie czeku na ...
267362,"Sprawdź oszustwo. Firmy, które mają nieodebran..."
29372,"„Powiedzmy, że jesteś mi winien 123,00 USD i c..."
220691,"W Wielkiej Brytanii oficjalną zasadą jest to, ..."
342212,"Byłem właścicielem, a także najemcą. Mogłem wp..."
89326,"Czeki są zwykle numerowane sekwencyjnie, aby z..."
388147,"„Możesz spróbować napisać na odwrocie czeku, w..."


In [14]:
import numpy as np
from tqdm import tqdm


class NDCGBenchmark:
    """Computes the mean NDCG@N of a search engine over a set of judged queries.

    Relevance is binary: every (query, document) pair listed in `positive_qa`
    has gain 1, every other document has gain 0.
    """

    def __init__(
        self,
        queries: pd.DataFrame,
        positive_qa: pd.DataFrame,
    ) -> None:
        """Index the relevance judgements by query.

        Args:
            queries: Frame indexed by query id with a `text` column.
            positive_qa: Frame with `query-id` and `corpus-id` columns, one row
                per relevant (query, document) pair.
        """
        self._queries = queries
        # query id -> {relevant document id -> gain}
        self._scores_map: dict = {}

        for query_id, doc_id in zip(positive_qa["query-id"], positive_qa["corpus-id"]):
            self._scores_map.setdefault(query_id, {})[doc_id] = 1

    def _eval_search_results(self, query_id: int, search_engine: "SearchEngine", N: int) -> list[int]:
        """Return exactly `N` gains describing the engine's ranking for one query.

        Engines may return fewer than `N` rows; missing ranks are scored 0 so that
        every query yields a row of the same length. A document that appears more
        than once is credited only at its first position, which keeps DCG from
        exceeding the ideal DCG.
        """
        query = self._queries.loc[query_id, "text"]
        results = search_engine.get_top_searches(query, N)
        relevant = self._scores_map[query_id]

        gains: list[int] = []
        credited = set()
        for corpus_id in results.index[:N]:
            gain = 0 if corpus_id in credited else relevant.get(corpus_id, 0)
            credited.add(corpus_id)
            gains.append(gain)

        gains.extend([0] * (N - len(gains)))
        return gains

    def _eval_queries(self, search_engine: "SearchEngine", N: int) -> np.ndarray:
        """Return a (number of queries, N) array with the gains of every ranking."""
        num_queries = len(self._scores_map)
        scores = np.zeros((num_queries, N), dtype=int)

        # `total` lets tqdm show a real progress bar instead of a bare counter.
        progress = tqdm(enumerate(self._scores_map), "Eval queries", total=num_queries)
        for i, query_id in progress:
            scores[i] = self._eval_search_results(query_id, search_engine, N)

        return scores

    def _target_scores(self, N: int) -> np.ndarray:
        """Return the ideal gains: row i starts with min(#relevant docs, N) ones."""
        num_targets = np.fromiter(
            (len(targets) for targets in self._scores_map.values()),
            dtype=int,
            count=len(self._scores_map),
        )
        # Rank r (0-based) is ideally relevant while r < number of relevant docs.
        return (np.arange(N) < num_targets[:, None]).astype(int)

    def mean_ndcg(self, search_engine: "SearchEngine", N: int) -> float:
        """Compute NDCG@N averaged over all judged queries.

        Args:
            search_engine: Engine whose rankings are evaluated.
            N: Rank cut-off; must be at least 1.

        Returns:
            The mean NDCG@N, or NaN when there are no judged queries.

        Raises:
            ValueError: If `N` is smaller than 1.
        """
        if N < 1:
            raise ValueError(f"N must be a positive integer, got {N}")
        if not self._scores_map:
            return float("nan")

        predictions = self._eval_queries(search_engine, N)
        targets = self._target_scores(N)

        # log2(rank + 1) for ranks 1..N; the (N,) vector broadcasts over the rows.
        discounts = np.log2(np.arange(2, N + 2))
        dcg = (predictions / discounts).sum(axis=1)
        # Every judged query has at least one relevant document, so idcg > 0.
        idcg = (targets / discounts).sum(axis=1)

        return float((dcg / idcg).mean())

In [15]:
# Benchmark built from the queries and the test-split relevance judgements.
ndcg_benchmark = NDCGBenchmark(queries=queries_df, positive_qa=qa_test_df)

# Rank cut-off shared by the NDCG evaluations below.
N = 5

In [16]:
# Mean NDCG@N of the silver retriever on its own.
silver_dpr_ndcg = ndcg_benchmark.mean_ndcg(search_engine=silver_dpr_search_engine, N=N)

print("NDCG@{} for silver retriever base is: {}".format(N, silver_dpr_ndcg))

Eval queries: 648it [00:39, 16.23it/s]
Eval targets: 648it [00:00, ?it/s]

NDCG@5 for silver retriever base is: 0.43153572367707693





In [17]:
# Mean NDCG@N of the E5 large retriever on its own.
e5_large_dpr_ndcg = ndcg_benchmark.mean_ndcg(search_engine=e5_large_dpr_search_engine, N=N)

print("NDCG@{} for E5 large is: {}".format(N, e5_large_dpr_ndcg))

Eval queries: 648it [01:58,  5.45it/s]
Eval targets: 648it [00:00, 648975.40it/s]

NDCG@5 for E5 large is: 0.3570620342046728





# Task 7
Combine dense retrieval with classification model from lab 6 to implement a two-step retrieval. Compute NDCG@5 for this combined model.

In [18]:
from transformers import BertForSequenceClassification
from transformers import PreTrainedTokenizer

In [19]:
def merge_query_and_doc(query: str, doc: str) -> str:
    """Join a question and a candidate answer into the classifier's single input text."""
    template = "Pytanie: {} Odpowiedź: {}"
    return template.format(query, doc)


class ClassifierSupportedSearchEngine(SearchEngine):
    """Two-step search: a first-stage engine proposes candidates, a classifier re-ranks them."""

    def __init__(
        self,
        search_engine: SearchEngine,
        classifier: BertForSequenceClassification,
        tokenizer: PreTrainedTokenizer,
        num_candidates: int = 30,
    ) -> None:
        """
        Args:
            search_engine: First-stage engine that supplies the candidates.
            classifier: Sequence classifier; the logit at index 1 is used as the
                relevance score.
            tokenizer: Tokenizer used to encode the merged question/answer texts.
            num_candidates: Minimum number of candidates requested from the
                first stage before re-ranking.
        """
        self._wrapped_engine = search_engine
        self._classifier = classifier
        self._tokenizer = tokenizer
        self._num_candidates = num_candidates

    def get_top_searches(self, query: str, limit: int) -> pd.DataFrame:
        """Return the `limit` best candidates after re-ranking.

        At least `num_candidates` (or `limit`, if larger) documents are requested
        from the wrapped engine so the classifier has a pool to choose from.
        """
        results = self._wrapped_engine.get_top_searches(
            query, max(self._num_candidates, limit)
        )
        re_ranked_results = self._re_rank(query, results)
        return re_ranked_results.head(limit)

    def _re_rank(self, query: str, results: pd.DataFrame) -> pd.DataFrame:
        """Order `results` by the classifier's index-1 logit, highest first.

        Returns a frame indexed by document id (index name `id`) with a `text`
        column. Documents with equal scores keep their first-stage order.
        """
        # Nothing to score: avoid sending an empty batch to the tokenizer/model.
        if results.empty:
            return pd.DataFrame({"text": []}, index=pd.Index([], name="id"))

        doc_ids = list(results.index)
        docs = list(results["text"])
        texts = [merge_query_and_doc(query, doc) for doc in docs]

        tokens = self._tokenizer(
            texts,
            max_length=512,
            padding=True,
            truncation=True,
            return_tensors="pt",
        )
        # The inputs must be on the same device as the model's weights, otherwise
        # the forward pass fails for a classifier that was moved to the GPU.
        device = next(self._classifier.parameters()).device
        tokens = tokens.to(device)

        with torch.no_grad():
            logits = self._classifier(**tokens).logits

        # Scores are kept per row (not per id), so a repeated id cannot
        # overwrite another row's score.
        positive_scores = logits[:, 1].tolist()
        # `sorted` is stable, also with reverse=True, so ties keep their order.
        order = sorted(
            range(len(doc_ids)), key=positive_scores.__getitem__, reverse=True
        )

        re_ranked = pd.DataFrame(
            {
                "id": [doc_ids[i] for i in order],
                "text": [docs[i] for i in order],
            }
        )
        return re_ranked.set_index("id")

In [20]:
from transformers import AutoModelForSequenceClassification


# Checkpoint directory of the fine-tuned question/answer classifier.
path_to_best = "models/qa_classifier/output/checkpoint-9000"

# Load the checkpoint as a two-label sequence classifier.
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
    path_to_best,
    num_labels=2,
)

In [21]:
from transformers import AutoTokenizer


# Tokenizer that is later passed, together with `fine_tuned_model`, to the re-ranking engine.
# NOTE(review): presumably this is the base model the classifier checkpoint was
# fine-tuned from — confirm that the tokenizer matches the checkpoint.
model_name = "allegro/herbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [22]:
# Two-step engine: silver retriever candidates re-ranked by the fine-tuned classifier.
classifier_search_engine = ClassifierSupportedSearchEngine(
    search_engine=silver_dpr_search_engine,
    classifier=fine_tuned_model,
    tokenizer=tokenizer,
    num_candidates=30,
)

In [23]:
# Mean NDCG@N of the two-step (retrieve, then re-rank) engine.
classifier_re_ranked_silver_dpr_ndcq = ndcg_benchmark.mean_ndcg(
    search_engine=classifier_search_engine, N=N
)

print(
    "NDCG@{} for silver retriever base re-ranked by sequence classifier is: {}".format(
        N, classifier_re_ranked_silver_dpr_ndcq
    )
)

Eval queries: 648it [2:05:26, 11.61s/it]
Eval targets: 648it [00:00, ?it/s]

NDCG@5 for silver retriever base re-ranked by sequence classifier is: 0.4545921502572553





# Questions

## Which of the methods: lexical match (e.g. ElasticSearch) or dense representation works better?

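Without re-ranking, I achieved the highest NDCG@5 with Silver Retriever Base (about 0.432), which is noticeably better than the NDCG@5 of ElasticSearch from lab 6 (about 0.401). However, E5 large achieved the lowest score: about 0.357 NDCG@5. Note that E5 was loaded through `DensePassageRetriever`, which logged an incompatibility warning for XLM-RoBERTa models, so this score may understate what the model can do. Interestingly, re-ranking with the classification model from lab 6 gives a better result with ElasticSearch than with Silver Retriever Base as the first-stage search (0.490 vs. 0.455 NDCG@5).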

## Which of the methods is faster?

It depends, of course, on the model size, but in most cases DPR is a slower method than ElasticSearch.

## Try to determine the other pros and cons of using lexical search and dense document retrieval models.

Lexical search handles synonyms and word variants poorly - we need to specify them manually. DPR models are able to capture these cases and are, in general, better at semantic understanding. On the other hand, these models cost more than simple lexical search - we need to fine-tune them on suitable data, while the improvement in results may be small.