# Prepare environment

In [1]:
# common imports
import numpy as np
import pandas as pd

# colab or locally?
try:
    from google.colab import drive
    
    drive.mount("/content/drive/")

    resources_dir = "/content/drive/MyDrive/projects/nlp"

    data_dir = f"{resources_dir}/data"
    models_dir = f"{resources_dir}/models"

    packages = ["sacremoses", "datasets", "evaluate", "transformers[torch]"]

    import subprocess
    import sys

    for package in packages:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

except:
    data_dir = f"./data"
    models_dir = f"./models"

# Task 1-3
Create a dataset of positive and negative sentence pairs.
The dataset should be split into training, evaluation and testing subsets.

## Load data

In [2]:
corpus_df = pd.read_json(f"{data_dir}/corpus.jsonl", lines=True)
corpus_df = corpus_df.set_index("_id").sort_index()
corpus_df.head()

Unnamed: 0_level_0,title,text,metadata
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,,"Nie mówię, że nie podoba mi się też pomysł szk...",{}
31,,Tak więc nic nie zapobiega fałszywym ocenom po...,{}
56,,Nigdy nie możesz korzystać z FSA dla indywidua...,{}
59,,Samsung stworzył LCD i inne technologie płaski...,{}
63,,Oto wymagania SEC: Federalne przepisy dotycząc...,{}


In [3]:
queries_df = pd.read_json(f"{data_dir}/queries.jsonl", lines=True)
queries_df = queries_df.set_index("_id").sort_index()
queries_df.head()

Unnamed: 0_level_0,text,metadata
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Co jest uważane za wydatek służbowy w podróży ...,{}
1,Zgłaszanie wydatków biznesowych dla firmy bez ...,{}
2,Przekazywanie pieniędzy z jednej kontroli bizn...,{}
3,Posiadanie oddzielnego konta bankowego do prow...,{}
4,Wydatki służbowe - ubezpieczenie samochodu pod...,{}


In [4]:
qa_train_all_df = pd.read_csv(f"{data_dir}/train.tsv", sep="\t")
print(f"Number of all train positive examples: {len(qa_train_all_df)}")

qa_train_all_df.head()

Number of all train positive examples: 14166


Unnamed: 0,query-id,corpus-id,score
0,0,18850,1
1,4,196463,1
2,5,69306,1
3,6,560251,1
4,6,188530,1


In [5]:
qa_test_df = pd.read_csv(f"{data_dir}/test.tsv", sep="\t")
print(f"Number of test positive examples: {len(qa_test_df)}")

qa_test_df.head()

Number of test positive examples: 1706


Unnamed: 0,query-id,corpus-id,score
0,8,566392,1
1,8,65404,1
2,15,325273,1
3,18,88124,1
4,26,285255,1


In [6]:
from sklearn.model_selection import train_test_split


qa_train_df, qa_validation_df = train_test_split(
    qa_train_all_df, test_size=0.2, random_state=3242
)

print(f"Number of train positive examples: {len(qa_train_df)}")
print(f"Number of validation positive examples: {len(qa_validation_df)}")

Number of train positive examples: 11332
Number of validation positive examples: 2834


In [7]:
train_doc_ids = set([row["corpus-id"] for _, row in qa_train_df.iterrows()])
validation_doc_ids = set([row["corpus-id"] for _, row in qa_validation_df.iterrows()])
test_doc_ids = set([row["corpus-id"] for _, row in qa_test_df.iterrows()])

train_corpus_df = corpus_df.loc[list(train_doc_ids)]
validation_corpus_df = corpus_df.loc[list(validation_doc_ids)]
test_corpus_df = corpus_df.loc[list(test_doc_ids)]

## Set up ElasticSearch

In [8]:
from abc import ABC, abstractmethod


class SearchEngine(ABC):
    @abstractmethod
    def get_top_searches(self, query: str, limit: int) -> pd.DataFrame:
        pass

In [9]:
from elasticsearch import Elasticsearch, helpers


class ESSearchEngine(SearchEngine):
    def __init__(self, index_name: str, corpus: pd.DataFrame) -> None:
        """corpus should contain column 'text' with documents."""
        self._es = Elasticsearch(
            "https://localhost:9200",
            basic_auth=("elastic", "Ay+zsdo6Y02ThQs7SCFM"),
            ca_certs="./elasticsearch/http_ca.crt",
        )
        self._index_name = index_name
        self._create_index()
        self._load_data(corpus)

    def _create_index(self) -> None:
        index_settings = {
            "settings": {
                "analysis": {
                    "analyzer": {
                        "analyze_lemma": {
                            "tokenizer": "standard",
                            "filter": [
                                "lowercase",
                                "morfologik_stem", 
                                "lowercase",
                            ]
                        },
                    },
                }
            },
            "mappings": {
                "properties": {
                    "text": {
                        "type": "text",
                        "analyzer": "analyze_lemma",
                    },
                    "id": {
                        "type": "text",
                    }
                }
            }
        }

        try:
            self._es.indices.delete(index=self._index_name)
        except:
            pass

        self._es.indices.create(index=self._index_name, body=index_settings) # type: ignore

    def _load_data(self, corpus: pd.DataFrame) -> None:
        docs = []

        for id, row in corpus.iterrows():
            item = {
                "_index": self._index_name,
                "_source": {
                    "text": row["text"],
                    "id": id,
                }
            }
            
            docs.append(item)

        helpers.bulk(self._es, docs)

    def get_top_searches(self, query: str, limit: int) -> pd.DataFrame:
        query_body = {
            "size": limit,
            "query": {
                "match": {
                    "text": query
                }
            }
        }

        response = self._es.search(index=self._index_name, body=query_body) # type: ignore

        ids = [item["_source"]["id"] for item in response["hits"]["hits"]]
        docs = [item["_source"]["text"] for item in response["hits"]["hits"]]

        return pd.DataFrame({"id": ids, "text": docs}).set_index("id")

In [10]:
es_train_search_engine = ESSearchEngine("train", train_corpus_df)
es_validation_search_engine = ESSearchEngine("validation", validation_corpus_df)
es_test_search_engine = ESSearchEngine("test", test_corpus_df)

  self._es.indices.create(index=self._index_name, body=index_settings) # type: ignore


In [11]:
_query = queries_df.iloc[8]["text"]

print(f"Query: {_query}")

es_train_search_engine.get_top_searches(_query, 10)

Query: Jak zdeponować czek wystawiony na współpracownika w mojej firmie na moje konto firmowe?


  response = self._es.search(index=self._index_name, body=query_body) # type: ignore


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
176017,„Czeki (w każdym razie w USA) są ważne tylko p...
316359,"Z mojego doświadczenia wynika, że ​​nie musisz..."
580624,"Bank nie pozwoli Ci na to, ponieważ: Różnice w..."
445739,„Jak/kiedy mój pracodawca dowiaduje się o tym?...
28974,„Zgadzam się z resztą odpowiedzi – prawdopodob...
108734,"„Mamy lokalny bank, który przeszedł na usługę ..."
165397,„Najlepszym powodem poparcia czeku jest jego z...
334902,"Nie ma powodu, aby otwierać firmę. Pomoże Ci j..."
456636,Rachunek bieżący oferuje wiele korzyści dla os...
271116,"Absolutne oszustwo. Za każdym razem, gdy ktoś ..."


## Positive and negative pairs using ElasticSearch

In [12]:
def merge_query_and_doc(query: str, doc: str) -> str:
    return f"Pytanie: {query} Odpowiedź: {doc}"

In [13]:
from random import choices


def _queries_to_docs_map(qa: pd.DataFrame) -> dict[int, list[int]]:
    qa_map = {}

    for _, (query_id, doc_id, _) in qa.iterrows():
        if query_id in qa_map:
            qa_map[query_id].append(doc_id)
        else:
            qa_map[query_id] = [doc_id]

    return qa_map


def _query_df(
        search_engine: SearchEngine,
        query: str, 
        positive_ids: list[int],
        docs: pd.DataFrame, 
        negative_candidates_factor: int,
        negative_examples_factor: int,
    ) -> pd.DataFrame:
        num_positive = len(positive_ids)

        candidates_ids = search_engine.get_top_searches(
            query, 
            (negative_candidates_factor * negative_examples_factor) + num_positive
        ).index

        negative_candidates_ids = list(candidates_ids.difference(positive_ids))
        
        negative_ids = choices(
            negative_candidates_ids, k=negative_examples_factor * num_positive
        )

        examples = {}

        examples["text"] = [ 
            merge_query_and_doc(query, docs.loc[id]["text"])
            for id in positive_ids + negative_ids
        ]
        examples["label"] = [1 for _ in positive_ids] + [0 for _ in negative_ids]

        return pd.DataFrame(examples)


def qa_pairs_df(
    search_engine: SearchEngine,
    queries: pd.DataFrame, 
    docs: pd.DataFrame, 
    positive_qa: pd.DataFrame, 
    negative_candidates_factor: int = 80,
    negative_examples_factor: int = 8,
) -> pd.DataFrame:
    qa_map = _queries_to_docs_map(positive_qa)

    examples = [
        _query_df(
            search_engine,
            queries.loc[query_id]["text"], 
            positive_ids,
            docs, 
            negative_candidates_factor,
            negative_examples_factor,
        ) 
        for query_id, positive_ids in qa_map.items()
    ]

    pairs = pd.concat(examples, ignore_index=True)    

    return pairs

In [14]:
train_pairs_df = qa_pairs_df(es_train_search_engine, queries_df, corpus_df, qa_train_df)
validation_pairs_df = qa_pairs_df(es_validation_search_engine, queries_df, corpus_df, qa_validation_df)
test_pairs_df = qa_pairs_df(es_test_search_engine, queries_df, corpus_df, qa_test_df)

print(f"Number of train examples: {len(train_pairs_df)}")
train_pairs_df.head()

  response = self._es.search(index=self._index_name, body=query_body) # type: ignore


Number of train examples: 101988


Unnamed: 0,text,label
0,Pytanie: Dlaczego warto pożyczyć pieniądze na ...,1
1,Pytanie: Dlaczego warto pożyczyć pieniądze na ...,1
2,Pytanie: Dlaczego warto pożyczyć pieniądze na ...,1
3,Pytanie: Dlaczego warto pożyczyć pieniądze na ...,0
4,Pytanie: Dlaczego warto pożyczyć pieniądze na ...,0


# Task 4-5
Train a text classifier using the Transformers library that distinguishes between the positive and the negative pairs.

## Load tokenizer

In [15]:
from transformers import AutoTokenizer


model_name = "allegro/herbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


## Prepare datasets

In [16]:
def tokenize_function(examples):
    return tokenizer(examples["text"], max_length=512, truncation=True)

In [17]:
from datasets import Dataset, DatasetDict, load_from_disk


datasets_dir = f"{data_dir}/qa_classification"


def _load_datasets() -> DatasetDict:
    return DatasetDict({
        "train": load_from_disk(f"{datasets_dir}/train"),
        "validation": load_from_disk(f"{datasets_dir}/validation"),
        "test": load_from_disk(f"{datasets_dir}/test"),
    })


def _create_datasets() -> DatasetDict:
    datasets = DatasetDict({
        "train": Dataset.from_pandas(train_pairs_df),
        "validation": Dataset.from_pandas(validation_pairs_df),
        "test": Dataset.from_pandas(test_pairs_df),
    })
    datasets = datasets.map(tokenize_function, batched=True)
    return datasets


def get_datasets() -> DatasetDict:
    try:
        return _load_datasets()
    except:
        datasets =  _create_datasets()
        datasets.save_to_disk(datasets_dir)
        return datasets

In [18]:
datasets = get_datasets()
datasets["train"]

Map: 100%|██████████| 101988/101988 [00:24<00:00, 4142.33 examples/s]
Map: 100%|██████████| 25506/25506 [00:05<00:00, 4584.79 examples/s]
Map: 100%|██████████| 15354/15354 [00:03<00:00, 4516.63 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 101988/101988 [00:00<00:00, 1066643.09 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 25506/25506 [00:00<00:00, 1097072.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 15354/15354 [00:00<00:00, 997727.88 examples/s] 


Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 101988
})

## Load model

In [None]:
from transformers import AutoModelForSequenceClassification


model_name = "allegro/herbert-base-cased"

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2
)

In [None]:
for param in list(model.base_model.parameters())[:165]:
    param.requires_grad = False

for name, param in model.named_parameters():
    print(f"Parameter: {name}, Requires gradient: {param.requires_grad}")

## Run training

In [None]:
import evaluate


metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding


arguments = TrainingArguments(
    output_dir=f"{models_dir}/qa_classifier/output",
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1500,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=5e-05,
    warmup_ratio=0.1,
    weight_decay=1e-3,
    num_train_epochs=4,
    logging_first_step=True,
    logging_strategy="steps",
    logging_steps=100,
    save_strategy="steps",
    save_steps=1500,
    fp16=True,
)

collator = DataCollatorWithPadding(tokenizer, padding="longest")

trainer = Trainer(
    model=model,
    args=arguments,
    train_dataset=datasets["train"].shuffle(seed=4664), # type: ignore
    eval_dataset=datasets["validation"].shuffle(seed=4664), # type: ignore
    compute_metrics=compute_metrics, # type: ignore
    data_collator=collator,
)

trainer.train()

### Training report
| Step  | Training Loss | Validation Loss | F1       |
|-------|---------------|-----------------|----------|
| 1500  | 0.266200      | 0.262476        | 0.479271 |
| 3000  | 0.264700      | 0.221167        | 0.438429 |
| 4500  | 0.239600      | 0.202747        | 0.554478 |
| 6000  | 0.237600      | 0.210558        | 0.511858 |
| 7500  | 0.254500      | 0.197017        | 0.567064 |
| 9000  | 0.213700      | 0.190644        | 0.623537 |
| 10500 | 0.215900      | 0.205007        | 0.542047 |
| 12000 | 0.229500      | 0.196024        | 0.566897 |


# Task 6-7
Use the classifier as a re-ranker for finding the answers to the questions.

In [19]:
from torch import no_grad, backends, device


if backends.mps.is_available():
    current_device = device("mps")
else:
    current_device = device("cpu")
    
print(f"Device is {current_device}")

Device is mps


In [20]:
from transformers import BertForSequenceClassification
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast


class ClassifierSupportedSearchEngine(SearchEngine):
    def __init__(
        self, 
        search_engine: SearchEngine, 
        classifier: BertForSequenceClassification,
        tokenizer: PreTrainedTokenizer | PreTrainedTokenizerFast,
        num_candidates: int = 30,
    ) -> None:
        self._wrapped_engine = search_engine
        self._classifier = classifier
        self._tokenizer = tokenizer
        self._num_candidates = num_candidates

    def get_top_searches(self, query: str, limit: int) -> pd.DataFrame:
        results = self._wrapped_engine.get_top_searches(
            query, max(self._num_candidates, limit)
        )
        re_ranked_results = self._re_rank(query, results)
        return re_ranked_results.head(limit)
    
    def _re_rank(self, query: str, results: pd.DataFrame) -> pd.DataFrame:
        data = []
        scores = {}
        texts = []

        for id, row in results.iterrows():
            doc = row["text"] 
            data.append({"id": id, "text": doc})
            text = merge_query_and_doc(query, doc)
            texts.append(text)
            
        tokens = self._tokenizer(
            texts, 
            max_length=512, 
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(current_device)

        with no_grad():
            outputs = self._classifier(**tokens)

        for id, score in zip(results.index, outputs.logits):
            scores[id] = score[1].item()

        data = sorted(data, key=lambda item: scores[item["id"]], reverse=True)

        return pd.DataFrame(data).set_index("id")

In [22]:
from transformers import AutoModelForSequenceClassification


path_to_best = f"{models_dir}/qa_classifier/output/checkpoint-9000"

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
    path_to_best, num_labels=2
).to(current_device)

In [23]:
classifier_search_engine = ClassifierSupportedSearchEngine(
    es_test_search_engine, fine_tuned_model, tokenizer, num_candidates=30
)

In [24]:
print(f"Query: {_query}")

classifier_search_engine.get_top_searches(_query, 10)

Query: Jak zdeponować czek wystawiony na współpracownika w mojej firmie na moje konto firmowe?


  response = self._es.search(index=self._index_name, body=query_body) # type: ignore


Unnamed: 0_level_0,text
id,Unnamed: 1_level_1
65404,Po prostu poproś współpracownika o podpisanie ...
590102,Kiedy firma prosi mnie o wystawienie czeku na ...
508754,"„Sprawdziłem w Bank of America i mówią, że JED..."
566392,Poproś o ponowne wystawienie czeku właściwemu ...
220691,"W Wielkiej Brytanii oficjalną zasadą jest to, ..."
89326,"Czeki są zwykle numerowane sekwencyjnie, aby z..."
555486,„1.Dlaczego nie ma adnotacji „„Skarbu Stanów Z...
342212,"Byłem właścicielem, a także najemcą. Mogłem wp..."
213331,„Twój przyjaciel prawdopodobnie nie może wpłac...
29372,"„Powiedzmy, że jesteś mi winien 123,00 USD i c..."


# Task 8
Compute how much the result of searching the passages improved over only FTS method.

In [25]:
class NDCGBenchmark:
    def __init__(
        self, 
        queries: pd.DataFrame, 
        positive_qa: pd.DataFrame, 
    ) -> None:
        self._queries = queries
        self._scores_map = {}
        
        for _, row in positive_qa.iterrows():
            query_id = row["query-id"]
            doc_id = row["corpus-id"]
            
            if query_id not in self._scores_map:
                self._scores_map[query_id] = dict()

            self._scores_map[query_id][doc_id] = 1

    def _eval_search_results(self, query_id: int, search_engine: SearchEngine, N: int) -> list[int]:
        query = self._queries.loc[query_id]["text"]
        results = search_engine.get_top_searches(query, N)
        return [self._scores_map[query_id].get(corpus_id, 0) for corpus_id in results.index]
    
    def _eval_queries(self, search_engine: SearchEngine, N: int) -> np.ndarray:
        num_queries = len(self._scores_map)
        scores = np.empty((num_queries, N), dtype=int)

        for i, query_id in enumerate(self._scores_map):
            scores[i] = self._eval_search_results(query_id, search_engine, N)

        return scores
    
    def _target_scores(self, N: int) -> np.ndarray:
        num_queries = len(self._scores_map)
        scores = np.zeros((num_queries, N), dtype=int)

        for i, targets in enumerate(self._scores_map.values()):
            num_targets = min(len(targets), N)
            scores[i, :num_targets] = 1

        return scores
    
    def mean_ndcg(self, search_engine: SearchEngine, N: int) -> float:
        predictions = self._eval_queries(search_engine, N)
        targets = self._target_scores(N)

        dcg_weights = np.log2(np.arange(2, N + 2))
        dcg_weights = np.resize(dcg_weights, predictions.shape)
        dcg = np.sum(predictions / dcg_weights, axis=1)
        idcg = np.sum(targets / dcg_weights, axis=1)
        ndcg = dcg / idcg

        return ndcg.mean()

In [26]:
ndcg_benchmark = NDCGBenchmark(queries_df, qa_test_df)

N = 5

In [27]:
fts_ndcg = ndcg_benchmark.mean_ndcg(es_test_search_engine, N)

print(f"NDCG@{N} for FTS is: {fts_ndcg}")

  response = self._es.search(index=self._index_name, body=query_body) # type: ignore


NDCG@5 for FTS is: 0.40103115292829833


In [28]:
classifier_re_reanked_fts_ndcq = ndcg_benchmark.mean_ndcg(classifier_search_engine, N)

print(f"NDCG@{N} for FTS re-ranked by sequence classifier is: {classifier_re_reanked_fts_ndcq}")

  response = self._es.search(index=self._index_name, body=query_body) # type: ignore


NDCG@5 for FTS re-ranked by sequence classifier is: 0.4895170370519111


# Questions

## Do you think simpler methods, like Bayesian bag-of-words model, would work for sentence-pair classification? Justify your answer.

Bayesian bag-of-words models typically represent text using word frequencies without considering word order or context. This simplistic representation might struggle with capturing the semantic or syntactic nuances present in sentence pairs, limiting its performance compared to models that capture contextual information like LLMs.

## What hyper-parameters you have selected for the training? What resources (papers, tutorial) you have consulted to select these hyper-parameters?

I decided to use warmup for the first 10% of the training. Additionally, I used weight_decay (0.001) to prevent overfitting. The learning rate was relatively low: 5e-5. It is worth mentioning that only the last two blocks of the transformer and the classifier layer were tensed - this accelerated the training and enabled the use of a larger batch_size (32). 

## Think about pros and cons of the neural-network models with respect to natural language processing. Provide at least 2 pros and 2 cons.

The advantages of using NN in NLP tasks are certainly the ability to capture the context of sentences, which translates into good results, and the fact that the models learn complex, hierarchical text representations (ebeddings), which allows them to be used for various tasks through fine-tuning.

The disadvantages are a large number of parameters, which make training long and consuming many resources, and a long evaluation time, which is a problem in applications where the response time should be short.