# Task 1-2
Set up ElasticSearch.

In [1]:
from elasticsearch import Elasticsearch, helpers

es = Elasticsearch(
    "https://localhost:9200",
    basic_auth=("elastic", "eDNC+ksvF7HSEkaAT-TL"),
    ca_certs="./elasticsearch/http_ca.crt",
)

info = es.info()
print(info)

{'name': '4cee569dd95c', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'C6HNYnebRPKkIaodGR2WZw', 'version': {'number': '8.8.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '98e1271edf932a480e4262a471281f1ee295ce6b', 'build_date': '2023-06-26T05:16:16.196344851Z', 'build_snapshot': False, 'lucene_version': '9.6.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


# Task 3-4
Define an ES analyzer for Polish texts containing:
- standard tokenizer
- synonym filter with alternative forms for months, e.g. wrzesień, wrz, IX.
- lowercase filter
- Morfologik-based lemmatizer
- lowercase filter (looks strange, but Morfologi produces capitalized base forms for proper names, so we have to lowercase them once more).

Define another analyzer for Polish, without the synonym filter.


In [2]:
months_synonyms_filter = {
    "months_synonyms": {
        "type": "synonym",
        "synonyms": [
            "styczeń, sty, I",
            "luty, lut, II",
            "marzec, mar, III",
            "kwiecień, kwi, IV",
            "maj, V",
            "czerwiec, cze, VI",
            "lipiec, lip, VII",
            "sierpień, sie, VIII",
            "wrzesień, wrz, IX",
            "październik, paź, X",
            "listopad, lis, XI",
            "grudzień, gru, XII"
        ]
    }
}


def polish_analyzer(synonyms: bool = True, lemma: bool = True):
    analyzer = {
        "tokenizer": "standard",
        "filter": [
            "lowercase"
        ]
    }

    if synonyms:
        analyzer["filter"] = analyzer["filter"] + ["months_synonyms"]

    if lemma:
        analyzer["filter"] = analyzer["filter"] + ["morfologik_stem", "lowercase"]

    return analyzer

polish_analyzer()

{'tokenizer': 'standard',
 'filter': ['lowercase', 'months_synonyms', 'morfologik_stem', 'lowercase']}

# Task 5
Define an ES index.

In [3]:
index_settings = {
    "settings": {
        "analysis": {
            "analyzer": {
                "analyze_synonyms": polish_analyzer(),
                "analyze_no_synonyms": polish_analyzer(synonyms=False),
                "analyze_no_lemma": polish_analyzer(lemma=False),
                "analyze_no_lemma_no_synonyms": polish_analyzer(synonyms=False, lemma=False),
            },
            "filter": months_synonyms_filter
        }
    },
    "mappings": {
        "properties": {
            "with_synonyms": {
                "type": "text",
                "analyzer": "analyze_synonyms",
            },
            "without_synonyms": {
                "type": "text",
                "analyzer": "analyze_no_synonyms",
            },
            "with_synonyms_no_lemma": {
                "type": "text",
                "analyzer": "analyze_no_lemma",
            },
            "without_synonyms_no_lemma": {
                "type": "text",
                "analyzer": "analyze_no_lemma_no_synonyms",
            },
            "id": {
                "type": "text",
            }
        }
    }
}

index_name = "index_lab2"

try:
    es.indices.delete(index=index_name)
except:
    pass

response = es.indices.create(index=index_name, body=index_settings)
response

  response = es.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'index_lab2'})

# Task 6
Load the data to the ES index.

In [4]:
import pandas as pd

In [5]:
corpus_df = pd.read_json("./data/corpus.jsonl", lines=True)
corpus_df = corpus_df.set_index('_id').sort_index()
corpus_df.head()

Unnamed: 0_level_0,title,text,metadata
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,,"Nie mówię, że nie podoba mi się też pomysł szk...",{}
31,,Tak więc nic nie zapobiega fałszywym ocenom po...,{}
56,,Nigdy nie możesz korzystać z FSA dla indywidua...,{}
59,,Samsung stworzył LCD i inne technologie płaski...,{}
63,,Oto wymagania SEC: Federalne przepisy dotycząc...,{}


In [6]:
texts = []

for id, row in corpus_df.iterrows():
    item = {
        "_index": index_name,
        "_source": {
            "with_synonyms": row["text"],
            "without_synonyms": row["text"],
            "with_synonyms_no_lemma": row["text"],
            "without_synonyms_no_lemma": row["text"],
            "id": id,
        }
    }
    
    texts.append(item)


success, _ = helpers.bulk(es, texts)
success

57638

# Task 7
Determine the number of documents containing the word styczeń (in any form) including and excluding the synonyms.

In [7]:
def get_number_of(word: str, include_synonyms: bool = True):
    field = "with_synonyms" if include_synonyms else "without_synonyms"

    query_body = {
        "query": {
            "match": {
                field: word
            }
        }
    }

    response = es.count(index=index_name, body=query_body)
 
    return response["count"]

In [8]:
january_with_synonyms_count = get_number_of("styczeń")
print(f"Number of \"styczeń\" word (with synonyms): {january_with_synonyms_count}")

  response = es.count(index=index_name, body=query_body)


Number of "styczeń" word (with synonyms): 44155


In [9]:
january_without_synonyms_count = get_number_of("styczeń", include_synonyms=False)
print(f"Number of \"styczeń\" word (with synonyms): {january_without_synonyms_count}")

Number of "styczeń" word (with synonyms): 329


  response = es.count(index=index_name, body=query_body)


# Task 8-9
Compute NDCG@5 for the QA dataset (the test subset) for the following setusp:
- synonyms enabled and disabled,
- lemmatization in the query enabled and disabled.

## Load test queries

In [10]:
queries_df = pd.read_json("./data/queries.jsonl", lines=True)
queries_df = queries_df.set_index("_id").sort_index()
queries_df.head()

Unnamed: 0_level_0,text,metadata
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Co jest uważane za wydatek służbowy w podróży ...,{}
1,Zgłaszanie wydatków biznesowych dla firmy bez ...,{}
2,Przekazywanie pieniędzy z jednej kontroli bizn...,{}
3,Posiadanie oddzielnego konta bankowego do prow...,{}
4,Wydatki służbowe - ubezpieczenie samochodu pod...,{}


In [11]:
qa_test_df = pd.read_csv("./data/test.tsv", sep="\t")
qa_test_df = qa_test_df.sort_values(by="query-id")
qa_test_df.head()

Unnamed: 0,query-id,corpus-id,score
0,8,566392,1
1,8,65404,1
2,15,325273,1
3,18,88124,1
4,26,285255,1


In [12]:
queries_df = queries_df[queries_df.index.isin(qa_test_df["query-id"])].reset_index()
queries_df.head()

Unnamed: 0,_id,text,metadata
0,8,Jak zdeponować czek wystawiony na współpracown...,{}
1,15,Czy mogę wysłać przekaz pieniężny z USPS jako ...,{}
2,18,1 EIN prowadzący działalność pod wieloma nazwa...,{}
3,26,Ubieganie się o kredyt biznesowy i otrzymywani...,{}
4,34,401k Przelew po zamknięciu firmy,{}


## NDCG@5

In [13]:
N = 5

def get_top_searches_for(query: str, limit: int, include_synonyms: bool = True, lemma: bool = True):
    field = "with_synonyms" if include_synonyms else "without_synonyms"
    if not lemma:
        field += "_no_lemma"
    

    query_body = {
        "size": limit,
        "query": {
            "match": {
                field: query
            }
        }
    }

    response = es.search(index=index_name, body=query_body)

    return [item["_source"]["id"] for item in response["hits"]["hits"]]


get_top_searches_for(queries_df.iloc[8]["text"], N)

  response = es.search(index=index_name, body=query_body)


[295522, 400230, 135196, 535207, 301161]

In [14]:
queries_corpuses_scores_map: dict[int, dict] = {}

for _, row in qa_test_df.iterrows():
    query_id = row["query-id"]
    corpus_id = row["corpus-id"]
    score = row["score"]

    if query_id not in queries_corpuses_scores_map:
        queries_corpuses_scores_map[query_id] = dict()
    
    queries_corpuses_scores_map[query_id][corpus_id] = score


def eval_search_results(query_id: int, results: list[int]):
    return [queries_corpuses_scores_map[query_id].get(corpus_id, 0) for corpus_id in results]


eval_search_results(8, get_top_searches_for(queries_df.iloc[0]["text"], N))

  response = es.search(index=index_name, body=query_body)


[0, 1, 0, 0, 0]

In [15]:
import numpy as np


def eval_queries_results(limit: int, include_synonyms: bool = True, lemma: bool = True):
    n_queries = len(queries_corpuses_scores_map)
    results = np.empty((n_queries, limit), dtype=int)

    for i, query_id in enumerate(queries_corpuses_scores_map):
        text = queries_df.loc[queries_df["_id"] == query_id, "text"].values[0]
        query_search_results = get_top_searches_for(text, N, include_synonyms,lemma)
        results[i] = eval_search_results(query_id, query_search_results)

    return results


eval_queries_results(N)[0]

  response = es.search(index=index_name, body=query_body)


array([0, 1, 0, 0, 0])

In [16]:
n_queries = len(queries_corpuses_scores_map)
target_results = np.zeros((n_queries, N), dtype=int)

for i, targets in enumerate(queries_corpuses_scores_map.values()):
    n_targets = len(targets)
    target_results[i, :n_targets] = 1

In [17]:
def mean_ndcg(limit: int, include_synonyms: bool = True, lemma: bool = True):
    predicted_results = eval_queries_results(limit, include_synonyms, lemma)

    dcg_weights = np.log2(np.arange(2, limit + 2))
    dcg_weights = np.resize(dcg_weights, predicted_results.shape)
    dcg = np.sum(predicted_results / dcg_weights, axis=1)
    idcg = np.sum(target_results / dcg_weights, axis=1)
    ndcg = dcg / idcg

    return ndcg.mean()

In [18]:
ndcg_with_synonyms = mean_ndcg(N)
print(f"Mean NDCG@{N} (search with synonyms and with lemma): {ndcg_with_synonyms}")

  response = es.search(index=index_name, body=query_body)


Mean NDCG@5 (search with synonyms and with lemma): 0.1858026377473443


In [19]:
ndcg_without_synonyms = mean_ndcg(N, include_synonyms=False)
print(f"Mean NDCG@{N} (search without synonyms and with lemma): {ndcg_without_synonyms}")

  response = es.search(index=index_name, body=query_body)


Mean NDCG@5 (search without synonyms and with lemma): 0.1851291130797741


In [20]:
ndcg_with_synonyms_no_lemma = mean_ndcg(N, lemma=False)
print(f"Mean NDCG@{N} (search with synonyms and without lemma): {ndcg_with_synonyms_no_lemma}")

  response = es.search(index=index_name, body=query_body)


Mean NDCG@5 (search with synonyms and without lemma): 0.13839287982649878


In [21]:
ndcg_without_synonyms_no_lemma = mean_ndcg(N, include_synonyms=False, lemma=False)
print(f"Mean NDCG@{N} (search without synonyms and without lemma): {ndcg_without_synonyms_no_lemma}")

  response = es.search(index=index_name, body=query_body)


Mean NDCG@5 (search without synonyms and without lemma): 0.13854570378524392


# Questions

## What are the strengths and weaknesses of regular expressions versus full text search regarding processing of text?

Regular expressions are great for finding specific patterns in text. However, they may become complex and challenging to maintain, lacking semantic understanding of natural language. On the other hand, Full Text Search  systems, while less precise in pattern matching, are better at understanding language, efficiently indexing large datasets, and ranking results for relevance.

## Is full text search applicable to the question answering problem? Show at least 3 examples from the corpus to support your claim.

Full-text search can be used to some extent in question answering problems, especially for searching documents containing key words. However, for more advanced tasks, especially those requiring understanding the context and semantics of natural language, it may turn out to be insufficient.