In [6]:
from elasticsearch import Elasticsearch
import json

# Connect to Elasticsearch
es = Elasticsearch(["http://localhost:9200"])

# Check if the connection was successful
if es.ping():
    print("Connected to Elasticsearch")
else:
    print("Could not connect to Elasticsearch")
    exit()




Connected to Elasticsearch


In [7]:
# Define a sample document
document = {
    "title": "Example Document",
    "content": "This is a sample document for Elasticsearch.",
    "tags": ["example", "demo", "elasticsearch"]
}

# Create an index (if it doesn't exist)
index_name = "demo_index"
if not es.indices.exists(index=index_name):
    es.indices.create(index=index_name)
    print(f"Created index: {index_name}")

# Index the document
response = es.index(index=index_name, body=document)
print(f"Document indexed: {response['result']}")

# Refresh the index to make the document searchable immediately
es.indices.refresh(index=index_name)

# Search for documents
search_query = {
    "query": {
        "match": {
            "content": "sample"
        }
    }
}

search_results = es.search(index=index_name, body=search_query)

# Print search results
print("\nSearch Results:")
for hit in search_results['hits']['hits']:
    print(f"ID: {hit['_id']}")
    print(f"Score: {hit['_score']}")
    print(f"Source: {json.dumps(hit['_source'], indent=2)}")
    print("---")

Document indexed: created

Search Results:
ID: -qIPyJIB8Q5--6byl4v_
Score: 0.03922071
Source: {
  "title": "Example Document",
  "content": "This is a sample document for Elasticsearch.",
  "tags": [
    "example",
    "demo",
    "elasticsearch"
  ]
}
---
ID: -6I8yJIB8Q5--6byIYsu
Score: 0.03922071
Source: {
  "title": "Example Document",
  "content": "This is a sample document for Elasticsearch.",
  "tags": [
    "example",
    "demo",
    "elasticsearch"
  ]
}
---
ID: _KJAyJIB8Q5--6byy4uW
Score: 0.03922071
Source: {
  "title": "Example Document",
  "content": "This is a sample document for Elasticsearch.",
  "tags": [
    "example",
    "demo",
    "elasticsearch"
  ]
}
---
ID: I6NYyJIB8Q5--6bycG1a
Score: 0.03922071
Source: {
  "title": "Example Document",
  "content": "This is a sample document for Elasticsearch.",
  "tags": [
    "example",
    "demo",
    "elasticsearch"
  ]
}
---
ID: eqRnyJIB8Q5--6byScOp
Score: 0.03922071
Source: {
  "title": "Example Document",
  "content": "Thi

In [8]:
polish_analyzer = {
    "analysis": {
        "filter": {
            "polish_month_synonyms": {
                "type": "synonym",
                "synonyms": [
                    "styczeń, sty, I",
                    "luty, lut, II",
                    "marzec, mar, III",
                    "kwiecień, kwi, IV",
                    "maj, V",
                    "czerwiec, cze, VI",
                    "lipiec, lip, VII",
                    "sierpień, sie, VIII",
                    "wrzesień, wrz, IX",
                    "październik, paź, X",
                    "listopad, lis, XI",
                    "grudzień, gru, XII"
                ]
            }
        },
        "analyzer": {
            "polish_custom": {  
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "polish_month_synonyms",
                    "lowercase",
                    "morfologik_stem"
                ]
            }
        }
    }
}

In [9]:
# Define a basic Polish analyzer without synonyms
basic_polish_analyzer = {
    "analysis": {
        "analyzer": {
            "polish_basic": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "lowercase",
                    "morfologik_stem"
                ]
            }
        }
    }
}

In [10]:
fiqa_pl_index_mapping = {
    "mappings": {
        "properties": {
            "text_with_synonyms": {"type": "text", "analyzer": "polish_custom"},
            "text_without_synonyms": {"type": "text", "analyzer": "polish_basic"}
        }
    }
}

fiqa_pl_index_settings = {
    "settings": {
        "analysis": {
            "filter": polish_analyzer["analysis"]["filter"],
            "analyzer": {
                **polish_analyzer["analysis"]["analyzer"],
                **basic_polish_analyzer["analysis"]["analyzer"]
            }
        }
    },
    "mappings": fiqa_pl_index_mapping["mappings"]
}


# Create or update the index with the new settings and mapping
fiqa_pl_index_name = "fiqa_pl_corpus"
if es.indices.exists(index=fiqa_pl_index_name):
    es.indices.close(index=fiqa_pl_index_name)
    es.indices.put_settings(index=fiqa_pl_index_name, body=fiqa_pl_index_settings["settings"])
    es.indices.open(index=fiqa_pl_index_name)
    print(f"Updated index {fiqa_pl_index_name} with new analyzers and mappings")
else:
    es.indices.create(index=fiqa_pl_index_name, body=fiqa_pl_index_settings)
    print(f"Created index {fiqa_pl_index_name} with custom Polish analyzers")


Updated index fiqa_pl_corpus with new analyzers and mappings


In [11]:
from datasets import load_dataset

# Load the FiQA-PL corpus dataset
dataset = load_dataset('clarin-knext/fiqa-pl', 'corpus')


In [12]:
es = Elasticsearch(["http://localhost:9200"])

# Get all settings
all_settings = es.cluster.get_settings()
print(all_settings)

{'persistent': {}, 'transient': {}}


In [13]:
import pandas as pd
from elasticsearch.helpers import bulk

df = pd.DataFrame(dataset)


# Function to generate actions for bulk indexing
def generate_actions(documents):
    for doc in documents:
        yield {
            "_index": fiqa_pl_index_name,
            "_source": {
                "title": doc["title"],
                "text_with_synonyms": doc["text"],
                "text_without_synonyms": doc["text"]
            }
        }

# Bulk indexing function
def bulk_index_documents(documents):
    try:
        success, failed = bulk(es, generate_actions(documents))
        print(f"Indexed {success} documents, {failed} failed")
    except Exception as e:
        print(f"Error during indexing: {e}")


# Perform bulk indexing
bulk_index_documents(df.corpus)

# Refresh the index
es.indices.refresh(index=fiqa_pl_index_name)

print(f"Finished indexing FiQA-PL corpus to {fiqa_pl_index_name}")

Error during indexing: Connection timed out
Finished indexing FiQA-PL corpus to fiqa_pl_corpus


In [14]:
# Get total number of documents in the index
total_docs = es.count(index=fiqa_pl_index_name)['count']
print(f"Total number of documents in the index: {total_docs}")

# Search for 'kwiecień' without synonyms
query_without_synonyms = {
    "query": {
        "match": {
            "text_without_synonyms": "kwiecień"
        }
    }
}

result_without_synonyms = es.search(index=fiqa_pl_index_name, body=query_without_synonyms)
matches_without_synonyms = result_without_synonyms['hits']['total']['value']
print(f"Number of matches for 'kwiecień' (without synonyms): {matches_without_synonyms}")

# Search for 'kwiecień' with synonyms
query_with_synonyms = {
    "query": {
        "match": {
            "text_with_synonyms": "kwiecień"
        }
    }
}

result_with_synonyms = es.search(index=fiqa_pl_index_name, body=query_with_synonyms)
matches_with_synonyms = result_with_synonyms['hits']['total']['value']
print(f"Number of matches for 'kwiecień' (including synonyms): {matches_with_synonyms}")

# Calculate the difference
difference = matches_with_synonyms - matches_without_synonyms
print(f"Additional matches found using synonyms: {difference}")

Total number of documents in the index: 115776
Number of matches for 'kwiecień' (without synonyms): 262
Number of matches for 'kwiecień' (including synonyms): 311
Additional matches found using synonyms: 49


In [15]:
from datasets import load_dataset

queries_dataset = load_dataset("clarin-knext/fiqa-pl", "queries")
qa_pairs_dataset = load_dataset("clarin-knext/fiqa-pl-qrels")

In [16]:
print(queries_dataset)
print(qa_pairs_dataset)

DatasetDict({
    queries: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 6648
    })
})
DatasetDict({
    train: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 14166
    })
    validation: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 1238
    })
    test: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 1706
    })
})


In [17]:
# Define analyzers with and without lemmatization and synonyms
analyzers = {
    "analysis": {
        "filter": {
            "polish_month_synonyms": {
                "type": "synonym",
                "synonyms": [
                    "styczeń, sty, I",
                    "luty, lut, II",
                    "marzec, mar, III",
                    "kwiecień, kwi, IV",
                    "maj, V",
                    "czerwiec, cze, VI",
                    "lipiec, lip, VII",
                    "sierpień, sie, VIII",
                    "wrzesień, wrz, IX",
                    "październik, paź, X",
                    "listopad, lis, XI",
                    "grudzień, gru, XII"
                ]
            }
        },
        "analyzer": {
            "polish_custom_with_lemma": {  
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "polish_month_synonyms",
                    "lowercase",
                    "morfologik_stem"
                ]
            },
            "polish_basic_with_lemma": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "lowercase",
                    "morfologik_stem"
                ]
            },
            "polish_custom_no_lemma": {  
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "polish_month_synonyms",
                    "lowercase"
                ]
            },
            "polish_basic_no_lemma": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "lowercase"
                ]
            }
        }
    }
}

# Update the index mapping
fiqa_pl_index_mapping = {
    "mappings": {
        "properties": {
            "text_with_synonyms_with_lemma": {"type": "text", "analyzer": "polish_custom_with_lemma"},
            "text_without_synonyms_with_lemma": {"type": "text", "analyzer": "polish_basic_with_lemma"},
            "text_with_synonyms_no_lemma": {"type": "text", "analyzer": "polish_custom_no_lemma"},
            "text_without_synonyms_no_lemma": {"type": "text", "analyzer": "polish_basic_no_lemma"}
        }
    }
}

fiqa_pl_index_settings = {
    "settings": {
        "analysis": analyzers["analysis"]
    },
    "mappings": fiqa_pl_index_mapping["mappings"]
}

# Create or update the index with the new settings and mapping
fiqa_pl_index_name = "fiqa_pl_corpus"
if es.indices.exists(index=fiqa_pl_index_name):
    es.indices.close(index=fiqa_pl_index_name)
    es.indices.put_settings(index=fiqa_pl_index_name, body=fiqa_pl_index_settings["settings"])
    es.indices.put_mapping(index=fiqa_pl_index_name, body=fiqa_pl_index_settings["mappings"])
    es.indices.open(index=fiqa_pl_index_name)
    print(f"Updated index {fiqa_pl_index_name} with new analyzers and mappings")
else:
    es.indices.create(index=fiqa_pl_index_name, body=fiqa_pl_index_settings)
    print(f"Created index {fiqa_pl_index_name} with custom Polish analyzers")

Updated index fiqa_pl_corpus with new analyzers and mappings


In [18]:
def generate_actions(documents):
    for doc in documents:
        yield {
            "_index": fiqa_pl_index_name,
            "_source": {
                "corpus_id": str(doc["_id"]),  # Include the original corpus ID in the source
                "title": doc["title"],
                "text_with_synonyms_with_lemma": doc["text"],
                "text_without_synonyms_with_lemma": doc["text"],
                "text_with_synonyms_no_lemma": doc["text"],
                "text_without_synonyms_no_lemma": doc["text"]
            }
        }

# Perform bulk indexing
bulk_index_documents(df.corpus)

# Refresh the index
es.indices.refresh(index=fiqa_pl_index_name)

print(f"Finished indexing FiQA-PL corpus to {fiqa_pl_index_name}")

Indexed 57638 documents, [] failed
Finished indexing FiQA-PL corpus to fiqa_pl_corpus


In [19]:
qa_test_dataset = qa_pairs_dataset['test']

# info about dataset
# query_ids = qa_dataset['query-id']
# corpus_ids = qa_dataset['corpus-id']
# scores = qa_dataset['score']

# Define setups
setups = [
    ("No synonyms, No lemmatization", "text_without_synonyms_no_lemma"),
    ("Synonyms, No lemmatization", "text_with_synonyms_no_lemma"),
    ("No synonyms, Lemmatization", "text_without_synonyms_with_lemma"),
    ("Synonyms, Lemmatization", "text_with_synonyms_with_lemma")
]


In [20]:
qa_test_df = pd.DataFrame(qa_test_dataset)
queries_df = pd.DataFrame(queries_dataset['queries'])

queries_df

Unnamed: 0,_id,title,text
0,0,,Co jest uważane za wydatek służbowy w podróży ...
1,4,,Wydatki służbowe - ubezpieczenie samochodu pod...
2,5,,Rozpoczęcie nowego biznesu online
3,6,,„Dzień roboczy” i „termin płatności” rachunków
4,7,,Nowy właściciel firmy – Jak działają podatki d...
...,...,...,...
6643,4102,,"Jak mogę ustalić, czy moja stopa zwrotu jest „..."
6644,3566,,"Gdzie mogę kupić akcje, jeśli chcę zainwestowa..."
6645,94,,Wykorzystywanie punktów kart kredytowych do op...
6646,2551,,Jak znaleźć tańszą alternatywę dla tradycyjnej...


In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import ndcg_score

def calculate_ndcg(qa_dataset, queries_dataset, es, index_name, field):
    ndcg_scores = []
    
    # Convert datasets to DataFrames if they aren't already
    qa_df = pd.DataFrame(qa_dataset) if not isinstance(qa_dataset, pd.DataFrame) else qa_dataset
    queries_df = pd.DataFrame(queries_dataset) if not isinstance(queries_dataset, pd.DataFrame) else queries_dataset
    
    for query_id in qa_df['query-id'].unique():
        # Find the query text, with error handling if not found
        query_row = queries_df[queries_df['_id'] == str(query_id)]
        if query_row.empty:
            print(f"Warning: No query found for query_id {query_id}")
            continue
        
        query = query_row.iloc[0]['text']
        corpus_ids = qa_df[qa_df['query-id'] == query_id]['corpus-id']

        # Parse corpus_ids to a simple set of integers
        corpus_ids_set = set(int(id) for id in corpus_ids)
        
        # Perform the search
        try:
            search_results = es.search(
                index=index_name,
                body={
                    "query": {
                        "match": {
                            field: query
                        }
                    }
                }
            )
        except Exception as e:
            print(f"Error performing search for query_id {query_id}: {e}")
            continue

        # Extract the document IDs from the search results
        retrieved_ids = [int(hit['_source']['corpus_id']) for hit in search_results['hits']['hits']]

        # Create relevance scores for retrieved documents
        relevance_scores = [1 if doc_id in corpus_ids_set else 0 for doc_id in retrieved_ids]

        # Create true relevance scores for all relevant documents
        true_relevance = [1] * len(corpus_ids)

        # Pad both lists to ensure they have exactly 5 elements
        relevance_scores = (relevance_scores + [0] * 5)[:5]
        true_relevance = (true_relevance + [0] * 5)[:5]

        # Calculate NDCG@5
        ndcg = ndcg_score([true_relevance], [relevance_scores], k=5)
        ndcg_scores.append(ndcg)
    
    # Return the average NDCG@5 score
    print('NDCG Sum: ' + str(sum(ndcg_scores)))
    return sum(ndcg_scores) / len(ndcg_scores) if ndcg_scores else 0.0

    

print("NDCG@5 scores:")
for setup_name, field in setups:
    print(f"{setup_name}:")
    avg_ndcg_score = calculate_ndcg(qa_test_df, queries_df, es, fiqa_pl_index_name, field)
    print(f"NDCG@5: {avg_ndcg_score:.4f}")
    print()

NDCG@5 scores:
No synonyms, No lemmatization:
NDCG Sum: 494.8943947704632
NDCG@5: 0.7637

Synonyms, No lemmatization:
NDCG Sum: 494.8943947704632
NDCG@5: 0.7637

No synonyms, Lemmatization:
NDCG Sum: 497.86244389481055
NDCG@5: 0.7683

Synonyms, Lemmatization:
NDCG Sum: 497.86244389481055
NDCG@5: 0.7683



### 1. What are the strengths and weaknesses of regular expressions versus full text search regarding processing of text?

Regular expressions offer precise pattern matching and efficient text manipulation, making them ideal for specific searches in small datasets. However, they lack semantic understanding and struggle with scalability and maintainability for complex patterns.

In contrast, full-text search excels in handling large datasets with advanced indexing and relevance scoring, providing more semantic context and supporting complex queries. Nonetheless, it requires more setup, is resource-intensive, and may sacrifice precision for specific patterns compared to regex.

### 2. Can an LLM be applied in the context of searching for documents? Justify your answer, excluding the obvious observation that an LLM can be used to formulate the answer.

Yes, an LLM can enhance document search beyond simple keyword matching.

LLMs improve document search by understanding context and semantics, handling complex queries, and enabling personalized, relevance-based results. They surpass traditional keyword searches by using deep language comprehension, making searches more accurate and adaptable to different domains.
