### Text Search

In [3]:
# Install Elasticsearch client
# ! pip3 install elasticsearch

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m[33m
[0mCollecting elasticsearch
  Downloading elasticsearch-8.15.1-py3-none-any.whl.metadata (8.7 kB)
Collecting elastic-transport<9,>=8.13 (from elasticsearch)
  Downloading elastic_transport-8.15.1-py3-none-any.whl.metadata (3.7 kB)
Downloading elasticsearch-8.15.1-py3-none-any.whl (524 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.6/524.6 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading elastic_transport-8.15.1-py3-none-any.whl (64 kB)
Installing collected packages: elastic-transport, elasticsearch
[33m  DEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or 

In [8]:
from elasticsearch import Elasticsearch

import os

# create client obj
# Connection settings are read from the environment so that real credentials
# never have to be committed with the notebook. The fallbacks are the values
# this notebook originally hard-coded for a local development cluster.
# NOTE(review): verify_certs=False disables TLS certificate verification; that
# is harmless for the plain-http default URL but should be revisited before
# pointing ELASTICSEARCH_URL at an https endpoint.
es_client = Elasticsearch(
    os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200"),
    basic_auth=(
        os.environ.get("ELASTICSEARCH_USERNAME", "elastic"),
        os.environ.get("ELASTICSEARCH_PASSWORD", "password"),
    ),
    verify_certs=False,
    ssl_show_warn=False
)

In [16]:
# Elasticsearch mapping types
#
# text: fields are analyzed and tokenized, ideal for full-text search.
# keyword: fields are not analyzed and are used for exact matches, aggregations, and sorting.
# date: date and time.
# long: Numeric value.
# boolean: True or False.
# nested: Nested objects or arrays.

# tokenizer: splits text into individual terms or tokens.
# analyzer: tokenizer + filter. Built-in analyzers: 'standard', 'simple', 'whitespace', etc.

# Create the 'test' index. `content` is analyzed with a custom analyzer
# (standard tokenizer + lowercase filter); no analyzer is set on `title`.
index = 'test'

mapping = {
    "settings": {"analysis": {"analyzer": {
        "custom_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase"],
        },
    }}},
    "mappings": {"properties": {
        "title": {"type": "text"},
        "content": {"type": "text", "analyzer": "custom_analyzer"},
    }},
}

es_client.indices.create(index=index, body=mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'test'})

In [17]:
# Fetch the mapping of `index` back from Elasticsearch and print it.
current_mapping = es_client.indices.get_mapping(index=index)
print(current_mapping)

{'test': {'mappings': {'properties': {'content': {'type': 'text', 'analyzer': 'custom_analyzer'}, 'title': {'type': 'text'}}}}}


In [24]:
# Analyzer example: run the standard tokenizer followed by the lowercase
# filter over a sample sentence and show the tokens that come out.
es_client.indices.analyze(
    body={"tokenizer": "standard", "filter": ["lowercase"], "text": 'I like Apple'}
)

ObjectApiResponse({'tokens': [{'token': 'i', 'start_offset': 0, 'end_offset': 1, 'type': '<ALPHANUM>', 'position': 0}, {'token': 'like', 'start_offset': 2, 'end_offset': 6, 'type': '<ALPHANUM>', 'position': 1}, {'token': 'apple', 'start_offset': 7, 'end_offset': 12, 'type': '<ALPHANUM>', 'position': 2}]})

In [18]:
from elasticsearch.helpers import bulk

# Bulk actions: each entry names its target index, its document id and the
# document body to store.
documents = [
    {'_index': index, '_id': 1, '_source': {'title': 'Opinon', 'content': 'I like Apple'}},
    # more document...
]

# `bulk` returns (number of successful actions, errors); only the count is reported.
success, _ = bulk(es_client, documents)
print(f"Successfully indexed {success} documents")

Successfully indexed 1 documents


ObjectApiResponse({'tokens': [{'token': 'I', 'start_offset': 0, 'end_offset': 1, 'type': '<ALPHANUM>', 'position': 0}, {'token': 'like', 'start_offset': 2, 'end_offset': 6, 'type': '<ALPHANUM>', 'position': 1}, {'token': 'Apple', 'start_offset': 7, 'end_offset': 12, 'type': '<ALPHANUM>', 'position': 2}]})

In [21]:
# Prefix matching: look for documents whose `content` has a term starting with 'appl'.
query = {'query': {'prefix': {'content': 'appl'}}}

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 1, 'relation': 'eq'},
 'max_score': 1.0,
 'hits': [{'_index': 'test',
   '_id': '1',
   '_score': 1.0,
   '_source': {'title': 'Opinon', 'content': 'I like Apple'}}]}

In [19]:
# Exact matching: a `match` query on `content` using the complete word 'apple'.
query = {'query': {'match': {'content': 'apple'}}}

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 1, 'relation': 'eq'},
 'max_score': 0.2876821,
 'hits': [{'_index': 'test',
   '_id': '1',
   '_score': 0.2876821,
   '_source': {'title': 'Opinon', 'content': 'I like Apple'}}]}

In [20]:
# Exact matching fails: the same `match` query with only the partial word 'appl'.
query = {'query': {'match': {'content': 'appl'}}}

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 0, 'relation': 'eq'}, 'max_score': None, 'hits': []}

In [39]:
# Phrase matching: search `content` for the phrase 'i like'.
query = {'query': {'match_phrase': {'content': 'i like'}}}

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 1, 'relation': 'eq'},
 'max_score': 0.5753642,
 'hits': [{'_index': 'test',
   '_id': '1',
   '_score': 0.5753642,
   '_source': {'title': 'Opinon', 'content': 'I like Apple'}}]}

In [31]:
# Fuzzy matching: search `content` for terms close to the misspelling 'aapl'.
fuzzy_content = {
    'value': 'aapl',
    'fuzziness': 2,  # max edit distance 2
}
query = {'query': {'fuzzy': {'content': fuzzy_content}}}
del fuzzy_content  # scratch name only; keep the notebook namespace unchanged

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 1, 'relation': 'eq'},
 'max_score': 0.14384104,
 'hits': [{'_index': 'test',
   '_id': '1',
   '_score': 0.14384104,
   '_source': {'title': 'Opinon', 'content': 'I like Apple'}}]}

### Query Language

In [37]:
# Query-string syntax: field-scoped clauses combined with boolean operators.
query = {
    "query": {"query_string": {
        "query": "(title:opinon) AND (content:apple) OR (content:orange)",
    }},
}

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 1, 'relation': 'eq'},
 'max_score': 0.5753642,
 'hits': [{'_index': 'test',
   '_id': '1',
   '_score': 0.5753642,
   '_source': {'title': 'Opinon', 'content': 'I like Apple'}}]}

### Search Algorithm

In [88]:
# Sample corpus for the ranking experiments: page ids are the strings '1'..'6',
# assigned in the order the contents are listed.
mock_documents = [
    {'page': str(page_number), 'content': content}
    for page_number, content in enumerate(
        [
            'Quest',
            'Quest P',
            'Quest Pro',
            'Quest Pro 2',
            'Quest Quest Pro Pro',
            'Quest Quest Quest Pro Pro Pro',
        ],
        start=1,
    )
]

In [45]:
from math import log

# Compute with TF-IDF score

def term_frequency(word, document):
    """Return the share of tokens in `document` that are equal to `word`.

    Measures how often a term appears in a document: the more frequently a
    term appears, the higher its contribution to the relevance score.

    Args:
        word: the term to count.
        document: a sequence of tokens (anything with `count` and `len`).

    Returns:
        `document.count(word) / len(document)`, or 0.0 for an empty document
        (an empty document contains no occurrences of any term).
    """
    # Guard the division: an empty token list would raise ZeroDivisionError.
    if not document:
        return 0.0
    return document.count(word) / len(document)
tf = term_frequency

def inverse_document_frequency(word, corpus):
    """Return the smoothed, base-10 inverse document frequency of `word`.

    Measures the importance of a term across all documents: terms that appear
    in many documents get lower values, reducing their impact on the
    relevance score.

    Args:
        word: the term to look up.
        corpus: an iterable-with-length of documents; membership is tested
            with `word in doc`.

    Returns:
        log10((N + 1) / (n_word + 1)) + 1, where N is the number of documents
        and n_word the number of documents containing `word`.
    """
    # Both counts start one higher than the real tally (add-one smoothing).
    total_documents = len(corpus) + 1
    matching_documents = 1
    for doc in corpus:
        if word in doc:
            matching_documents += 1
    return log(total_documents / matching_documents, 10) + 1
idf = inverse_document_frequency

def TF_IDF(word, document, corpus):
    """Return the TF-IDF score of `word` for `document` within `corpus`.

    The score is the product of `tf(word, document)` and `idf(word, corpus)`.
    """
    frequency = tf(word, document)
    rarity = idf(word, corpus)
    return frequency * rarity


# Tokenize every mock document into lowercase terms.
# The corpus is built from the mock documents rather than from `documents`:
# on a top-to-bottom run `documents` holds the Elasticsearch bulk actions
# (dicts keyed by '_index' / '_id' / '_source'), which have no top-level
# 'content' key and would raise KeyError here.
scored_documents = mock_documents.copy()
corpus = [document['content'].lower().split() for document in scored_documents]


word = 'quest'

# The IDF depends only on the word and the whole corpus, so compute it once.
idf_score = idf(word, corpus)

for i, document in enumerate(corpus):
    tf_score = tf(word, document)
    tf_idf_score = TF_IDF(word, document, corpus)
    print("document %s: '%s'\n    tf score: %s\n    idf score: %s\n    tf_idf score:%s"%(i, document, tf_score, idf_score, tf_idf_score))

document 0: '['quest']'
    tf score: 1.0
    idf score: 1.0
    tf_idf score:1.0
document 1: '['quest', 'p']'
    tf score: 0.5
    idf score: 1.0
    tf_idf score:0.5
document 2: '['quest', 'pro']'
    tf score: 0.5
    idf score: 1.0
    tf_idf score:0.5
document 3: '['quest', 'pro', '2']'
    tf score: 0.3333333333333333
    idf score: 1.0
    tf_idf score:0.3333333333333333
document 4: '['quest', 'quest', 'pro', 'pro']'
    tf score: 0.5
    idf score: 1.0
    tf_idf score:0.5
document 5: '['quest', 'quest', 'quest', 'pro', 'pro', 'pro']'
    tf score: 0.5
    idf score: 1.0
    tf_idf score:0.5


In [47]:
# Index with BM25 score
# BM25 scores documents based on their contents

# TF-IDF
# TF = document.count(word) / len(document)
# IDF = log10(count_of_documents/count_of_documents_with_word) + 1

# BM25
# BM25_TF = ( TF * (k1 + 1) ) / ( TF + (k1 * (1 - b + b * (document_length / avg_document_length)) ) )
# BM25_IDF = ln(1 + (count_of_documents - count_of_documents_with_word + 0.5) / (count_of_documents_with_word + 0.5))
# k1: limits how much a single query term can affect the score of a given document
# b: multiplier for (document_length / avg_document_length)


# Final score is the sum of BM25_TF(q) * BM25_IDF(q) over every query term q

# Create an index whose default similarity is BM25 with explicit parameters,
# then bulk-index the mock documents into it.
index = 'tf-idf-test'
mapping = {
    "settings": {
        "number_of_shards": 1,
        "index" : {
            "similarity" : {
              "default" : {
                "type" : "BM25",
                "b": 0.75, # default value
                "k1": 1
              }
            }
        }
    },
    "mappings": {
        "properties": {
            # The comma after this entry was missing, which made the whole
            # cell a SyntaxError.
            "page": {"type": "text"},
            "content": {"type": "text"}
        }
    }
}

es_client.indices.create(index=index, body=mapping)

# One bulk action per mock document; ids are 1-based positions in the list.
documents = [ {"_index": index, "_id": i+1, "_source": document } for i, document in enumerate(mock_documents) ]

success, _ = bulk(es_client, documents)
print(f"Successfully indexed {success} documents")

Successfully indexed 6 documents


In [48]:
# Rank the indexed mock documents for the term 'quest' with a `match` query on `content`.
query = {'query': {'match': {'content': 'quest'}}}

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 6, 'relation': 'eq'},
 'max_score': 0.09881063,
 'hits': [{'_index': 'tf-idf-test',
   '_id': '1',
   '_score': 0.09881063,
   '_source': {'content': 'Quest'}},
  {'_index': 'tf-idf-test',
   '_id': '6',
   '_score': 0.09361008,
   '_source': {'content': 'Quest Quest Quest Pro Pro Pro'}},
  {'_index': 'tf-idf-test',
   '_id': '5',
   '_score': 0.091209814,
   '_source': {'content': 'Quest Quest Pro Pro'}},
  {'_index': 'tf-idf-test',
   '_id': '2',
   '_score': 0.08469483,
   '_source': {'content': 'Quest P'}},
  {'_index': 'tf-idf-test',
   '_id': '3',
   '_score': 0.08469483,
   '_source': {'content': 'Quest Pro'}},
  {'_index': 'tf-idf-test',
   '_id': '4',
   '_score': 0.074107975,
   '_source': {'content': 'Quest Pro 2'}}]}

In [68]:
# Compute with PageRank score

# PageRank formula
# Specify the in-neighbors of the node, which is all of its parents
# Sum up the proportional rank from all of its in-neighbors
# Calculate the probability of randomly walking out the links with damping factor d
# Update the PageRank with the sum of proportional rank and random walk

# Damping factor: Probability of following a link (typically set to 0.85).

def page_rank(links, damping_factor=0.85, max_iterations=100, convergence_tol=1e-6):
    """Compute PageRank scores by repeated score propagation.

    Args:
        links: dict mapping each page to the list of pages it links to. Only
            pages that appear as keys receive a score; link targets that are
            not keys are ignored.
        damping_factor: weight given to rank arriving over links; the
            remaining `1 - damping_factor` is spread evenly over all pages.
        max_iterations: upper bound on the number of update rounds.
        convergence_tol: iteration stops once the summed absolute score change
            of a round drops below this value.

    Returns:
        dict mapping every page in `links` (in the same order) to its score.

    Note:
        Rank held by a page with no outgoing links is not redistributed, so
        for such graphs the scores do not sum to 1.
    """
    num_pages = len(links)
    # initial scores: uniform over all pages
    pagerank_scores = {page: 1.0 / num_pages for page in links}

    # The link structure is fixed, so resolve every page's in-neighbors once
    # instead of rescanning all adjacency lists for each page in each round.
    # Sources are appended in `links` order, which keeps the summation order
    # (and therefore the floating-point result) the same as a direct scan.
    in_neighbors = {page: [] for page in links}
    for source, targets in links.items():
        # set(): a repeated target still counts as a single in-link.
        for target in set(targets):
            if target in in_neighbors:
                in_neighbors[target].append(source)

    for _ in range(max_iterations):
        new_pagerank_scores = {}
        total_residual = 0

        for page, sources in in_neighbors.items():
            # each in-neighbor shares its current score equally among its out-links
            incoming_contribution = sum(pagerank_scores[source] / len(links[source])
                                        for source in sources)

            # random-jump share plus damped contribution from in-neighbors
            new_pagerank_scores[page] = (1 - damping_factor) / num_pages + damping_factor * incoming_contribution

            # accumulate the change for the convergence check
            total_residual += abs(new_pagerank_scores[page] - pagerank_scores[page])

        pagerank_scores = new_pagerank_scores

        # stop if convergence hit
        if total_residual < convergence_tol:
            break

    return pagerank_scores
  
# create document relationship: each page maps to the pages it links to
graph = {
    '1': ['2', '3'],
    '2': ['1'],
    '3': ['1', '2'],
    '4': ['1'],
    '5': ['1', '6'],
    '6': ['1', '4']
}

final_scores = page_rank(graph)

for page, score in final_scores.items():
    print(f'{page}: {score}')

# copy PageRank score to documents
# Build new dicts instead of writing into a `list.copy()`: that copy is
# shallow, so assigning 'pagerank' through it would also add the key to the
# dicts inside `mock_documents`. Pages without a score get 0.
scored_documents = [
    {**document, 'pagerank': final_scores.get(document['page'], 0)}
    for document in mock_documents
]

print(scored_documents)

1: 0.41298080309368673
2: 0.28573657349462706
3: 0.2005169984116864
4: 0.040140625000000006
5: 0.025000000000000005
6: 0.035625000000000004
[{'page': '1', 'content': 'Quest', 'pagerank': 0.41298080309368673}, {'page': '2', 'content': 'Quest P', 'pagerank': 0.28573657349462706}, {'page': '3', 'content': 'Quest Pro', 'pagerank': 0.2005169984116864}, {'page': '4', 'content': 'Quest Pro 2', 'pagerank': 0.040140625000000006}, {'page': '5', 'content': 'Quest Quest Pro Pro', 'pagerank': 0.025000000000000005}, {'page': '6', 'content': 'Quest Quest Quest Pro Pro Pro', 'pagerank': 0.035625000000000004}]


In [71]:
# Index with PageRank score
# PageRank scores documents based on their sources.

# Create an index that stores the precomputed score in a `rank_feature`
# field, then bulk-index the scored documents into it.
index = 'pagerank-test'
mapping = {"mappings": {"properties": {
    "page": {"type": "text"},
    "content": {"type": "text"},
    "pagerank": {"type": "rank_feature"},
}}}

es_client.indices.create(index=index, body=mapping)

# One bulk action per scored document; ids are 1-based positions in the list.
documents = [
    {"_index": index, "_id": position, "_source": document}
    for position, document in enumerate(scored_documents, start=1)
]

success, _ = bulk(es_client, documents)
print(f"Successfully indexed {success} documents")

Successfully indexed 6 documents


In [80]:
# Score documents by the value stored in their `pagerank` rank_feature field.
query = {'query': {'rank_feature': {'field': 'pagerank'}}}

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 6, 'relation': 'eq'},
 'max_score': 0.8080421,
 'hits': [{'_index': 'pagerank-test',
   '_id': '1',
   '_score': 0.8080421,
   '_source': {'page': '1',
    'content': 'Quest',
    'pagerank': 0.41298080309368673}},
  {'_index': 'pagerank-test',
   '_id': '2',
   '_score': 0.7444232,
   '_source': {'page': '2',
    'content': 'Quest P',
    'pagerank': 0.28573657349462706}},
  {'_index': 'pagerank-test',
   '_id': '3',
   '_score': 0.6715807,
   '_source': {'page': '3',
    'content': 'Quest Pro',
    'pagerank': 0.2005169984116864}},
  {'_index': 'pagerank-test',
   '_id': '4',
   '_score': 0.2902655,
   '_source': {'page': '4',
    'content': 'Quest Pro 2',
    'pagerank': 0.040140625000000006}},
  {'_index': 'pagerank-test',
   '_id': '6',
   '_score': 0.2662397,
   '_source': {'page': '6',
    'content': 'Quest Quest Quest Pro Pro Pro',
    'pagerank': 0.035625000000000004}},
  {'_index': 'pagerank-test',
   '_id': '5',
   '_score': 0.20317936,
   '_source': {'pa

### Vector Search

In [82]:
# Use Ollama to download and use embeddings locally
# Ollama is like Docker for AI models: it pulls and runs models locally
# https://ollama.com/library/mxbai-embed-large
# Embedding Model Benchmark: https://huggingface.co/spaces/mteb/leaderboard

# ! pip3 install ollama
# ! ollama pull mxbai-embed-large

[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest 
pulling 819c2adf5ce6...   0% ▕                ▏    0 B/669 MB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 819c2adf5ce6...   0% ▕                ▏ 1.4 MB/669 MB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 819c2adf5ce6...   1% ▕                ▏ 5.4 MB/669 MB                  [?25h[?25l[2K[1G[A[2K[1Gpulling manifest 
pulling 819c2adf5ce6...   1% ▕                ▏ 6.5 MB/669 MB                  [?25h

In [89]:
import ollama

# Embed a sample string first, just to report the size of the vectors the model returns.
query = 'quest'
res = ollama.embeddings(prompt=query, model='mxbai-embed-large')
print(f"Embedding dimension: {len(res['embedding'])}")

# convert to vectors: one record per mock document holding its page id and
# the embedding of its lowercased content
vectors = []
for document in mock_documents:
    vector = {'page': document['page']}
    content = document['content'].lower()
    result = ollama.embeddings(prompt=content, model='mxbai-embed-large')
    vector.update(content_vector=result['embedding'])
    vectors.append(vector)

Embedding dimension: 1024


In [90]:
index = 'vector-test'

# tuning vector search
# https://www.elastic.co/guide/en/elasticsearch/reference/current/tune-knn-search.html
mapping = {
    "mappings": {
        "properties": {
            "page": {"type": "text"},
            "content_vector": {
                "type": "dense_vector",
                "dims": 1024,  # must equal the embedding dimension printed earlier
                "index": True,
                "similarity": "cosine",
                "index_options": {
                  "type": "int8_hnsw" # reduce memory requirement
                }
            },
        }
    }
}

# Apply the mapping before indexing. With the create call commented out, the
# mapping above was never sent to Elasticsearch and `bulk` would index into
# whatever index already existed (or an auto-created one). The existence
# check keeps the cell re-runnable without a "resource already exists" error.
if not es_client.indices.exists(index=index):
    es_client.indices.create(index=index, body=mapping)

# One bulk action per embedded document; ids are 1-based positions in the list.
documents = [ {"_index": index, "_id": i+1, "_source": vector } for i, vector in enumerate(vectors) ]

success, _ = bulk(es_client, documents)
print(f"Successfully indexed {success} documents")

Successfully indexed 6 documents


In [91]:
# compute query vector with the same model that embedded the documents
q = 'quest'
q_vector = ollama.embeddings(prompt=q, model='mxbai-embed-large')

# approximate kNN search
# Elasticsearch uses Hierarchical Navigable Small World (HNSW) internally
#
# reduce vector dimension and memory footprint
# avoid heavy indexing during query time

query = {
    "_source": False,          # do not return the stored document body
    "fields": ["page"],        # return only the page id
    "knn": {
        "field": "content_vector",
        "query_vector": q_vector['embedding'],
        "k": 2,                # 2 nearest neighbor
        "num_candidates": 6,   # consider 6 candidates in each shard
    },
}

result = es_client.search(index=index, body=query)
result['hits']

{'total': {'value': 2, 'relation': 'eq'},
 'max_score': 0.999681,
 'hits': [{'_index': 'vector-test',
   '_id': '1',
   '_score': 0.999681,
   'fields': {'page': ['1']}},
  {'_index': 'vector-test',
   '_id': '2',
   '_score': 0.9390633,
   'fields': {'page': ['2']}}]}