In [8]:
from elasticsearch import Elasticsearch
from config import Config

with open('simple.cfg') as f:
    cfg = Config(f)

print(cfg['ES_FINGERPRINT'])
print(cfg['ES_PASSWORD'])

client = Elasticsearch(
    'https://localhost:9200',
    ssl_assert_fingerprint = cfg['ES_FINGERPRINT'],
    basic_auth=('elastic', cfg['ES_PASSWORD'])
)

client.info()

633bf7f6e4bf264e6a05d488af3c686b858fa63592dc83999a0d77f7e9fe5940
p1k6cT4a4bF+pFYf37Xx


ObjectApiResponse({'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'gO6RyrKVR8S1xvEZcOnbUw', 'version': {'number': '8.9.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '8aa461beb06aa0417a231c345a1b8c38fb498a0d', 'build_date': '2023-07-19T14:43:58.555259655Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
import json
with open('data.json', 'r') as f:
    data = json.load(f)

for book in data:
    print(book)

{'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson'}
{'title': 'Artificial Intelligence Trends', 'date': '2023-08-01', 'author': 'William Smith'}
{'title': 'Frontend Development Techniques', 'date': '2023-07-31', 'author': 'Sophia Lee'}
{'title': 'Database Design Principles', 'date': '2023-07-30', 'author': 'Daniel Wang'}
{'title': 'Networking Fundamentals', 'date': '2023-07-29', 'author': 'Olivia Johnson'}
{'title': 'Mobile App Development', 'date': '2023-07-28', 'author': 'James Davis'}
{'title': 'Cybersecurity Essentials', 'date': '2023-07-27', 'author': 'Emma Smith'}
{'title': 'Cloud Computing Concepts', 'date': '2023-07-26', 'author': 'Noah Johnson'}
{'title': 'User Experience Design', 'date': '2023-07-25', 'author': 'Ava Martin'}
{'title': 'Big Data Analytics', 'date': '2023-07-24', 'author': 'Liam Wilson'}
{'title': 'The Art of Programming', 'date': '2023-08-06', 'author': 'John Smith'}
{'title': 'Learning Python', 'date': '2023-08-05', 'a

In [13]:
INDEX_NAME = "book_index"
 
if(client.indices.exists(index=INDEX_NAME)):
    print("The index has already existed, going to remove it")
    client.options(ignore_status=404).indices.delete(index=INDEX_NAME)

The index has already existed, going to remove it


In [14]:
book_mappings = {
    "properties": {
        "title": {"type": "text"},
        "author": {"type": "text"},
        "date": {"type": "date"}
    }
}

client.indices.create(index = INDEX_NAME, mappings = book_mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'book_index'})

In [15]:
for each in data:
    client.index(index = INDEX_NAME, document = each)
client.indices.refresh()

ObjectApiResponse({'_shards': {'total': 14, 'successful': 11, 'failed': 0}})

In [16]:
# GET ALL DOCUMENTS
resp = client.search(index='book_index', query={"match_all": {}})
for hit in resp['hits']['hits']:
    print(hit['_source'])

{'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson'}
{'title': 'Artificial Intelligence Trends', 'date': '2023-08-01', 'author': 'William Smith'}
{'title': 'Frontend Development Techniques', 'date': '2023-07-31', 'author': 'Sophia Lee'}
{'title': 'Database Design Principles', 'date': '2023-07-30', 'author': 'Daniel Wang'}
{'title': 'Networking Fundamentals', 'date': '2023-07-29', 'author': 'Olivia Johnson'}
{'title': 'Mobile App Development', 'date': '2023-07-28', 'author': 'James Davis'}
{'title': 'Cybersecurity Essentials', 'date': '2023-07-27', 'author': 'Emma Smith'}
{'title': 'Cloud Computing Concepts', 'date': '2023-07-26', 'author': 'Noah Johnson'}
{'title': 'User Experience Design', 'date': '2023-07-25', 'author': 'Ava Martin'}
{'title': 'Big Data Analytics', 'date': '2023-07-24', 'author': 'Liam Wilson'}


In [17]:
# FILTERING - MATCH
resp = client.search(index='book_index', 
                     query={
                         "match":
                         {"title": "Data"}
                     })
for hit in resp['hits']['hits']:
    print(hit['_score'], hit['_source'])

1.7740581 {'title': 'Big Data Analytics', 'date': '2023-07-24', 'author': 'Liam Wilson'}
1.5585024 {'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson'}


In [None]:
# FILTERING - DATE
resp = client.search(index='book_index', 
                     query={
                         "range": {
                             "date": {
                                 "gte": "2023-08-01"
                             }
                         }
                     })
for hit in resp['hits']['hits']:
    print(hit)

In [18]:
# FILTERING - COMBINE FILTERS
resp = client.search(index='book_index', 
                     query={
                         "bool": {
                             "must": [
                                #  {"match": {"title": "data"}},
                                 {"match": {"author": "Smith"}},
                                 {"range": {"date": {"gte": "2023-08-01"}}}
                             ]
                         }
                     })
for hit in resp['hits']['hits']:
    print(hit)

{'_index': 'book_index', '_id': 'QqxjAYoB1nRdAs0LCh_5', '_score': 2.4552872, '_source': {'title': 'Artificial Intelligence Trends', 'date': '2023-08-01', 'author': 'William Smith'}}
{'_index': 'book_index', '_id': 'S6xjAYoB1nRdAs0LCx-F', '_score': 2.4552872, '_source': {'title': 'The Art of Programming', 'date': '2023-08-06', 'author': 'John Smith'}}


In [12]:
# FILTERING - COMBINE FILTERS WITHOUT USING ALL OF THEM FOR SCORING
resp = client.search(index='book_index', 
                     query={
                         "bool": {
                             "must": [
                                 {"match": {"title": "data"}},
                                #  {"match": {"author": "Smith"}}   
                             ],
                             "filter": [
                                {"range": {"date": {"gte": "2023-08-01"}}}
                             ]
                         }
                     })
for hit in resp['hits']['hits']:
    print(hit['_score'], hit['_source'])

1.5585024 {'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson'}


In [20]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')
model

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [21]:
len(model.encode(data[0]['title']))

768

In [22]:
model.get_sentence_embedding_dimension()

768

In [27]:
INDEX_NAME_VECTOR = "vector_index"
if(client.indices.exists(index = INDEX_NAME_VECTOR)):
    print("The index has already existed, going to remove it")
    client.options(ignore_status=404).indices.delete(index = INDEX_NAME_VECTOR)

In [29]:
vector_mapping = {
    "properties": {
        "title": {"type": "text"},
        "author": {"type": "text"},
        "date": {"type": "date"},
        "vector": {
            "type": "dense_vector",
            "dims": 768,
            "index": True,
            "similarity": "dot_product"
        }
    }
}

client.indices.create(index = INDEX_NAME_VECTOR, mappings = vector_mapping)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector_index'})

In [30]:
for each in data:
    each['vector'] = model.encode(each['title'])
    client.index(index = INDEX_NAME_VECTOR, document=each)
client.indices.refresh()


ObjectApiResponse({'_shards': {'total': 16, 'successful': 12, 'failed': 0}})

In [33]:
resp = client.search(index = INDEX_NAME_VECTOR, query={"match_all": {}})
for hit in resp['hits']['hits']:
    print(resp)

{'took': 3, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 14, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'vector_index', '_id': '7Kz-AYoB1nRdAs0LhT0S', '_score': 1.0, '_source': {'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson', 'vector': [0.006359683349728584, 0.005855909548699856, -0.049329839646816254, 0.017838846892118454, -0.02900504134595394, -0.004800017457455397, -0.0016003003111109138, 0.018070099875330925, -0.0026594216469675303, 0.0012216350296512246, 0.05164498835802078, -0.04185698181390762, 0.03174648806452751, 0.014895898289978504, -0.03526131808757782, -0.10488168895244598, 0.029704051092267036, 0.0029772359412163496, -0.030368177220225334, -0.024728696793317795, -0.016214793547987938, -0.02921168878674507, -0.002836989238858223, -0.0751868262887001, 0.016975969076156616, -0.0006547234952449799, -0.027398869395256042, -0.016329461708664894, -0.0

In [48]:
query_text = "HTML and CSS programming"
query_vector = model.encode(query_text)
query = {
    "field": "vector",
    "query_vector": query_vector,
    "k": 5,
    "num_candidates": 14
}

resp = client.search(index='vector_index', knn=query, source=False, fields=['title'])
for hit in resp['hits']['hits']:
    print(hit['_score'], hit['fields'])

0.82088166 {'title': ['Web Development 101']}
0.71641463 {'title': ['The Art of Programming']}
0.7152783 {'title': ['Frontend Development Techniques']}
0.6553724 {'title': ['Mobile App Development']}
0.6533262 {'title': ['User Experience Design']}


In [50]:
query = {
    "field": "vector",
    "query_vector": query_vector,
    "k": 5,
    "num_candidates": 14,
    "filter":[
        {"range": {"date": {"gte": "2023-07-01"}}},
        {"match": {"title": "Development"}}
    ]
}
resp = client.search(index='vector_index', knn=query, source=False, fields=['title'])
for hit in resp['hits']['hits']:
    print(hit['_score'], hit['fields'])

0.82088166 {'title': ['Web Development 101']}
0.7152783 {'title': ['Frontend Development Techniques']}
0.6553724 {'title': ['Mobile App Development']}
