In [3]:
from elasticsearch import Elasticsearch
import config as cfg

client = Elasticsearch(
    'https://localhost:9200',
    ssl_assert_fingerprint=cfg.ES_FINGERPRINT,
    basic_auth=('elastic', cfg.ES_PASSWORD)
)
client.info()

ObjectApiResponse({'name': '9ecfe40a2412', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'dgtxIpS2RKy3e_3ZVOGtcQ', 'version': {'number': '8.9.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '8aa461beb06aa0417a231c345a1b8c38fb498a0d', 'build_date': '2023-07-19T14:43:58.555259655Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
import json
with open('data.json', 'r') as f:
    data = json.load(f)

for book in data:
    print(book)

{'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson'}
{'title': 'Artificial Intelligence Trends', 'date': '2023-08-01', 'author': 'William Smith'}
{'title': 'Frontend Development Techniques', 'date': '2023-07-31', 'author': 'Sophia Lee'}
{'title': 'Database Design Principles', 'date': '2023-07-30', 'author': 'Daniel Wang'}
{'title': 'Networking Fundamentals', 'date': '2023-07-29', 'author': 'Olivia Johnson'}
{'title': 'Mobile App Development', 'date': '2023-07-28', 'author': 'James Davis'}
{'title': 'Cybersecurity Essentials', 'date': '2023-07-27', 'author': 'Emma Smith'}
{'title': 'Cloud Computing Concepts', 'date': '2023-07-26', 'author': 'Noah Johnson'}
{'title': 'User Experience Design', 'date': '2023-07-25', 'author': 'Ava Martin'}
{'title': 'Big Data Analytics', 'date': '2023-07-24', 'author': 'Liam Wilson'}
{'title': 'The Art of Programming', 'date': '2023-08-06', 'author': 'John Smith'}
{'title': 'Learning Python', 'date': '2023-08-05', 'a

In [None]:
book_mappings = {
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "author": {"type": "text"},
            "date": {"type": "date"}
        }
    }
}

client.indices.create(index = "book_index", body=book_mappings)

In [None]:
for each in data:
    client.index(index='book_index', document=each)
client.indices.refresh()

In [5]:
# GET ALL DOCUMENTS
resp = client.search(index='book_index', query={"match_all": {}})
for hit in resp['hits']['hits']:
    print(hit['_source'])

{'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson'}
{'title': 'Artificial Intelligence Trends', 'date': '2023-08-01', 'author': 'William Smith'}
{'title': 'Frontend Development Techniques', 'date': '2023-07-31', 'author': 'Sophia Lee'}
{'title': 'Database Design Principles', 'date': '2023-07-30', 'author': 'Daniel Wang'}
{'title': 'Networking Fundamentals', 'date': '2023-07-29', 'author': 'Olivia Johnson'}
{'title': 'Mobile App Development', 'date': '2023-07-28', 'author': 'James Davis'}
{'title': 'Cybersecurity Essentials', 'date': '2023-07-27', 'author': 'Emma Smith'}
{'title': 'Cloud Computing Concepts', 'date': '2023-07-26', 'author': 'Noah Johnson'}
{'title': 'User Experience Design', 'date': '2023-07-25', 'author': 'Ava Martin'}
{'title': 'Big Data Analytics', 'date': '2023-07-24', 'author': 'Liam Wilson'}


In [7]:
# FILTERING - MATCH
resp = client.search(index='book_index', 
                     query={
                         "match":
                         {"title": "Data"}
                     })
for hit in resp['hits']['hits']:
    print(hit['_score'], hit['_source'])

1.7740581 {'title': 'Big Data Analytics', 'date': '2023-07-24', 'author': 'Liam Wilson'}
1.5585024 {'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson'}


In [None]:
# FILTERING - DATE
resp = client.search(index='book_index', 
                     query={
                         "range": {
                             "date": {
                                 "gte": "2023-08-01"
                             }
                         }
                     })
for hit in resp['hits']['hits']:
    print(hit)

In [8]:
# FILTERING - COMBINE FILTERS
resp = client.search(index='book_index', 
                     query={
                         "bool": {
                             "must": [
                                #  {"match": {"title": "data"}},
                                 {"match": {"author": "Smith"}},
                                 {"range": {"date": {"gte": "2023-08-01"}}}
                             ]
                         }
                     })
for hit in resp['hits']['hits']:
    print(hit)

{'_index': 'book_index', '_id': 'Fx05zIkB-kyXH0nBJ2kx', '_score': 2.4552872, '_source': {'title': 'Artificial Intelligence Trends', 'date': '2023-08-01', 'author': 'William Smith'}}
{'_index': 'book_index', '_id': 'IB05zIkB-kyXH0nBJ2lY', '_score': 2.4552872, '_source': {'title': 'The Art of Programming', 'date': '2023-08-06', 'author': 'John Smith'}}


In [12]:
# FILTERING - COMBINE FILTERS WITHOUT USING ALL OF THEM FOR SCORING
resp = client.search(index='book_index', 
                     query={
                         "bool": {
                             "must": [
                                 {"match": {"title": "data"}},
                                #  {"match": {"author": "Smith"}}   
                             ],
                             "filter": [
                                {"range": {"date": {"gte": "2023-08-01"}}}
                             ]
                         }
                     })
for hit in resp['hits']['hits']:
    print(hit['_score'], hit['_source'])

1.5585024 {'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson'}


In [14]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-mpnet-base-v2')
model

  from .autonotebook import tqdm as notebook_tqdm


SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

In [8]:
len(model.encode(data[0]['title']))

768

In [9]:
model.get_sentence_embedding_dimension()

768

In [22]:
client.indices.delete(index='vector_index')

ObjectApiResponse({'acknowledged': True})

In [23]:
vector_mapping = {
    "mappings": {
        "properties": {
            "title": {"type": "text"},
            "author": {"type": "text"},
            "date": {"type": "date"},
            "vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "dot_product"
            }
        }
    }
}

client.indices.create(index='vector_index', body= vector_mapping)

  client.indices.create(index='vector_index', body= vector_mapping)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector_index'})

In [24]:
for each in data:
    each['vector'] = model.encode(each['title'])
    client.index(index='vector_index', document=each)
client.indices.refresh()


ObjectApiResponse({'_shards': {'total': 4, 'successful': 2, 'failed': 0}})

In [25]:
resp = client.search(index='vector_index', query={"match_all": {}})
for hit in resp['hits']['hits']:
    print(resp)

{'took': 9, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 14, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'vector_index', '_id': 'Mh1yzIkB-kyXH0nByWlc', '_score': 1.0, '_source': {'title': 'Data Structures and Algorithms', 'date': '2023-08-02', 'author': 'Emily Johnson', 'vector': [0.006359762046486139, 0.005855908617377281, -0.04932984337210655, 0.017838861793279648, -0.029005052521824837, -0.004800031427294016, -0.00160035805311054, 0.01807008869946003, -0.0026595168747007847, 0.0012216087197884917, 0.051644936203956604, -0.041857071220874786, 0.031746502965688705, 0.014895888976752758, -0.03526131063699722, -0.10488181561231613, 0.029704134911298752, 0.0029771579429507256, -0.030368167906999588, -0.02472872845828533, -0.016214855015277863, -0.029211657121777534, -0.002837033476680517, -0.0751868411898613, 0.01697598770260811, -0.0006547709926962852, -0.02739887312054634, -0.0163294468075037, -0.00928

In [15]:
query_text = "HTML and CSS programming"
query_vector = model.encode(query_text)
query = {
    "field": "vector",
    "query_vector": query_vector,
    "k": 5,
    "num_candidates": 14
}

resp = client.search(index='vector_index', knn=query)
for hit in resp['hits']['hits']:
    print(hit['_score'], hit['_source'])

0.8208817 {'title': 'Web Development 101', 'date': '2023-08-04', 'author': 'Alex Johnson', 'vector': [0.0439428985118866, -0.05732084438204765, -0.036657579243183136, -0.015388929285109043, 0.04545685276389122, 0.02121950313448906, 0.04815695434808731, 0.0013339605648070574, 0.04273226857185364, -0.027676530182361603, 0.031284235417842865, -0.07154859602451324, 0.03964019566774368, 0.04821222648024559, 0.015012616291642189, -0.06563739478588104, 0.015613280236721039, -0.012753644958138466, -0.009149356745183468, -0.003554415889084339, 0.045123759657144547, -0.03628244623541832, -0.02443762496113777, 0.027273990213871002, -0.0329454205930233, -0.018143029883503914, -0.027375854551792145, 0.017553914338350296, -0.03046666644513607, -0.049296583980321884, -0.0018764340784400702, -0.023987486958503723, 0.014381387270987034, -0.024950480088591576, 1.7184463558805874e-06, -0.043120644986629486, -0.010947912000119686, 0.010393188335001469, -0.04225694388151169, 0.03701161593198776, -0.0317325

In [16]:
query = {
    "field": "vector",
    "query_vector": query_vector,
    "k": 5,
    "num_candidates": 14,
    "filter":[
        {"range": {"date": {"gte": "2023-07-01"}}},
        {"match": {"title": "Development"}}
    ]
}
resp = client.search(index='vector_index', knn=query)
for hit in resp['hits']['hits']:
    print(hit['_score'], hit['_source'])

0.8208817 {'title': 'Web Development 101', 'date': '2023-08-04', 'author': 'Alex Johnson', 'vector': [0.0439428985118866, -0.05732084438204765, -0.036657579243183136, -0.015388929285109043, 0.04545685276389122, 0.02121950313448906, 0.04815695434808731, 0.0013339605648070574, 0.04273226857185364, -0.027676530182361603, 0.031284235417842865, -0.07154859602451324, 0.03964019566774368, 0.04821222648024559, 0.015012616291642189, -0.06563739478588104, 0.015613280236721039, -0.012753644958138466, -0.009149356745183468, -0.003554415889084339, 0.045123759657144547, -0.03628244623541832, -0.02443762496113777, 0.027273990213871002, -0.0329454205930233, -0.018143029883503914, -0.027375854551792145, 0.017553914338350296, -0.03046666644513607, -0.049296583980321884, -0.0018764340784400702, -0.023987486958503723, 0.014381387270987034, -0.024950480088591576, 1.7184463558805874e-06, -0.043120644986629486, -0.010947912000119686, 0.010393188335001469, -0.04225694388151169, 0.03701161593198776, -0.0317325