In [3]:
# Prepare documents

import requests

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Create Embeddings using Pretrained Models

# Sentence Transformers documentation here: https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

# !pip install sentence_transformers==2.7.0

from sentence_transformers import SentenceTransformer

# if you get an error do the following:
# 1. Uninstall numpy 
# 2. Uninstall torch
# 3. pip install numpy==1.26.4
# 4. pip install torch
# run the above cell, it should work

model = SentenceTransformer("all-mpnet-base-v2")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
len(model.encode("This is a simple sentence"))  # the embeddings produced by the model have 768 dimensions, which are used for representing the semantic properties of the input text.

768

In [6]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [18]:
#created the dense vector using the pre-trained model

operations = []

for doc in documents:
    # Transforming the title into an embedding using the model
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

operations[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.041030403226614,
  0.025834161788225174,
  -0.036801841109991074,
  -0.020898321643471718,
  -0.020596304908394814,
  0.009353742003440857,
  -0.003331671468913555,
  -0.009491903707385063,
  0.030117977410554886,
  0.01908210851252079,
  0.012690035626292229,
  -0.017078785225749016,
  -0.0016324761090800166,
  0.12997251749038696,
  0.030969230458140373,
  -0.025823738425970078,
  0.0278230682015419,
  0.025159770622849464,
  -0.0808122381567955,
  -0.0036173474509269,
  -0.008902025409042835,
  0.003404824063181877,
  -0.0230092890560627,
  -0.03404529020190239,
  0.024598615244030952,
  0.013545555993914604,
  -0.025439025834202766,
  0.011951087042689323,
  -0.020540112629532814,
  -0.010077380575239658,
  0.020575348287820816,
  0.0

In [22]:
len(operations[1]["text_vector"])

768

In [23]:
# Setup ElasticSearch connection

from elasticsearch import Elasticsearch
es_client = Elasticsearch("http://localhost:9200")

es_client.info()

ObjectApiResponse({'name': 'aef735d4ad20', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'naI-3jzwSCKfSYcfTktpiw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [24]:
#  Create Mappings and Index



# Mapping is the process of defining how a document, and the fields it contains, are stored and indexed.

# Each document is a collection of fields, which each have their own data type.

# We can compare mapping to a database schema in how it describes the fields and properties that documents hold, the datatype of each field (e.g., string, integer, or date), and how those fields should be indexed and stored

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} ,
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
        }
    }
}

In [25]:
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)  # to avoid getting errors due to the same index name when updating

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [26]:
# Add documents into index

for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)

In [27]:
# Create end user query

search_term = "I just discovered the course. Can I still join it?"
vector_search_term = model.encode(search_term)

In [28]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000, 
}

In [29]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'F8NM0ZEBtKQMBbFBr7P_',
  '_score': 0.75863546,
  '_source': {'question': 'The course has already started. Can I still join it?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'}},
 {'_index': 'course-questions',
  '_id': 'GMNM0ZEBtKQMBbFBsLMQ',
  '_score': 0.75591207,
  '_source': {'question': 'When does the next iteration start?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'The course is available in the self-paced mode too, so you can go through the materials a

In [30]:
# Perform Keyword search with Semantic Search (Hybrid/Advanced Search)
 
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [31]:
response = es_client.search(
    index=index_name,
    query={
        "match": {"section": "General course-related questions"},
    },
    knn=knn_query,
    size=5,
   # explain = True,  how the scores are calculated
)

In [32]:
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'F8NM0ZEBtKQMBbFBr7P_',
  '_score': 11.658556,
  '_source': {'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
   'section': 'General course-related questions',
   'question': 'The course has already started. Can I still join it?',
   'course': 'machine-learning-zoomcamp',
   'text_vector': [-0.05821841210126877,
    0.016635717824101448,
    -0.01545377541333437,
    -0.0009685716941021383,
    0.014512795954942703,
    -0.006483578123152256,
    0.004492288921028376,
    0.00276132021099329,
    -0.021292895078659058,
    -0.030807888135313988,
    0.058647431433200836,
    -0.008818399161100388,
    -0.03633170202