In [1]:
import json

# Load the raw FAQ export into `docs_raw`.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding (e.g. cp1252 on Windows), which can
# mis-decode or reject non-ASCII characters in the file.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [2]:

# Flatten the per-course structure into one list of FAQ entries, tagging each
# entry with the course it belongs to. Entries are tagged in place, so the
# dicts in `documents` are the same objects that live inside `docs_raw`.
documents = []

for course_entry in docs_raw:
    course_docs = course_entry['documents']
    for faq in course_docs:
        faq['course'] = course_entry['course']
    documents.extend(course_docs)

# Show one flattened entry as a sanity check.
documents[1]

{'text': 'See DE zoomcamp 2025 pre-course Q&A\nTo get the most out of this course, you should have:\nBasic coding experience\nFamiliarity with SQL\nExperience with Python (helpful but not required)\nNo prior data engineering experience is necessary. See Read03/11/2025me on GitHub',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
from sentence_transformers import SentenceTransformer

  from scipy.sparse import csr_matrix, issparse


In [4]:
# Name of the pre-trained sentence-transformers checkpoint used for all
# embeddings in this notebook.
MODEL_NAME = "all-mpnet-base-v2"
model = SentenceTransformer(MODEL_NAME)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [5]:
# Embed a sample sentence and display the embedding length; this is the value
# the `dims` setting of the dense_vector mapping has to match.
sample_embedding = model.encode("This is a simple sentence")
len(sample_embedding)

768

In [6]:
# Display the second document (index 1) to inspect its fields before embedding.
documents[1]

{'text': 'See DE zoomcamp 2025 pre-course Q&A\nTo get the most out of this course, you should have:\nBasic coding experience\nFamiliarity with SQL\nExperience with Python (helpful but not required)\nNo prior data engineering experience is necessary. See Read03/11/2025me on GitHub',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
# Create the dense vectors using the pre-trained model.
# The answer text (not the title/question) is what gets embedded. All texts
# are passed to `encode` in a single call so the model can batch them, instead
# of running one forward pass per document.
texts = [doc["text"] for doc in documents]
text_vectors = model.encode(texts)

# Attach each embedding to its document as a plain list (JSON-serialisable)
# and collect the documents to be indexed. The dicts are updated in place, so
# `operations` holds the same objects as `documents`.
operations = []
for doc, text_vector in zip(documents, text_vectors):
    doc["text_vector"] = text_vector.tolist()
    operations.append(doc)

In [8]:
# Display the second document again; it now also carries the "text_vector" key
# added by the encoding loop.
documents[1]

{'text': 'See DE zoomcamp 2025 pre-course Q&A\nTo get the most out of this course, you should have:\nBasic coding experience\nFamiliarity with SQL\nExperience with Python (helpful but not required)\nNo prior data engineering experience is necessary. See Read03/11/2025me on GitHub',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.01846477761864662,
  -0.002196308458223939,
  -0.022099856287240982,
  -0.01199972815811634,
  0.012895666062831879,
  0.034746572375297546,
  -0.05157475918531418,
  -0.022820008918642998,
  0.04952111840248108,
  0.010289222002029419,
  0.013640676625072956,
  -0.023182906210422516,
  -0.011620007455348969,
  0.11907073855400085,
  0.06812675297260284,
  -0.0928049385547638,
  0.016347404569387436,
  0.017330443486571312,
  -0.10497976839542389,
  -0.02250160090625286,
  -0.014842456206679344,
  -0.004285082686692476,
  -0.056616164

In [9]:
from elasticsearch import Elasticsearch
# Address of the Elasticsearch node this notebook talks to.
ES_URL = 'http://localhost:9200'
es_client = Elasticsearch(ES_URL)

# Display the cluster info as a quick connectivity check.
es_client.info()

ObjectApiResponse({'name': 'e5170dd72ce5', 'cluster_name': 'docker-cluster', 'cluster_uuid': 't34fH80dSBy7B9QIFo7_Iw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [10]:
# Index definition: a single shard with no replicas, full-text fields for the
# FAQ content, a keyword field for the course, and an indexed 768-dimensional
# dense vector compared with cosine similarity.
index_settings = {
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {
        "properties": {
            # The three free-text fields share the same mapping.
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
            "text_vector": dict(
                type="dense_vector",
                dims=768,
                index=True,
                similarity="cosine",
            ),
        },
    },
}

In [11]:
# (Re)create the index from scratch so this cell can be re-run: any existing
# index with this name is deleted first (`ignore_unavailable=True` is passed so
# the delete is tolerated when the index does not exist yet).
index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)

# Settings and mappings are passed as explicit keyword arguments rather than
# through the catch-all `body=` (flagged as deprecated by some 8.x client
# releases), matching the keyword style of the other client calls here
# (`document=`, `knn=`, `query=`).
es_client.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [12]:
# Index all documents (including their embeddings) with the bulk helper
# instead of issuing one HTTP request per document.
from elasticsearch import helpers

# raise_on_error / raise_on_exception are disabled to keep this step
# best-effort: rejected documents are collected and printed rather than
# aborting the whole load. Connection-level failures can still propagate.
indexed_count, index_errors = helpers.bulk(
    es_client,
    operations,
    index=index_name,
    raise_on_error=False,
    raise_on_exception=False,
)

for index_error in index_errors:
    print(index_error)

print(f"Indexed {indexed_count} of {len(operations)} documents")

In [13]:
# Free-text query to search for.
search_term = "windows or mac?"
# Embed the query with the same model that produced the documents' "text_vector" values.
vector_search_term = model.encode(search_term)

In [14]:
# kNN clause: search the "text_vector" field for the 5 nearest neighbours of
# the embedded query, considering up to 10000 candidates.
query = dict(
    field="text_vector",
    query_vector=vector_search_term,
    k=5,
    num_candidates=10000,
)

In [15]:
# Run the pure kNN search, asking only for the readable fields in `_source`.
source_fields = ["text", "section", "question", "course"]
res = es_client.search(
    index=index_name,
    knn=query,
    source=source_fields,
)

# Display the matching hits.
top_hits = res["hits"]["hits"]
top_hits

[{'_index': 'course-questions',
  '_id': 'LxBQvZUBMf6Mj1ANZaMU',
  '_score': 0.7061926,
  '_source': {'question': 'Environment - Is the course [Windows/macOS/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'Course Management Platform for Homeworks, Project and Certificate',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students in the 2024 cohort used all 3 OSes successfully.'}},
 {'_index': 'course-questions',
  '_id': 'lRBRvZUBMf6Mj1ANSaag',
  '_score': 0.61072767,
  '_source': {'question': 'WSL instructions',
   'course': 'mlops-zoomcamp',
   'section': 'Module 1: Introduction',
   'text': 'If you wish to use WSL on your windows machine, here are the setup instructions:\nCommand: Sudo apt install wget\nGet Anaconda download address here. wget <download address>\nTurn on Docker Desktop WFree Download | AnacondaSL2\nCommand: git clone <github repository address>\nVSCODE on WSL\nJupyter: pip3 install jupyter\nAdded by Gregory Morri

In [16]:
# kNN clause for the combined keyword + vector search: 5 nearest neighbours on
# "text_vector" for the embedded query, drawn from up to 10000 candidates.
knn_query = dict(
    field="text_vector",
    query_vector=vector_search_term,
    k=5,
    num_candidates=10000,
)

In [17]:
# Combined search: a full-text `match` on the "section" field together with
# the kNN clause, limited to 5 hits.
# NOTE(review): Elasticsearch presumably combines the two clauses as a
# disjunction and sums their scores - confirm against the kNN search docs.
response = es_client.search(
    index=index_name,
    query={
        "match": {"section": "General course-related questions"},
    },
    knn=knn_query,
    # Return only the readable fields, as the pure kNN search does; without
    # this every hit also carries its full 768-float "text_vector".
    source=["text", "section", "question", "course"],
    size=5,
)

In [18]:
# Display the hits returned by the combined match + kNN search.
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'CBBQvZUBMf6Mj1ANWqPL',
  '_score': 13.001286,
  '_source': {'text': "Data Engineering Zoomcamp FAQ\nMar 20, 2025-compopswork\nData Engineering Zoomcamp FAQ\nThe purpose of this document is to capture Frequently asked technical questions\ntexting, mailing\nStart to cry and for help\nCompoase the output code wghich  will be abdbleto take date analyse  sort   and give me the output, twa\nSdy for ext then grqphre\nAnd ready for reuse\nEditing guidelines:\nWhen adding a new FAQ entry, make sure the question is “Heading 2”\nFeel free to improve if you see something is off\nDon’t change the formatting in the Data document or add any visual “improvements” (make a copy for yourself first if you need to do it for whatever reason)\nDon’t change the pages format (it should be “pageless”)\nAdd name and date for reference, if possible\nThe next cohort starts January 13th 2025. More info at DTC.\nRegister before the course starts using this link.\nJoint the c