#### Step 1 

In [4]:
import json 

In [2]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []

for course_dict in docs_raw:
    # course: de zoomcamp -> q/a list 
    # Noramlize the course dict as list of doc
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [11]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

#### Step 2 

In [6]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting transformers<5.0.0,>=4.34.0 (from sentence-transformers)
  Downloading transformers-4.42.4-py3-none-any.whl.metadata (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.3.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting huggingface-hub>=0.15.1 (from sentence-transformers)
  Downloading huggingface_hub-0.24.0-py3-none-any.whl.metadata (13 kB)
Collecting Pillow (from sentence-transformers)
  Downloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting filelock (from huggingface-hub>=0.15.1->sentence-transformers)
  Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec>=2023.5.0 (from huggingface-hub>=0.15.1->sentence-transformers)
  Do

In [8]:
from sentence_transformers import SentenceTransformer 

model = SentenceTransformer("all-MiniLM-L6-v2")


In [9]:

# The sentences to encode
sentences = [
    "The weather is lovely today.",
    "It's so sunny outside!",
    "He drove to the stadium.",
]

# 2. Calculate embeddings by calling model.encode()
embeddings = model.encode(sentences)
print(embeddings.shape)
# [3, 384]


(3, 384)


In [10]:
embeddings

array([[ 0.01919578,  0.1200854 ,  0.1595983 , ..., -0.00536285,
        -0.08109501,  0.05021339],
       [-0.0186904 ,  0.04151868,  0.07431547, ...,  0.00486597,
        -0.06190437,  0.03187514],
       [ 0.13650201,  0.08227322, -0.02526164, ...,  0.08762042,
         0.03045843, -0.01075752]], dtype=float32)

In [12]:
from tqdm.auto import tqdm
operations = []
for doc in tqdm(documents):
    doc['text_vector'] = model.encode(doc['text']).tolist()
    operations.append(doc)

100%|███████████████████████████████████████████████████████████████| 948/948 [00:45<00:00, 20.73it/s]


In [13]:
operations[0]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.056110020726919174,
  -0.002078067744150758,
  0.03237273916602135,
  -0.006383475847542286,
  -0.029120661318302155,
  -0.03931581601500511,
  -0.08001381158828735,
  -0.024460505694150925,
  -0.0665355697274208,
  -0.0018096420681104064,
  -0.03212626278400421,
  -0.008774392306804657,
  -0.044479601085186005,
  -0.03446931391954422,
  0.026010317727923393,


#### Step 3 

In [31]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 
es_client.info()


ObjectApiResponse({'name': 'bafc79caf2bb', 'cluster_name': 'docker-cluster', 'cluster_uuid': '8nQ615qaQLesDqWPp4bgaA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

#### Step 4

In [37]:
index_setting = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_vector":{"type": "dense_vector", "dims": 384, "index": True, "similarity": "cosine"}
        }
    }
}

index_name = 'course-questions'

es_client.indices.delete(index=index_name,ignore_unavailable=True)
es_client.indices.create(index=index_name,body=index_setting)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

#### Step 5

In [38]:
from tqdm.auto import tqdm
for doc in tqdm(operations):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(e)


100%|███████████████████████████████████████████████████████████████| 948/948 [00:24<00:00, 38.95it/s]


#### Step 6 

In [39]:
search_term = 'windows or mac?'
vector_search_term = model.encode(search_term)

In [40]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000
}

In [41]:
res = es_client.search(
    index=index_name,
    knn=query,
    source=['text', 'section', 'question', 'course']
)

res['hits']['hits']

[{'_index': 'course-questions',
  '_id': 'pJaT3ZABBlcff5rxxxg7',
  '_score': 0.7125963,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course-questions',
  '_id': 'P5aT3ZABBlcff5rx8Bob',
  '_score': 0.6609763,
  '_source': {'question': "Any particular hardware requirements for the course or everything is mostly cloud? TIA! Couldn't really find this in the FAQ.",
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'For the Machine Learning part, all you need is a working laptop with an internet connection. The Deep Learning part is more resource intensive, but for that you can use a cloud (we use Saturn cloud but can be anything else).\n(Rileen Sinha; based on response b

#### Step 6

In [44]:
response = es_client.search(
    index=index_name,
    query = {
        'bool': {
            'must': {
                'multi_match':
                    {
                        'query': 'windows or python?',
                        'fields': ['text', 'question', 'course', 'title'],
                        'type': 'best_fields'
                    }
            },
            'filter': {
                'term': {
                    'course': 'data-engineering-zoomcamp'
                }
            }
        }
    }
)
res['hits']['hits']

[{'_index': 'course-questions',
  '_id': 'pJaT3ZABBlcff5rxxxg7',
  '_score': 0.7125963,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course-questions',
  '_id': 'P5aT3ZABBlcff5rx8Bob',
  '_score': 0.6609763,
  '_source': {'question': "Any particular hardware requirements for the course or everything is mostly cloud? TIA! Couldn't really find this in the FAQ.",
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'For the Machine Learning part, all you need is a working laptop with an internet connection. The Deep Learning part is more resource intensive, but for that you can use a cloud (we use Saturn cloud but can be anything else).\n(Rileen Sinha; based on response b

In [49]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
    "filter": {
      "term": {
        'course': 'data-engineering-zoomcamp'
      }
    }
}

In [50]:
res = es_client.search(
    index=index_name,
    knn=query,
    source=['text', 'section', 'question', 'course'],
    size=5,
    explain=True
)

res['hits']['hits']

[{'_shard': '[course-questions][0]',
  '_node': 'i1quItNYTzqNniLgB2eFjQ',
  '_index': 'course-questions',
  '_id': 'pJaT3ZABBlcff5rxxxg7',
  '_score': 0.7125963,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'},
  '_explanation': {'value': 0.0,
   'description': 'not in top k documents',
   'details': []}},
 {'_shard': '[course-questions][0]',
  '_node': 'i1quItNYTzqNniLgB2eFjQ',
  '_index': 'course-questions',
  '_id': 'lZaT3ZABBlcff5rxxRjD',
  '_score': 0.6373058,
  '_source': {'question': 'Environment - Should I use my local machine, GCP, or GitHub Codespaces for my environment?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'You can set it up on your laptop or PC if you prefe