In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)


In [2]:
from sentence_transformers import SentenceTransformer



In [3]:
# Load https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer("all-mpnet-base-v2")


In [4]:
embeddings = model.encode("The weather is lovely today")

In [5]:
len(embeddings)

768

In [6]:
operations = []
for doc in documents:
    doc["text_vector"] = model.encode(doc["text"]).tolist()
    operations.append(doc)

In [7]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch(
    hosts=["http://localhost:9200"],    # timeout côté client
)

In [8]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
            
        }
    }
}



In [9]:
index_name = "course-questions"
es_client.indices.delete(index=index_name, ignore_unavailable=True)

es_client.indices.create(index=index_name, body=index_settings, request_timeout=90)

  es_client.indices.create(index=index_name, body=index_settings, request_timeout=90)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
from tqdm.auto import tqdm
for doc in tqdm(operations):
    try:
        
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(f"Error indexing document {doc['question']}: {e}")


  0%|          | 0/948 [00:00<?, ?it/s]

In [11]:
search_term = "I just joined the course"
vector_search_term = model.encode(search_term)

# semantic search

In [12]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 1000
}

In [13]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])

In [14]:
res["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'OA2TJpcBuAv6xyZvEPU4',
  '_score': 0.776579,
  '_source': {'question': 'Is it going to be live? When?',
   'course': 'machine-learning-zoomcamp',
   'section': 'General course-related questions',
   'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.'}},
 {'_index': 'course-questions',
  '_id': 'vQ2TJpcBuAv6xyZvF_bD',
  '_score': 0.75693864,
  '_source': {'question': 'IBM Cloud an alternative for AWS',
   'course': 'mlops-zoomcamp',
   'section': 'Module 1: Introduction',
   'text': 'You can get invitation code by coursera and use it in account to verify it it has different characteristics.\nI really love it\nhttps://www.youtube.com/watch?v=h_GdX6KtXjo'}},
 {'_in

In [15]:
response = es_client.search(
    index=index_name,
    query  = {
        "bool":{
            "must":{
                "multi_match":
                    {
                        "query":"windows or pythons",
                        "fields":["text", "question", "course", "title"],
                        "type": "best_fields"
                    }
            },
            "filter":{
                "term":{
                    "course":"data-engineering-zoomcamp"
                }
            }
        }
    }
)

In [None]:
response["hits"]["hits"]

[{'_index': 'course-questions',
  '_id': 'vQ2TJpcBuAv6xyZvB_PK',
  '_score': 5.87179,
  '_source': {'text': "It is recommended by the Docker do\n[Windows 10 / 11 Home Edition] If you're running a Home Edition, you can still make it work with WSL2 (Windows Subsystem for Linux) by following the tutorial here\nIf even after making sure your WSL2 (or Hyper-V) is set up accordingly, Docker remains stuck, you can try the option to Reset to Factory Defaults or do a fresh install.",
   'section': 'Module 1: Docker and Terraform',
   'question': 'Should I run docker commands from the windows file system or a file system of a Linux distribution in WSL?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.08411327749490738,
    0.004404706880450249,
    0.013582300394773483,
    -0.06397324800491333,
    -0.0015681928489357233,
    0.01988597773015499,
    -0.016451604664325714,
    -0.0013373984256759286,
    0.03985007107257843,
    0.03357085585594177,
    0.035942789167165756,
  

# advanced semantic search

In [21]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k":5,
    "num_candidates":10000
}

response = es_client.search(
    index = index_name,
    query={
        
        "match": {
            "course": "data-engineering-zoomcamp"
        },
    },
    knn=knn_query,
    size = 5,
    explain=True
    
)

In [22]:
response["hits"]["hits"]

[{'_shard': '[course-questions][0]',
  '_node': 'wW926Ec0To6eOqOjmhuoBA',
  '_index': 'course-questions',
  '_id': 'hA2TJpcBuAv6xyZvBfPX',
  '_score': 1.5158405,
  '_source': {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
   'section': 'General course-related questions',
   'question': 'Course - When will the course start?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.03570346161723137,
    -0.06891428679227829,
    -0.04448342323303223,
    0.006250153295695782,
    -0.042247094213962555,
    -0.00645399559289217,
    0.02754814364016056,
    