In [1]:
import json
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [2]:
documents = []

for c_d in docs_raw:
    for doc in c_d['documents']:
        doc['course'] = c_d['course']
        documents.append(doc)

In [3]:
from sentence_transformers import SentenceTransformer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
model = SentenceTransformer("all-mpnet-base-v2")

In [5]:
model.encode("This is a simple sentence")



array([ 4.44873376e-03, -7.61314183e-02, -3.77453980e-04,  7.52526894e-03,
       -3.80979292e-02,  3.80131155e-02, -9.73011088e-03, -5.05400449e-03,
       -9.37979762e-03,  1.23888031e-02,  4.91276830e-02,  1.52209625e-02,
        3.80008109e-02, -6.41802624e-02,  9.42126289e-03, -5.19748852e-02,
        9.08066258e-02,  1.71115380e-02,  1.62125621e-02,  2.98866071e-02,
        1.50540820e-03,  8.35080538e-03,  3.78842093e-02, -1.01192864e-02,
        6.46108063e-03,  3.97538788e-05, -1.45217525e-02, -1.88468471e-02,
       -3.74039710e-02, -1.51666044e-03, -1.02680456e-02, -3.68062854e-02,
        2.36677546e-02, -6.46023080e-02,  1.96967039e-06, -5.01107750e-03,
       -2.80828006e-03, -1.92073788e-02, -8.65119994e-02,  2.83464752e-02,
       -5.38667291e-02,  3.63705717e-02, -2.26468258e-02,  2.87367776e-02,
       -1.32342391e-02,  1.08689599e-01,  3.70518900e-02,  3.38802189e-02,
       -5.30679263e-02,  3.61782722e-02, -1.35724701e-03, -3.63483280e-02,
       -2.78346594e-02, -

In [6]:
print(len(_))

768


In [7]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
operations = []
for doc in documents:
    # transfrom the title into an embedding
    doc["text_vector"] = model.encode(doc['text']).tolist()
    operations.append(doc)

In [9]:
operations[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'text_vector': [-0.04103041812777519,
  0.025834158062934875,
  -0.03680184856057167,
  -0.020898327231407166,
  -0.020596275106072426,
  0.009353749454021454,
  -0.0033317022025585175,
  -0.009491887874901295,
  0.030117960646748543,
  0.01908211223781109,
  0.012690017931163311,
  -0.01707877218723297,
  -0.0016324581811204553,
  0.12997248768806458,
  0.03096918575465679,
  -0.025823727250099182,
  0.02782304398715496,
  0.02515977807343006,
  -0.0808122381567955,
  -0.003617348847910762,
  -0.008902021683752537,
  0.003404838964343071,
  -0.023009272292256355,
  -0.034045297652482986,
  0.02459859289228916,
  0.013545563444495201,
  -0.025439007207751274,
  0.011951071210205555,
  -0.020540110766887665,
  -0.010077393613755703,
  0.020575352013111115,
 

In [12]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch(
    ['http://localhost:9200'],
    verify_certs=False,  # Set to True if you have a valid certificate
    ssl_show_warn=False  # Optional: Suppresses SSL warnings
)

# Test the connection
try:
    info = es_client.info()
    print(info)
except Exception as e:
    print(f"Error connecting to Elasticsearch: {e}")

{'name': 'b87d50e98ecd', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'n5M-slVMQ-GK-6lRF9GAGA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [13]:
index_name = "course_questions"
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "text_vector": {
                "type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"} 
        }
    }
}

In [14]:
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course_questions'})

In [15]:
for doc in operations:
    try:
        es_client.index(index=index_name, document=doc)
    except:
        print(e)

## Basic Semantic search

In [16]:
search_term = "windoes or mac?"
vector_search_term = model.encode(search_term)

In [17]:
query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}

In [18]:
res = es_client.search(index=index_name, knn=query, source=["text", "section", "question", "course"])
res["hits"]["hits"]

[{'_index': 'course_questions',
  '_id': 'wcKXq5ABMRiz3UObYCyO',
  '_score': 0.69488895,
  '_source': {'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'section': 'General course-related questions',
   'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully'}},
 {'_index': 'course_questions',
  '_id': 'S8KXq5ABMRiz3UObei8d',
  '_score': 0.6090768,
  '_source': {'question': 'What if your accuracy and std training loss don’t match HW?',
   'course': 'machine-learning-zoomcamp',
   'section': '8. Neural Networks and Deep Learning',
   'text': 'Problem:\nI found running the wasp/bee model on my mac laptop had higher reported accuracy and lower std deviation than the HW answers. This may be because of the SGD optimizer. Running this on my mac printed a message about a new and legacy version that could be used.\nSolution:\nTry running the same code on google col

## Textual search

In [21]:
response = es_client.search(
    index=index_name,
    query={
        "bool": {
            "must": {
                "multi_match": {
                    "query": "windows or python?",
                    "fields": ["text", "question", "course", "title"],
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
)

In [25]:
knn_query = {
    "field": "text_vector",
    "query_vector": vector_search_term,
    "k": 5,
    "num_candidates": 10000,
}

response = es_client.search(
    index=index_name,
    query={
        "match": {
            "course": "data-engineering-zoomcamp",
        },
    },
    knn=knn_query,
    size=5,
    explain=True,
)

In [26]:
response["hits"]["hits"]

[{'_shard': '[course_questions][0]',
  '_node': '1rdKhqTWR0G6LcCHl4OEaw',
  '_index': 'course_questions',
  '_id': 'wcKXq5ABMRiz3UObYCyO',
  '_score': 1.4738029,
  '_source': {'text': 'Yes! Linux is ideal but technically it should not matter. Students last year used all 3 OSes successfully',
   'section': 'General course-related questions',
   'question': 'Environment - Is the course [Windows/mac/Linux/...] friendly?',
   'course': 'data-engineering-zoomcamp',
   'text_vector': [-0.026965469121932983,
    -0.0006261463859118521,
    -0.016629496589303017,
    0.052851490676403046,
    0.054765257984399796,
    -0.03133990615606308,
    0.02994258515536785,
    -0.04808564856648445,
    0.04467553272843361,
    0.00583944097161293,
    0.01623307541012764,
    0.012001145631074905,
    -0.031222272664308548,
    0.016600526869297028,
    -0.048869017511606216,
    -0.06496305763721466,
    0.04643421620130539,
    -0.009297734126448631,
    -0.06425285339355469,
    -0.01373270619660616