# Simple Semantic Search

Notes following the [3.2 video](https://www.youtube.com/watch?v=ptByfB_YcEg&ab_channel=DataTalksClub%E2%AC%9B).

# Step 0: Start Elasticsearch in Docker
```bash
docker run -it \
    --rm \
    --name elasticsearch \
    -m 4GB \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

# Step 1: Load Documents

In [None]:
# download documents
!wget -nc https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/01-intro/documents.json

# load documents.json and flatten them
import json
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

documents = []
for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

# Step 2: Create Embeddings using Pretrained Models

In [None]:
import numpy as np
# Re-create the `np.float_` alias, which was removed in NumPy 2.0, before the
# imports below run.
# NOTE(review): presumably one of the libraries used in this notebook still
# references `np.float_` — confirm which one and drop this shim once it is fixed.
np.float_ = np.float64

from sentence_transformers import SentenceTransformer
# Pretrained sentence-embedding model; used below for both the documents and
# the search query so that their vectors are comparable.
model = SentenceTransformer("all-mpnet-base-v2")
# Smoke test: encode one sentence (as the last expression of the cell, the
# resulting vector is displayed).
model.encode('simple sentence')

In [None]:
# create embeddings for the documents
# Encode every text in one call so the model can batch the work internally,
# instead of paying the per-call overhead once per document.
texts = [doc["text"] for doc in documents]
text_vectors = model.encode(texts)

# Attach each vector to its document (in place) as a plain list of floats so
# it is JSON-serialisable; `operations` holds the same dict objects as
# `documents`, in the same order.
operations = []
for doc, text_vector in zip(documents, text_vectors):
    doc["text_vector"] = text_vector.tolist()
    operations.append(doc)

# Steps 3 & 4: Create an Elasticsearch Index and Add Documents

In [None]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

# Client for the local single-node instance listening on port 9200.
es_client = Elasticsearch('http://localhost:9200')

# create mapping
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            # `dims` has to equal the length of the vectors stored in
            # `text_vector`; `index: True` makes the field searchable via kNN.
            "text_vector": {
                "type": "dense_vector",
                "dims": 768,
                "index": True,
                "similarity": "cosine",
            },
        }
    },
}

# create index
index_name = "course-questions"

# Drop any previous copy of the index first so this cell can be re-run;
# ignore_unavailable avoids an error when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
# Pass settings and mappings as their own keyword arguments, matching the
# keyword-style calls used elsewhere in this notebook (document=, knn=, source=).
es_client.indices.create(
    index=index_name,
    settings=index_settings["settings"],
    mappings=index_settings["mappings"],
)

# add documents to index
for doc in tqdm(operations):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        # Best-effort load: report the failure and carry on with the
        # remaining documents instead of aborting the whole run.
        print(e)

# Step 5: Create end user query

In [None]:
search_term = 'windows or mac?'
# Embed the query with the same model that produced the document vectors.
search_term_vector = model.encode(search_term)

# kNN search parameters: return the 5 nearest documents, considering up to
# 1000 candidates.
query = {
    "field": "text_vector",
    # Send a plain list of floats (as is done for the document vectors) rather
    # than relying on the client being able to serialise a NumPy array.
    "query_vector": search_term_vector.tolist(),
    "k": 5,
    "num_candidates": 1000,
}

# Restrict the returned _source to the readable fields; the stored embedding
# is left out of the response.
res = es_client.search(
    index=index_name,
    knn=query,
    source=["text", "section", "question", "course"],
)

res["hits"]["hits"]