In [None]:
import glob
import json
import os

import requests
from tqdm.notebook import tqdm

In [None]:
# Where the cluster is reached and which index the rest of the notebook uses.
es_url = "http://elasticsearch:9200"
index_name = "documents"

# A named BM25 similarity with explicit parameter values; it is registered in
# the index settings so that a field mapping can refer to it by name.
custom_bm25 = {
    "type": "BM25",
    "k1": 1.5,
    "b": 0.75,
    "discount_overlaps": True,
}

# Request body: the settings register the similarity, the mappings attach it
# to the "content" text field.
index_config = {
    "settings": {"index": {"similarity": {"custom_bm25": custom_bm25}}},
    "mappings": {
        "properties": {
            "content": {"type": "text", "similarity": "custom_bm25"},
        }
    },
}

# PUT the configuration, serialized as JSON, to the index URL.
index_url = f"{es_url}/{index_name}"
json_headers = {"Content-Type": "application/json"}
response = requests.put(index_url, headers=json_headers, data=json.dumps(index_config))

# Report what the server answered.
print(f"Status Code: {response.status_code}")
print(f"Response Body: {response.text}")

In [None]:
# List the entries directly under the docs directory. Without
# `recursive=True`, glob treats "**" like "*": subdirectories are not descended
# into, but the directory entries themselves are still returned. Those are
# filtered out here because open() raises IsADirectoryError on them.
# Sorting makes the upload order (and therefore which file wins when two files
# map to the same id) reproducible between runs.
files = sorted(
    path for path in glob.glob("/home/jovyan/data/docs/**") if os.path.isfile(path)
)

# One (document id, HTTP status, response body) tuple per rejected upload.
failed_uploads = []

for file in tqdm(files):

    # Document id = file name up to its first dot ("a.b.txt" -> "a").
    id_ = file.split("/")[-1].split(".")[0]
    # Decode explicitly so the text does not depend on the kernel's locale.
    with open(file, "r", encoding="utf-8") as f:
        text = f.read()

    response = requests.post(
        f"{es_url}/{index_name}/_doc/{id_}",
        headers={"Content-Type": "application/json"},
        data=json.dumps({"content": text}),
    )
    # Record failures instead of silently moving on to the next file.
    if not response.ok:
        failed_uploads.append((id_, response.status_code, response.text))

print(f"Indexed {len(files) - len(failed_uploads)} of {len(files)} files")
# Show only the first few failures to keep the cell output readable; the full
# list stays available in `failed_uploads`.
for failed_id, status_code, body in failed_uploads[:5]:
    print(f"Failed to index {failed_id!r}: HTTP {status_code}: {body}")


In [None]:
# Display the raw body of the HTTP response currently held in `response`.
response.text


In [None]:
# Three short example texts, each wrapped as a document with a "content" field.
sample_texts = [
    "Elasticsearch provides powerful search capabilities.",
    "BM25 is a ranking function used in information retrieval.",
    "Customizing BM25 parameters can improve search relevance.",
]
documents = [{"content": sample_text} for sample_text in sample_texts]

# Post each document under the ids 1, 2, 3 and print the status of every request.
json_headers = {"Content-Type": "application/json"}
for doc_id, payload in enumerate(documents, start=1):
    doc_url = f"{es_url}/{index_name}/_doc/{doc_id}"
    response = requests.post(doc_url, headers=json_headers, data=json.dumps(payload))
    print(f"Indexed document {doc_id}: Status Code {response.status_code}")

In [None]:
# Match query against the "content" field.
match_clause = {"content": "search capabilities"}
search_query = {"query": {"match": match_clause}}

# Send the query, serialized as JSON, to the index's _search endpoint.
search_url = f"{es_url}/{index_name}/_search"
response = requests.get(
    search_url,
    headers={"Content-Type": "application/json"},
    data=json.dumps(search_query),
)

# Print the HTTP status followed by the raw response body.
print(f"Search Status Code: {response.status_code}")
print(f"Search Results:\n{response.text}")

In [None]:
# Parse the body of the response currently held in `response` as JSON and display it.
response.json()