In [None]:
#!pip install elasticsearch transformers torch sentence-transformers numpy 
# !pip install elasticsearch==8.13.0
!pip uninstall elasticsearch

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np




import os

# The token is read from the environment rather than from a notebook variable:
# HUGGING_FACE_TOKEN was never assigned in this notebook, so this cell raised
# NameError on a fresh kernel, and a token must not be hard-coded in a notebook.
# When the variable is unset this is None; the query cell at the end of this
# notebook loads the same model without passing any token.
HUGGING_FACE_TOKEN = os.environ.get("HUGGING_FACE_TOKEN")

# Embedding model shared by the chunk-embedding code below.
model = SentenceTransformer(
    'intfloat/multilingual-e5-base',
    use_auth_token=HUGGING_FACE_TOKEN
)


def chunk_text(text, max_length=400):
    """Split ``text`` into space-joined chunks with a bounded character budget.

    The text is tokenised with ``str.split()`` and the words are packed
    greedily. Every word is charged ``len(word) + 1`` against the budget (the
    extra character stands for the separating space); a new chunk is started
    as soon as adding the next word would push the running total above
    ``max_length``.

    A word whose own charge exceeds ``max_length`` is never split: it becomes
    a chunk by itself. No empty chunk is emitted in that case, including when
    such a word is the very first one in the text.

    Args:
        text: The text to split.
        max_length: Character budget per chunk (default 400).

    Returns:
        A list of non-empty strings, each made of words joined by single
        spaces. Empty or whitespace-only input yields an empty list.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for word in text.split():
        cost = len(word) + 1
        # Flush only when there is something to flush; otherwise an oversized
        # first word would produce a spurious '' chunk.
        if current_chunk and current_length + cost > max_length:
            chunks.append(' '.join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(word)
        current_length += cost

    if current_chunk:
        chunks.append(' '.join(current_chunk))
    return chunks

# Read the parsed markdown document that will be embedded.
with open('parsed-doc/Sukanya Samriddhi Account Scheme 2019 English (1)-with-image-links.md', 'r', encoding='utf-8') as file:
    text = file.read()

chunks = chunk_text(text)
if not chunks:
    # Fail early with a clear message; otherwise the summary at the end of this
    # cell would die with an opaque IndexError on embeddings[0].
    raise ValueError("The markdown file produced no chunks; nothing to embed.")

# Each chunk is prefixed with "passage: " (the search cell prefixes its inputs
# with "query: "). All chunks are handed to the model in a single encode() call
# instead of calling it once per chunk; .tolist() keeps `embeddings` a plain
# list of lists of floats, one inner list per chunk.
embeddings = model.encode(
    [f"passage: {chunk}" for chunk in chunks],
    normalize_embeddings=True,
).tolist()

# Persist the vectors so the indexing cell can reload them from disk.
np.save('embeddings.npy', embeddings)

print(f"Number of chunks: {len(chunks)}")
print(f"Sample embedding dimension: {len(embeddings[0])}")

In [None]:
from elasticsearch import Elasticsearch


# Client for the local single-node Elasticsearch instance.
es = Elasticsearch(["http://localhost:9200"])

INDEX_NAME = "markdown_vectors"
EMBEDDING_DIMS = 768

# Index schema: the chunk text, its embedding vector (indexed for kNN search
# with cosine similarity) and a language tag.
mappings = {
    "properties": {
        "text": {"type": "text"},
        "embedding": {
            "type": "dense_vector",
            "dims": EMBEDDING_DIMS,
            "index": True,
            "similarity": "cosine"
        },
        "language": {"type": "keyword"}
    }
}

# Check for the index explicitly instead of passing ignore=400 to create():
# ignoring every HTTP 400 also hides a rejected mapping, and the success
# message used to be printed even when nothing had been created.
if es.indices.exists(index=INDEX_NAME):
    print(f"Index '{INDEX_NAME}' already exists; leaving it unchanged.")
else:
    es.indices.create(index=INDEX_NAME, body={"mappings": mappings})
    print(f"Index '{INDEX_NAME}' created with {EMBEDDING_DIMS}-dimensional dense_vector field.")

In [None]:
from elasticsearch import Elasticsearch


from elasticsearch import helpers

# Client for the local single-node Elasticsearch instance.
es = Elasticsearch(["http://localhost:9200"])

# Vectors written earlier by the embedding cell.
embeddings = np.load('embeddings.npy')

# Re-create the chunks from the same document so they can be paired with the
# stored vectors by position.
with open('parsed-doc/Sukanya Samriddhi Account Scheme 2019 English (1)-with-image-links.md', 'r', encoding='utf-8') as file:
    text = file.read()
chunks = chunk_text(text)

# zip() would silently drop the surplus if embeddings.npy were stale, pairing
# chunks with the wrong vectors and reporting a wrong document count.
if len(chunks) != len(embeddings):
    raise ValueError(
        f"embeddings.npy holds {len(embeddings)} vectors but the document has "
        f"{len(chunks)} chunks; regenerate the embeddings before indexing."
    )


def _detect_language(chunk):
    """Return "hindi" if ``chunk`` contains a Devanagari character, else "english".

    Testing for the Devanagari block (U+0900-U+097F) avoids tagging English
    text as Hindi merely because it contains some other non-ASCII character,
    such as a currency sign or a typographic quote.
    """
    return "hindi" if any('\u0900' <= ch <= '\u097f' for ch in chunk) else "english"


# One bulk action per chunk; document ids start at 1, following chunk order.
actions = (
    {
        "_index": "markdown_vectors",
        "_id": i + 1,
        "_source": {
            "text": chunk,
            "embedding": embedding.tolist(),
            "language": _detect_language(chunk),
        },
    }
    for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
)

# Send the documents through the bulk helper instead of one request per
# document; it returns the number of successfully executed actions first.
indexed, _ = helpers.bulk(es, actions)

print(f"Indexed {indexed} documents into 'markdown_vectors'.")

In [None]:
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch


# Embedding model for the queries and a client for the local Elasticsearch node.
model = SentenceTransformer('intfloat/multilingual-e5-base')
es = Elasticsearch(["http://localhost:9200"])

# English and Hindi test queries; every entry carries the "query: " prefix.
queries = [
    "query: What is the Sukanya Samriddhi Account Scheme?",
    "query: Who can open a Sukanya Samriddhi account?",
    "query: What is the interest rate for Sukanya Samriddhi account?",
    "query: How to withdraw money from Sukanya Samriddhi account?",
    "query: सुकन्या समृद्धि खाते की ब्याज दर क्या है?",
    "query: सुकन्या समृद्धि खाता कब बंद किया जा सकता है?",
    "query: Kavya Kapoor's first post",
    "query: Haqdarshak agent training details"
]

# For each query: embed it, run a kNN search on the "embedding" field
# (k=2, num_candidates=10) and print each hit's score with a 120-character
# preview of its text.
for q in queries:
    query_embedding = model.encode(q, normalize_embeddings=True).tolist()
    knn_clause = dict(
        field="embedding",
        query_vector=query_embedding,
        k=2,
        num_candidates=10,
    )
    response = es.search(index="markdown_vectors", body={"knn": knn_clause})

    print(f"\nSearch results for: {q}")
    for hit in response["hits"]["hits"]:
        preview = hit["_source"]["text"][:120]
        # The three parts are joined with newlines: same output as three prints.
        print(f"Score: {hit['_score']}", f"Text: {preview}...", "\n\n---", sep="\n")
