In [18]:
import warnings
# Silence all Python warnings from here on: no category or module filter is
# given, so deprecation notices from the libraries used below are hidden too.
warnings.filterwarnings('ignore')

In [19]:
import os
from llama_index.core import SimpleDirectoryReader

In [20]:
# Groq(...) is constructed further down without an explicit api_key, so the key
# has to be present in the environment. Copying os.getenv(...) back into
# os.environ changes nothing when the variable exists and raises an opaque
# "TypeError: str expected, not NoneType" when it does not, so check explicitly
# and fail early with an actionable message.
if not os.environ.get("GROQ_API_KEY"):
    raise EnvironmentError(
        "GROQ_API_KEY is not set. Export it in your shell (or load it from a "
        ".env file) before running this notebook."
    )

In [21]:
# Load the source PDF; load_data() yields a list of Document objects
# (see the type/length check in the next cell).
pdf_reader = SimpleDirectoryReader(input_files=["./data/kanha_122mm0924.pdf"])
documents = pdf_reader.load_data()

In [22]:
# Sanity-check the loader output: container type, number of documents,
# and the type of the first element.
for fact in (type(documents), len(documents)):
    print(fact, "\n")
print(type(documents[0]))


<class 'list'> 

1 

<class 'llama_index.core.schema.Document'>


In [23]:
from llama_index.core import Document

# Concatenate the text of every loaded document (separated by a blank line)
# into a single Document, then report the merged length in characters.
merged_text = "\n\n".join(part.text for part in documents)
document = Document(text=merged_text)
print(len(document.text))

3385


SENTENCE WINDOW RETRIEVAL


In [24]:
from llama_index.core.node_parser import SentenceWindowNodeParser

# Node parser used for sentence-window retrieval.
# NOTE(review): window_size presumably counts the neighbouring sentences kept
# on each side of a sentence — confirm against the LlamaIndex docs.
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,
    # The same "window" key is what MetadataReplacementPostProcessor is
    # pointed at further down, so the two must stay in sync.
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

In [25]:

from llama_index.llms.groq import Groq

# LLM served through Groq. No api_key argument is passed here, so the key is
# expected to come from the environment (GROQ_API_KEY, handled near the top).
# NOTE(review): hosted model IDs get retired over time — confirm that
# "llama3-70b-8192" is still served before re-running.
# NOTE(review): some recorded answers below stop mid-word; if that is a token
# limit rather than an export artifact, consider setting max_tokens — TODO confirm.
llm = Groq(model="llama3-70b-8192", temperature=0.1)


In [26]:
from llama_index.core import VectorStoreIndex
from llama_index.embeddings.huggingface import HuggingFaceEmbedding 
# Hugging Face embedding model "BAAI/bge-small-en-v1.5"; it is installed as
# Settings.embed_model in the next cell.
embedding_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")





In [27]:
import os
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.settings import Settings

# Register the LLM, embedding model and sentence-window parser as the global
# LlamaIndex defaults; the index below is created without passing them
# explicitly and relies on these settings.
Settings.llm = llm
Settings.embed_model = embedding_model
Settings.node_parser = node_parser

# Directory used as the on-disk cache for the index.
index_path = "./sentence_index"

if not os.path.exists(index_path):
    # First run: index the single merged `document` (not the per-file
    # `documents` list), matching what is passed to
    # build_sentence_window_index later in the notebook, then persist it.
    index = VectorStoreIndex.from_documents([document])
    index.storage_context.persist(persist_dir=index_path)
else:
    # Later runs: reuse whatever was persisted. The cache is keyed on the path
    # only, so delete the directory after changing the source file or the
    # parser/embedding settings to force a rebuild.
    storage_context = StorageContext.from_defaults(persist_dir=index_path)
    index = load_index_from_storage(storage_context)


In [28]:
from llama_index.core.indices.postprocessor import MetadataReplacementPostProcessor

# Post-processor targeting the "window" metadata key — the same key the
# sentence-window node parser was configured with above.
# NOTE(review): presumably swaps each retrieved node's text for that metadata
# value before the answer is synthesised — confirm against the LlamaIndex docs.
postproc = MetadataReplacementPostProcessor(
    target_metadata_key="window"
)

In [29]:
from llama_index.core.indices.postprocessor import SentenceTransformerRerank

# Reranker model: BAAI/bge-reranker-base
# link: https://huggingface.co/BAAI/bge-reranker-base
# NOTE(review): top_n=2 presumably keeps only the two best-scoring nodes after
# reranking — confirm against the LlamaIndex docs.
rerank = SentenceTransformerRerank(
    top_n=2, model="BAAI/bge-reranker-base"
)

In [30]:
# Query engine over the sentence index with similarity_top_k=6 and two
# post-processors, listed as window replacement first and reranker second.
# NOTE(review): post-processors are presumably applied in list order, which
# would mean the reranker scores the window text — confirm.
sentence_window_engine = index.as_query_engine(
    similarity_top_k=6, node_postprocessors=[postproc, rerank]
)

In [31]:
# Smoke-test question against the sentence-window engine; the response object
# is printed in the next cell.
window_response = sentence_window_engine.query(
    "what is kanhaiya's educational qualification"
)

In [32]:
# Show the answer produced for the query in the previous cell.
print(window_response)

Kanhaiya's educational qualifications are Bachelor of Technology in Metallurgical and Materials Engineering with a CGPA of 7.24 from National Institute of Technology, Rourkela, and he is currently in Diploma Level of B.Sc. in Data Science from Indian Institute of Technology, Madras.


PUTTING EVERYTHING TOGETHER AND USING LLM 


In [33]:
import os
from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor, SentenceTransformerRerank
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.settings import Settings


def build_sentence_window_index(
    documents,
    llm,
    sentence_window_size=3,
    save_dir="sentence_index",
    embed_model_name="BAAI/bge-small-en-v1.5",
):
    """Build a sentence-window vector index, or reload it from ``save_dir``.

    Side effect: overwrites the global ``Settings.llm``,
    ``Settings.embed_model`` and ``Settings.node_parser`` on every call.

    Caching caveat: if ``save_dir`` already exists, the index stored there is
    loaded as-is and returned; ``documents`` and ``sentence_window_size`` are
    not used to build anything in that case. Remove the directory (or pass a
    different ``save_dir``) to rebuild with new inputs.

    Args:
        documents: Documents to index when no persisted index exists.
        llm: LLM instance installed as ``Settings.llm``.
        sentence_window_size: ``window_size`` handed to
            ``SentenceWindowNodeParser.from_defaults``.
        save_dir: Directory the index is persisted to / loaded from.
        embed_model_name: Hugging Face model name used for embeddings.

    Returns:
        The freshly built index, or the one loaded from ``save_dir``.
    """
    # Apply global settings (replaces ServiceContext). These are set on both
    # the build and the load path, as later query engines rely on them.
    Settings.llm = llm
    Settings.embed_model = HuggingFaceEmbedding(model_name=embed_model_name)
    Settings.node_parser = SentenceWindowNodeParser.from_defaults(
        window_size=sentence_window_size,
        window_metadata_key="window",
        original_text_metadata_key="original_text",
    )

    # Reuse a previously persisted index when one is present.
    if os.path.exists(save_dir):
        storage_context = StorageContext.from_defaults(persist_dir=save_dir)
        return load_index_from_storage(storage_context)

    # Otherwise build from the given documents and persist for next time.
    index = VectorStoreIndex.from_documents(documents)
    index.storage_context.persist(persist_dir=save_dir)
    return index


def get_sentence_window_query_engine(
    index,
    similarity_top_k=6,
    rerank_top_n=2,
    rerank_model="BAAI/bge-reranker-base",
):
    """Create a query engine with window replacement and reranking attached.

    Args:
        index: Index object exposing ``as_query_engine``.
        similarity_top_k: Passed through as ``similarity_top_k`` to
            ``index.as_query_engine``.
        rerank_top_n: ``top_n`` given to ``SentenceTransformerRerank``.
        rerank_model: Model name given to ``SentenceTransformerRerank``.

    Returns:
        The query engine returned by ``index.as_query_engine``.
    """
    # Targets the "window" metadata key, i.e. the key the sentence-window
    # node parser is configured with when the index is built.
    window_postprocessor = MetadataReplacementPostProcessor(
        target_metadata_key="window"
    )
    reranker = SentenceTransformerRerank(top_n=rerank_top_n, model=rerank_model)

    # Order matters to the caller's results: window replacement is listed
    # before the reranker.
    return index.as_query_engine(
        similarity_top_k=similarity_top_k,
        node_postprocessors=[window_postprocessor, reranker],
    )


In [34]:
# "./sentence_index" has already been persisted by the step-by-step section
# above (built with window_size=5), and the helper reloads an existing
# directory instead of rebuilding it. Pass the same window size explicitly so
# the parser configuration matches the index that is actually loaded.
index = build_sentence_window_index(
    [document],
    llm=llm,
    sentence_window_size=5,
    save_dir="./sentence_index"
)

Loading llama_index.core.storage.kvstore.simple_kvstore from ./sentence_index\docstore.json.
Loading llama_index.core.storage.kvstore.simple_kvstore from ./sentence_index\index_store.json.


In [35]:
# Wrap the loaded index in a query engine; rerank_top_n is left at the
# helper's default of 2.
query_engine = get_sentence_window_query_engine(index, similarity_top_k=6)


In [36]:
# Open-ended prompt: ask the engine for a brief explanation of the indexed content.
overview_answer = query_engine.query(". explain in brief")
print(overview_answer)

This is a resume of Kanhaiya Goyal, showcasing his education, technical skills, relevant coursework, and work experience as a full-stack developer intern at VegaPro.ai.


In [37]:
# Off-topic probe: the question is about nanomaterials, which the model itself
# reports as unrelated to the indexed content (see the output below).
nanomaterials_question = " What types of limitations are faced during the nanomaterials fabrication? Discuss each one of them in brief.   "
print(query_engine.query(nanomaterials_question))

The query seems to be unrelated to the provided context information. However, I'll attempt to provide a response based on the context.

Since the context information is related to a person's education, technical skills, and experience in the field of data science and web development, it's challenging to provide a direct answer to the query about nanomaterials fabrication limitations.

However, considering the person's educational background in Metallurgical and Materials Engineering, it's possible to provide a general response.

In the field of nanomaterials fabrication, some common limitations include:
1. **Scalability**: Fabricating nanomaterials at a large scale while maintaining their unique properties is a significant challenge.
2. **Uniformity**: Achieving uniformity in the size, shape, and properties of nanomaterials is crucial, but it can be difficult to control.
3. **Interfacial issues**: The interface between nanomaterials and other materials can lead to issues such as agglom

In [41]:
# Multi-part question: skills, experience, and suitability for a cloud role.
skills_fit_question = " what are the skills that kanhaiya has? and explain me about his experience and is he good fit for a cloud management roll  "
print(query_engine.query(skills_fit_question))

Based on the provided information, Kanhaiya has the following skills:

* Programming Languages: C/C++, JavaScript
* Frameworks: NodeJs, ExpressJs, ReactJs, NextJs, Tailwind CSS, shadCnc, Redux, streamlit
* Databases: PostgreSQL, MongoDB, MySql
* Developer Tools: Git, GitHub, Docker, VS Code, Figma, Adobe Illustrator, Canva
* Libraries: scikit-learn, tensorflow, light-bgm
* Relevant Coursework: Data Structures, PowerBi, PostgreSQL, Statistics, Machine Learning, Web Development, Exploratory Data Analysis

Regarding his experience, Kanhaiya has worked as a Full Stack Developer Intern at VegaPro.ai from May 2025 to present. During this period, he developed a full-fledged poultry management web application with production-level architecture, built the frontend using Next.js, and designed and implemented the backend in Flask. He also utilized PostgreSQL for scalable data storage, optimizing queries to enhance performance, and managed core admin functionalities.

As for whether he is a good f