In [None]:
import os
import numpy as np
from dotenv import load_dotenv

from FlagEmbedding import BGEM3FlagModel

from langchain.text_splitter import RecursiveCharacterTextSplitter

In [10]:
# Location of the cleaned corpus text file.
# The INQUIRY_CORPUS_PATH environment variable, when set, overrides the
# author's local default so the notebook can run on another machine without
# editing this cell. It is read from the process environment as it exists
# when this cell runs.
corpus_path = os.environ.get(
    "INQUIRY_CORPUS_PATH",
    r"C:\Users\piyus\OneDrive\Documents\Github Repo\Inquiry-Assistant\Context\clean_text.txt",
)

# Read the whole corpus into memory as one UTF-8 string.
with open(corpus_path, "r", encoding="utf-8") as f:
    full_text = f.read()

In [11]:
# Load API keys from the local .env file into the process environment.
env_path = r"C:\Users\piyus\OneDrive\Documents\Github Repo\.env"
load_dotenv(dotenv_path=env_path)

# Keys this notebook expects to be present after loading the .env file.
required_keys = [
    "GOOGLE_API_KEY",
    "PINECONE_API_KEY",
    "HUGGINGFACE_HUB_TOKEN",
    "LANGCHAIN_API_KEY",
    "GROQ_API_KEY",
]

# Fail early with a message that names every missing key. Re-assigning
# os.environ[key] = os.getenv(key) is a no-op when the key exists and raises
# an opaque "str expected, not NoneType" TypeError when it does not.
missing_keys = [key for key in required_keys if os.getenv(key) is None]
if missing_keys:
    raise RuntimeError(
        "Missing environment variable(s): "
        + ", ".join(missing_keys)
        + f". Define them in {env_path} or in the process environment."
    )

# LangChain tracing configuration.
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_PROJECT'] = 'inquiry-assistant'

# Kept as a plain variable because the LLM cell passes it explicitly.
groq_api_key = os.environ['GROQ_API_KEY']

In [12]:
# Split the corpus into ~500-character chunks with a 50-character overlap.
#
# Separators are tried in order and the first one found in the text is used.
# The "--- --- " marker must therefore come before " ": any text containing
# the marker also contains a space, so when it was listed last it could never
# be selected.
# NOTE(review): assumes "--- --- " is the section delimiter in the cleaned
# corpus and should be the coarsest split point - confirm against the file.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    separators=["--- --- ", "\n\n", "\n", ".", " "],
)

# One Document per chunk; the chunk text is read later via `page_content`.
documents = splitter.create_documents([full_text])

In [None]:
class BGEEmbedding:
    """Wrapper around a FlagEmbedding model that returns unit-length dense vectors.

    Exposes ``embed_documents`` and ``embed_query``. Both return plain Python
    lists so the vectors can be sent directly to a vector store.

    NOTE(review): the checkpoint is 'BAAI/bge-base-en' but it is loaded through
    ``BGEM3FlagModel``, and the "passage: " / "query: " prefixes look like the
    convention of a different model family. Both are left unchanged here because
    vectors already stored in the index were produced this way - confirm against
    the model card before changing either, and re-index if you do.
    """

    def __init__(self, model_name='BAAI/bge-base-en', use_fp16=False):
        """Load the embedding model.

        Args:
            model_name: Checkpoint name passed to ``BGEM3FlagModel``.
            use_fp16: Forwarded to ``BGEM3FlagModel``.
        """
        self.model = BGEM3FlagModel(model_name, use_fp16=use_fp16)

    @staticmethod
    def _unit_normalize(vectors):
        """L2-normalise along the last axis, leaving all-zero vectors as zeros."""
        vectors = np.asarray(vectors)
        norms = np.linalg.norm(vectors, axis=-1, keepdims=True)
        # Dividing by a zero norm would produce NaNs; divide by 1 instead so a
        # zero vector stays a zero vector.
        safe_norms = np.where(norms == 0, 1, norms)
        return vectors / safe_norms

    def embed_documents(self, texts):
        """Embed a batch of passages.

        Args:
            texts: Iterable of strings to embed.

        Returns:
            A list with one unit-length vector (list of floats) per input text,
            or an empty list when ``texts`` is empty.
        """
        texts = list(texts)
        if not texts:
            # Nothing to encode; skip the model call entirely.
            return []
        prefixed = ["passage: " + t for t in texts]
        output = self.model.encode(prefixed)
        dense_vecs = self._unit_normalize(output["dense_vecs"])
        return dense_vecs.tolist()

    def embed_query(self, text):
        """Embed a single query string.

        Args:
            text: The query text.

        Returns:
            One unit-length vector as a list of floats.
        """
        output = self.model.encode(["query: " + text])
        dense_vec = self._unit_normalize(output["dense_vecs"][0])
        return dense_vec.tolist()
       

# Single shared embedder used by both the indexing cell and the retriever.
# Constructing it loads the model (see the file-fetch progress output below).
embedding_function = BGEEmbedding()


Fetching 19 files: 100%|██████████| 19/19 [00:00<?, ?it/s]


In [15]:
from pinecone import Pinecone

# SECURITY: never hard-code the API key in the notebook. Read it from the
# environment (populated from the .env file by the setup cell). A literal key
# was previously committed in this cell; treat that key as compromised and
# rotate it in the Pinecone console.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

# Handle to the existing index that stores the chunk embeddings.
index = pc.Index('rag-application')

In [16]:
# Embed every chunk and build one Pinecone record per chunk.
texts = [doc.page_content for doc in documents]
vectors = embedding_function.embed_documents(texts)

# Record ids are positional ("doc-0", "doc-1", ...), so re-running this cell
# overwrites the same ids rather than creating new ones.
upsert_data = [
    {
        "id": f"doc-{i}",
        "values": vector,
        "metadata": {"text": text}  # or use doc.metadata if needed
    }
    for i, (text, vector) in enumerate(zip(texts, vectors))
]

# Send the records in batches instead of one request, so the cell keeps
# working as the corpus grows past the per-request size limit.
index.upsert(vectors=upsert_data, batch_size=100)


pre tokenize: 100%|██████████| 3/3 [00:00<00:00, 15.23it/s]
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 3/3 [01:29<00:00, 29.96s/it]


{'upserted_count': 610}

In [24]:
from langchain_core.runnables import RunnableLambda, RunnableMap
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_core.documents import Document

# Step 3: Define a simple retriever using Pinecone
def pinecone_retriever(question, k=5):
    """Return the ``k`` best-matching chunks for ``question`` as Documents.

    The question is embedded with ``embedding_function``, the Pinecone index is
    queried for the top ``k`` matches including metadata, and each match's
    stored ``text`` metadata becomes the ``page_content`` of a Document.
    """
    response = index.query(
        vector=embedding_function.embed_query(question),
        top_k=k,
        include_metadata=True,
    )

    retrieved = []
    for hit in response['matches']:
        chunk_text = hit['metadata']['text']
        retrieved.append(Document(page_content=chunk_text))
    return retrieved

# Adapts the retriever to the chain's input: takes a dict with a "question"
# key and returns the retrieved Documents (using the retriever's default k).
retriever_runnable = RunnableLambda(lambda x: pinecone_retriever(x["question"]))


In [25]:
# Step 4: Prompt Template
# System message: restricts the assistant to the supplied context and fixes
# the exact fallback sentence to use when the context has no answer.
system_instructions = (
    "You are an intelligent assistant for Scaler Academy, trained on internal documents, placement records, program curricula, and student feedback. "
    "You should answer questions only based on the context provided. "
    "If the answer is not found in the context, reply with: "
    "“I'm sorry, I couldn't find that information in the available documents.” "
    "Be precise, concise, and maintain a professional and helpful tone."
)

# Human message: filled with the retrieved context and the user's question.
human_template = "Context:\n{context}\n\nQuestion:\n{question}"

chat_prompt = ChatPromptTemplate.from_messages([
    ("system", system_instructions),
    ("human", human_template),
])

In [26]:
# Step 5: Format context
def format_inputs(inputs):
    """Turn retrieved documents plus the question into prompt variables.

    Args:
        inputs: Mapping with a "documents" entry (objects exposing
            ``page_content``) and a "question" entry.

    Returns:
        A dict with "context" (the page contents joined by blank lines) and
        "question" (passed through unchanged).
    """
    page_texts = []
    for doc in inputs["documents"]:
        page_texts.append(doc.page_content)
    context = "\n\n".join(page_texts)

    return {"context": context, "question": inputs["question"]}


In [27]:
# Step 6: LLM and Output Parser
# Chat model served through Groq; the API key is the `groq_api_key` value read
# from the environment in the setup cell.
# NOTE(review): confirm the "gemma2-9b-it" model id is still offered by Groq.
llm = ChatGroq(groq_api_key=groq_api_key, model="gemma2-9b-it")
# Final chain stage; its result is printed as the answer text by the usage cell.
output_parser = StrOutputParser()


In [28]:
# Step 7: Build the RAG Chain
# Pipeline, invoked with {"question": <str>}:
#   1. RunnableMap produces "documents" (via the Pinecone retriever) and
#      passes "question" through from the same input dict.
#   2. format_inputs joins the documents into a single "context" string.
#   3. chat_prompt fills the system/human messages with context and question.
#   4. llm generates the reply and output_parser produces the final value.
rag_chain = (
    RunnableMap({
        "documents": retriever_runnable,
        "question": lambda x: x["question"]
    })
    | format_inputs 
    | chat_prompt 
    | llm 
    | output_parser
)

In [34]:
# Step 8: Use the RAG Chain
# Ask for a question interactively. Surrounding whitespace is dropped so a
# blank entry is detected instead of being embedded and sent to the LLM.
query = input("Enter your question: ").strip()

if query:
    print("\nQuestion:", query)

    response = rag_chain.invoke({"question": query})
    print("\nAnswer:", response)
else:
    # Nothing to ask: skip the retrieval and the LLM call.
    response = None
    print("\nNo question entered; skipping retrieval.")



Question: why should i choose scaler

Answer: Scaler is right for you because:

* **It's designed for learning:** Scaler provides a structured learning environment with intuitive classes and thorough explanations. 
* **It has a stellar track record:** Scaler has helped 15k+ learners transform their careers since 2019.
* **It offers placement assistance:** Scaler provides support to help learners secure jobs in the industry.
* **You'll be part of a community:** You'll learn alongside others who are on the same path.
* **You'll have access to top 1% industry mentors:** Scaler connects learners with experienced mentors who can guide their journey. 



