In [2]:
import os

from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader

USER_AGENT environment variable not set, consider setting it to identify your requests.


# URL

In [17]:
# Fetch the scheme page and cut it into 500-character, non-overlapping chunks.
# NOTE(review): the output recorded for `data_url` in the next cell contains only
# the site's footer / "Something went wrong" banner and no scheme description.
# Presumably the page body is rendered client-side and is not present in the
# HTML this loader receives - confirm, and switch to a loader that can see the
# rendered page if so.
text_splitter_url = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
loader = WebBaseLoader("https://www.myscheme.gov.in/schemes/pmfdr")
data_url = loader.load()
all_splits_url = text_splitter_url.split_documents(data_url)

In [18]:
# Display the loaded documents to check what text was actually retrieved from the URL.
data_url

[Document(metadata={'source': 'https://www.myscheme.gov.in/schemes/pmfdr', 'language': 'en'}, page_content='Something went wrong. Please try again later.OkAre you sure you want to sign out?CancelSign Out||EnglishEnglishThemeSign In©2025Powered byDigital India Corporation(DIC)Ministry of Electronics & IT (MeitY)Government of India®Quick LinksAbout UsContact UsScreen ReaderAccessibility StatementFrequently Asked QuestionsDisclaimerTerms & ConditionsUseful LinksGet in touch4th Floor, NeGD, Electronics Niketan, 6 CGO Complex, Lodhi Road, New Delhi - 110003, Indiasupport-myscheme[at]digitalindia[dot]gov[dot]in(011) 24303714 (9:00 AM to 5:30 PM)Last Updated On : 22/10/2025 | v-2.2.27')]

# PDF

In [4]:
pdfreader = PdfReader('Kia_EV6.pdf')
# NOTE(review): `Concatenate` is not referenced in any visible cell; kept only in
# case a cell outside this view relies on it.
from typing_extensions import Concatenate

# Extract the text of every page, skipping pages that yield no text.
# Pages are joined with a newline so the last word of one page does not fuse
# with the first word of the next (the splitter also treats "\n" as a
# preferred break point).
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = "\n".join(page_text for page_text in page_texts if page_text)

# Split the PDF text into 1000-character chunks with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
texts = text_splitter.split_text(raw_text)

In [19]:
# all-MiniLM-L6-v2 is a plain sentence-transformers model, not a BGE model.
# HuggingFaceBgeEmbeddings prepends a BGE-specific retrieval instruction to
# every query, which this model was not trained with, so use the generic
# HuggingFaceEmbeddings wrapper instead.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)
# Build the FAISS index from the URL chunks only.
vectorstore = FAISS.from_documents(all_splits_url, embeddings)

In [20]:
# Read the Gemini API key from the environment rather than embedding it in the
# notebook. A key was previously committed in this cell; treat it as leaked and
# revoke it. Indexing os.environ directly makes a missing key fail here with a
# clear KeyError instead of later, on the first request.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

In [11]:
# Standalone check that the chat model responds; it does not use the vector store.
system_turn = (
    "system",
    "You are a helpful assistant that give information about indian cricket players.",
)
human_turn = ("human", "Who is virat kohli?")
messages = [system_turn, human_turn]

ai_msg = llm.invoke(messages)
ai_msg.content

'Virat Kohli is an **Indian international cricketer** widely regarded as one of the greatest batsmen of all time.\n\nHere\'s a breakdown of who he is:\n\n1.  **Role in Cricket:** He is a **right-handed top-order batsman**. He formerly captained the Indian national team in all three formats of the game (Test, One Day International, and Twenty20 International).\n\n2.  **Early Life & Debut:**\n    *   Born in Delhi, India, on November 5, 1988.\n    *   He captained India to victory at the 2008 ICC Under-19 Cricket World Cup.\n    *   He made his ODI debut for India in August 2008, followed by his Test debut in 2011 and T20I debut in 2010.\n\n3.  **Playing Style & Strengths:**\n    *   Known for his aggressive yet technically sound batting.\n    *   He is a prolific run-scorer across all formats, particularly renowned for his ability to chase down targets in ODIs.\n    *   He possesses immense mental fortitude, exceptional fitness, and a highly competitive spirit.\n\n4.  **Records & Achiev

In [21]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [22]:
def format_docs(docs):
    """Merge retrieved documents into a single context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values, in order, separated by a blank line.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

In [23]:
# Create a LANGSMITH_API_KEY in Settings > API Keys and export it in the
# environment before starting the kernel; do not paste it into the notebook.
from langsmith import Client

# No api_key argument: the client picks the key up from the environment.
# Passing an empty string, as before, explicitly supplied a blank key.
client = Client()
# Pull only the prompt template. The chain appends `llm` itself, so the prompt
# must not arrive with a model already attached (which `include_model=True`
# can do when the hub entry stores one).
prompt = client.pull_prompt("rlm/rag-prompt")

In [24]:
# RAG pipeline: the incoming question is sent to the retriever, whose documents
# are flattened by format_docs into "context"; the same question is forwarded
# unchanged as "question". Both feed the prompt, then the LLM, and the reply is
# reduced to a plain string.
retriever = vectorstore.as_retriever()
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}
qa_chain = chain_inputs | prompt | llm | StrOutputParser()

In [25]:
# Run the chain on one question; the string is used both as the retrieval query
# and as the "question" value passed to the prompt.
qa_chain.invoke("What is this scheme all about?")

'Based on the provided context, it is not possible to determine what the scheme is all about. The text includes contact information, website navigation links, copyright details, and error messages, but it does not describe the purpose or nature of any specific scheme.'