In [4]:
from langchain_community.document_loaders import (
    BSHTMLLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings


In [11]:
# import os
# from getpass import getpass

# HF_TOKEN = getpass("HF Token:")
# os.environ["HUGGINGFACEHUB_API_TOKEN"] = HF_TOKEN
from dotenv import load_dotenv

load_dotenv()

import os

huggingface = os.getenv("huggingface")
qudrant_key = os.getenv("qudrant_key")
qudrant_url = os.getenv("qudrant_url")
OpenAI = os.getenv("OpenAI")
print(huggingface, qudrant_key, qudrant_url, OpenAI)

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 axZG5S1RWjcm2fOMGfZ4fK-vBnCduDnAFdFTf0omx9jiNFTkBsfd8A https://d72d961b-81eb-48a6-ad16-e0beef6a4757.us-east4-0.gcp.cloud.qdrant.io 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


In [5]:
import os

folder_path = "./toc_notifications_2023_1991/rbi_notification_2023_1991"
files = os.listdir(folder_path)

loaders = [
    BSHTMLLoader(os.path.join(folder_path, file))
    for file in files
    if file.endswith(".html")
]
# content = [loader.load() for loader in loaders]
content = [doc for loader in loaders for doc in loader.load()]

In [6]:
# import pickle as pk

# Your existing code
text_split = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = text_split.split_documents(content)

# Save chunks to a file
# with open("chunks.pkl", "wb") as f:
#     pk.dump(chunks, f)

In [None]:
import pickle

with open('chunks.pkl', 'rb') as f:
    chunks = pickle.load(f)

# chunks[0].page_content #226480
len(chunks)

In [None]:
import tiktoken
import pickle

def num_tokens_from_chunks(chunks: list, encoding_name: str) -> list:
    """Returns the number of tokens in each chunk."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens_list = []
    for chunk in chunks:
        try:
            num_tokens_list.append(len(encoding.encode(chunk.page_content)))
        except Exception as e:
            print(f"Skipping chunk due to error: {e}")
    return num_tokens_list

# Load chunks
with open('chunks.pkl', 'rb') as f:
    chunks = pickle.load(f)

# Calculate number of tokens for each chunk
num_tokens_list = num_tokens_from_chunks(chunks, "cl100k_base")

total_tokens = sum(num_tokens_list)  #40622659
# Print the list of token counts
print(total_tokens)

In [14]:
embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=huggingface,
    model_name="BAAI/bge-base-en-v1.5",
)

vector_store = Qdrant.from_documents(
    chunks,
    embeddings,
    url=qudrant_url,
    api_key=qudrant_key,
    collection_name="rbi_notification",
)

In [16]:
search = vector_store.similarity_search("how much was the amount for Auction held on August 18, 2004")
search

[Document(page_content='of Rs.250 crore: Auction to be held on August 18, 2004 - Annexure I(a) 7 kbTender for “6.18 percent Government Stock, 2005” for an aggregate amount of Rs.5,000 crore under MSS : Auction to be held  on August 18,2004 - Annexure I 9 kbAuction for Sale (re-issue ) of ‘6.18 per cent Government Stock, 2005’ under Market Stabilisation Scheme 6 kbAuction of Government of India Dated Securities under Market Stabilisation Scheme (MSS) 6 kbSubmission of progress reports and change in the periodicity of returns under Swarnjayanti Gram Swarozgar Yojana (SGSY) 581 kbAug 16, 2004Before the Supreme Court of India - Interlocutory Application Nos. 9, 20 & 28 –', metadata={'source': './toc_notifications_2023_1991/rbi_notification_2023_1991\\2004.html', 'title': 'Reserve Bank of India - Notifications', '_id': 'd69cb498-ee22-4ed2-9d28-efeec64a28d2', '_collection_name': 'rbi_notification'}),
 Document(page_content='amount of Rs.3,000 crore : Auction to be held  on May 6, 2004 - Anne

In [17]:
retriever = vector_store.as_retriever(
    search_type="mmr",
    search_kwargs = {"k":2}
)

In [32]:
retriever.get_relevant_documents("what is the amount for Auction held on August 18, 2004")

[Document(page_content='amount of Rs.3,000 crore : Auction to be held  on May 6, 2004 - Annexure II 8 kbScheme for Non-competitive Bidding Facility in the Auctions of Government Securities - Annexure I 17 kbStandardised application form for investments in Savings Bonds along with the Duties and Rights of Customers 9 kbGuidelines on compliance with Accounting Standards (AS) by banks 16 kbApr 29, 2004Implementation of On-line Tax Accounting System with effect from June 1, 2004 8 kbApr 28, 2004Transactions in Government Securities 13 kbApr 27, 2004Half- yearly Reporting system in respect of SSI sick and non-SSI sick/weak industrial units 8 kbApr 24, 2004Rupee Export Credit Interest Rates 113 kbForeign Exchange Management (Deposit) Regulations, 2000 - Acceptance of Deposits - Revised Guidelines - A.P. (DIR Series) Circular No.89 33 kbApr 23, 2004Interest Rate Ceiling on Rupee Export Credit 6 kbDeclaration of dividends by banks 6 kbScheme for Non-competitive Bidding Facility in the Auctions

In [33]:
from langchain_community.llms import HuggingFaceHub

llm = HuggingFaceHub(
    huggingfacehub_api_token=huggingface,
    repo_id="huggingfaceh4/zephyr-7b-alpha",
    model_kwargs={"temperature": 0.1, "max_new_tokens": 512, "return_full_text": False},
)

In [38]:
query = "what is the amount for Auction held on August 18, 2004"

In [39]:
prompt = f"""
<|system|>
You are an AI assitant that follow instructions extremely well. Please be truthful and give direct answers
</s>
<|user|>
{query}
</s>
<|assistant|>
"""

In [40]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff",retriever=retriever)

In [41]:
response = qa.invoke(prompt)
response

{'query': '\n<|system|>\nYou are an AI assitant that follow instructions extremely well. Please be truthful and give direct answers\n</s>\n<|user|>\nwhat is the amount for Auction held on August 18, 2004\n</s>\n<|assistant|>\n',
 'result': '\n\nThe amount for Auction held on August 18, 2004 is not specified in the given context. The provided information only mentions that there were two auctions held on that date, one for an aggregate amount of Rs.250 crore and another for an aggregate amount of Rs.5,000 crore. The specific amount for each auction would depend on the final bids and the number of securities sold.'}

In [24]:
from qdrant_client import QdrantClient
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

qdrant_client = QdrantClient(
    url="...",
    api_key="...",
)

embeddings = HuggingFaceInferenceAPIEmbeddings(
    api_key=huggingface,
    model_name="BAAI/bge-base-en-v1.5",
)
# query = "what is the amount for Auction held on August 18, 2004"
query = "what were the notification by rbi in year 2004?"

query_vector = embeddings.embed_query(query)
# Perform similarity search in Qdrant
search_result = qdrant_client.search(
    collection_name="rbi_notification",
    query_vector=query_vector,
    limit=1,  # Adjust the limit as needed
)

# Extract the relevant document from the search result
document = search_result[0].payload["page_content"]
source = search_result[0].payload["metadata"]

# Create an OpenAI instance
llm = OpenAI(openai_api_key="...")

# Define a prompt template
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer:

Context: {context}

Question: {question}

Only return the helpful answer. Answer must be detailed and well explained.
Helpful answer:
"""

# Create a prompt using the template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

# Generate the answer using the LLM
answer = llm(prompt.format(context=document, question=query))

print("Query:", query)
print("Relevant Text:", document)
print("Source:", source)
print("Generated Answer:", answer)

Query: what were the notification by rbi in year 2004?
Relevant Text: to the RBI Act, 1934 43 kbJul 31, 2009Auction for Sale (Re-issue ) of ‘7.40 per cent Government Stock, 2035' 18 kbAuction for Sale (Re-issue ) of ‘6.90 per cent Government Stock, 2019’ 16 kbAuction for Sale (Re-issue ) of ‘6.49 per cent Government Stock, 2015’ 16 kbAuction of Government of India Dated Securities 22 kbRupee Export Credit Interest Rates 632 kbJul 29, 2009NEFT System – Business Continuity Plan 11 kbJul 28, 2009Foreign Exchange Management (Transfer or Issue of Any Foreign Security) (Fourth Amendment) Regulations, 2009 20 kbFirst Quarter Review of Statement on Monetary Policy for the Year 2009-10 800 kbJul 24, 2009Auction of Government of India Dated Securities 22 kbAuction for Sale (Re-issue ) of ‘6.07 per cent Government Stock, 2014’ 19 kbAuction for Sale (Re-issue ) of ‘7.94 per cent Government Stock, 2021’ 18 kbAuction for Sale (Re-issue ) of ‘8.24 per cent Government Stock, 2027' 16 kbJul 22, 2009Iss