In [1]:
# Install dependencies
!pip install -q openai langchain "unstructured[all-docs]" chromadb tiktoken

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m525.5/525.5 kB[0m [31m34.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m50.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.1/289.1 kB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━

In [2]:
from google.colab import userdata
import os
from openai import OpenAI

# Read the LLMPROXY_JWT secret once from Colab's user data and stop early,
# with an explanatory message, if it is missing or empty.
llmproxy_jwt = userdata.get("LLMPROXY_JWT")
assert llmproxy_jwt, "LLMPROXY_JWT is not set: add it to this notebook's Colab secrets and re-run."

# The API key is the token followed by ':langchain-workshop'; the base URL
# routes OpenAI-compatible requests to the llmfoundry endpoint.
os.environ['OPENAI_API_KEY'] = f"{llmproxy_jwt}:langchain-workshop"
os.environ['OPENAI_API_BASE'] = 'https://llmfoundry.straive.com/v1/'

In [10]:
!pip install pypdf



In [11]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the budget speech PDF. The path lives under /content/drive,
# presumably a mounted Google Drive -- the mount step is not shown in this notebook.
pdf_path = "/content/drive/MyDrive/GenAI/RAG_Langchain/budget_speech.pdf"

# load_and_split() reads the PDF and returns a list of Document objects
# (page text plus 'source' / 'page' metadata).
loader = PyPDFLoader(pdf_path)
pages = loader.load_and_split()


In [36]:
# Peek at the second loaded document (index 1) to check the extracted text and metadata.
pages[1]

Document(page_content='CONTENTS  \n \nPART – A \n Page No.  \nIntroduction  1 \nInclusive Development and Growth  2 \nSocial Justice   3  \nExemplary  Track Record of Governance,  \nDevelopment and Performance (GDP)  7 \nEconomic Management  8 \nGlobal Context  9 \nVision for ‘Viksit Bharat’  10 \nStrategy for  ‘Amrit Kaal’  11 \nInfrastructure Development  17 \nAmrit Kaal as Kartavya Kaal  22 \nRevised Estimates 2023 -24 23 \nBudget Estimates 2024 -25 23 \nPART – B \nDirect taxes  25 \nIndirect Taxes   26 \nEconomy – Then and Now  28', metadata={'source': '/content/drive/MyDrive/GenAI/RAG_Langchain/budget_speech.pdf', 'page': 2})

In [13]:
# Number of documents returned by the loader.
len(pages)

30

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Embedding works best on small pieces of text, so cut the loaded pages into
# chunks: chunk_size=500 with chunk_overlap=50 so neighbouring chunks share
# some context (sizes are presumably counted in characters -- the splitter's default).
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
docs = splitter.split_documents(pages)

In [21]:
# Number of chunks produced by the splitter.
len(docs)

87

In [22]:
# Set up the embedding model and the ChromaDB vector store used for retrieval.
# Nothing is embedded in this cell: documents are embedded and added to the
# store when index(...) runs in the indexing cell.
from langchain_openai import OpenAIEmbeddings
# `langchain.vectorstores` is a deprecated re-export; import Chroma from
# langchain_community, the same package the PDF loader comes from.
from langchain_community.vectorstores import Chroma

embedding = OpenAIEmbeddings()
store = Chroma(
    embedding_function=embedding,
    # Persist the collection on disk so it can be reopened later.
    persist_directory="/content/drive/MyDrive/GenAI/RAG_Langchain/chroma_db_oai",
)

# Here are other text embedding models
# https://python.langchain.com/docs/integrations/text_embedding

# Here are other vector stores
# https://python.langchain.com/docs/modules/data_connection/vectorstores/
# https://python.langchain.com/docs/integrations/vectorstores

In [26]:
# We'll use the SQLRecordManager to re-compute embeddings only for changed docs
from langchain.indexes import SQLRecordManager, index

source = 'budget'
namespace = f'chromadb-{source}'

# Keep the record database in the same folder as the Chroma store
# (persist_directory lives under /content/drive/MyDrive/GenAI/RAG_Langchain).
# A relative 'sqlite:///records.sql' is created in the current working
# directory, so the record manager and the persisted vector store can drift
# apart: with an empty record database every chunk is embedded again and
# cleanup="full" has no record of older entries to delete.
# Four slashes = 'sqlite:///' followed by an absolute path.
record_db_url = 'sqlite:////content/drive/MyDrive/GenAI/RAG_Langchain/records.sql'
record_manager = SQLRecordManager(namespace, db_url=record_db_url)
record_manager.create_schema()

# Now let's run this. The returned dict reports how many chunks were
# added / updated / skipped / deleted.
index(docs, record_manager, store, cleanup="full", source_id_key='source')

# Remember: You only need to run this block of code if documents may've changed.

{'num_added': 87, 'num_updated': 0, 'num_skipped': 0, 'num_deleted': 0}

In [27]:
# Wrap the vector store in a retriever that returns only the single closest
# chunk (k=1) for a query, then try it out on a sample question.
retriever = store.as_retriever(search_kwargs=dict(k=1))
retriever.get_relevant_documents("What is Vision for ‘Viksit Bharat’?")

# Other retrievers (alternative ways of getting docs from a query):
# https://python.langchain.com/docs/modules/data_connection/retrievers/
# https://python.langchain.com/docs/integrations/retrievers

[Document(page_content='Vision  for ‘Viksit Bharat’  \n32. Our vision for ‘Viksit Bharat’ is that of “Prosperous Bharat \nin harmony with nature, with modern infrastructure, and \nproviding opportunities for all citizens and all regions to reach \ntheir potential”.  \n33. With confidence arising from stron g and exemplary track -\nrecord of performance and progress earning ‘Sabka Vishwas’ , the \nnext five years will be years of unprecedented development, and \ngolden moments to realize the dream of developed India  @', metadata={'page': 13, 'source': '/content/drive/MyDrive/GenAI/RAG_Langchain/budget_speech.pdf'})]

In [30]:
# A phrase-style query: returns the one chunk (k=1) closest to this text.
retriever.get_relevant_documents("living in rented houses")

[Document(page_content='13 \n c. Entrepreneurship opportunities for a large number of \nvendors for supply and installation;  \nd. Employment opportunities for the youth with technical \nskills i n manufacturing, installation and maintenance;  \nHousing for middle class  \n44. Our Government will launch a scheme to help deserving \nsections of the middle class “living in rented houses, or slums, or \nchawls and unauthorized colonies” to buy or build their own \nhouses.   \nMedical Colleges', metadata={'page': 16, 'source': '/content/drive/MyDrive/GenAI/RAG_Langchain/budget_speech.pdf'})]

In [31]:
# We'll pass the retrieved chunks and the question to ChatGPT.
# ChatOpenAI is imported from langchain_openai (already used for
# OpenAIEmbeddings); the `langchain.chat_models` path is deprecated and
# emits a deprecation warning.
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# NOTE(review): temperature=1 lets the wording of answers vary between runs;
# consider temperature=0 if reproducible, strictly factual answers are wanted.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=1)

# return_source_documents=True makes the chain return the retrieved chunks
# alongside the answer, so each answer can be checked against its source.
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever, return_source_documents=True)

# Here are more chat models
# https://python.langchain.com/docs/integrations/chat/
# Chains are a huge topic that I haven't explored yet

  warn_deprecated(


In [33]:
# Ask a question through the chain. `invoke` replaces the deprecated direct
# call `qa_chain(...)`; the result is the same dict with 'query', 'result'
# and 'source_documents'.
qa_chain.invoke({"query": "how many crores allocated for Infrastructure Development?"})

{'query': 'how many crores allocated for Infrastructure Development?',
 'result': '11,11,111 crore rupees have been allocated for Infrastructure Development, which is 3.4 per cent of the GDP.',
 'source_documents': [Document(page_content='18 \n eleven thousand, one hundred and eleven crore rupees  \n(` 11,11,111 crore). This would be 3.4 per cent of the GDP.  \nRailways  \n62. Three major economic railway corridor programmes will \nbe implemented. These are : \n(1) energy, mineral and cement corridors,  \n(2) port connectivity corridors, and  \n(3) high traffic density corridors.  \nThe projects have been identified under the PM Gati Shakti for \nenabling multi -modal connectivity. They will improve logistics \nefficiency and reduce cost.', metadata={'page': 21, 'source': '/content/drive/MyDrive/GenAI/RAG_Langchain/budget_speech.pdf'})]}

In [35]:
# Same chain, different question, using `invoke` instead of the deprecated
# direct call. The query text is kept verbatim so it matches the recorded output.
# NOTE(review): the recorded answer is weak because the retriever passes a
# single chunk (k=1) and that chunk is cut off mid-paragraph; raising k in
# the retriever cell would give the model more context.
qa_chain.invoke({"query": "what are vission in Aviation Sector?"})

{'query': 'what are vission in Aviation Sector?',
 'result': "I don't have specific information on the vision in the Aviation Sector regarding the comfort of passengers.",
 'source_documents': [Document(page_content='comfort  of passengers.   \nAviation Sector  \n65. The aviation sector has been galvanized in the past  \nten years. Number of airports have  doubled to 149. Roll out of \nair connectivity to tier -two and tier -three cities under UDAN', metadata={'page': 21, 'source': '/content/drive/MyDrive/GenAI/RAG_Langchain/budget_speech.pdf'})]}