In [4]:
!pip install langchain
!pip install pypdf
!pip install sentence-transformers
!pip install accelerate
!pip install chromadb
!pip install "unstructured[all-docs]"



Collecting langchain
  Using cached langchain-0.1.0-py3-none-any.whl (797 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Using cached dataclasses_json-0.6.3-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.9 (from langchain)
  Using cached langchain_community-0.0.9-py3-none-any.whl (1.5 MB)
Collecting langchain-core<0.2,>=0.1.7 (from langchain)
  Using cached langchain_core-0.1.7-py3-none-any.whl (214 kB)
Collecting langsmith<0.1.0,>=0.0.77 (from langchain)
  Using cached langsmith-0.0.77-py3-none-any.whl (48 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Using cached marshmallow-3.20.1-py3-none-any.whl (49 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)
  Using cached typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting jsonpointer>=1.9 (from jsonpatch<2.0,>=1

In [1]:
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline
import torch
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain_community.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from google.colab import files
import os

uploaded_file = files.upload()

# Get the file name
file_name = list(uploaded_file.keys())[0]

# Create a temporary directory
temp_dir = "temp"
os.makedirs(temp_dir, exist_ok=True)

# Save the uploaded file to the temporary directory
temp_path = os.path.join(temp_dir, file_name)
# temp_path = "/content/temp/return.pdf"
with open(temp_path, "wb") as temp_file:
    temp_file.write(uploaded_file[file_name])

# Process the uploaded file
loader = UnstructuredFileLoader(temp_path)
documents = loader.load()
for document in documents:
    print(document.page_content)
# We cant load the whole pdf into the program so we split the pdf into chunks
# We use RecursiveCharacterTextSplitter to split the pdf into chunks
# Each chunk is 500 characters long and the chunks overlap by 200 characters (You can change this according to your needs)
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=200)
texts = text_splitter.split_documents(documents)

# We use SentenceTransformerEmbeddings to embed the text chunks
# Embeddings are used to find the similarity between the query and the text chunks
# We use multi-qa-mpnet-base-dot-v1 model to embed the text chunks
# We need to save the embeddings to disk so we use persist_directory to save the embeddings to disk
embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")
persist_directory = "/content/chroma/"

# Chroma is used to store the embeddings
# We use from_documents to store the embeddings
# We use the persist_directory to save the embeddings to disk
db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)

# To save and load the saved vector db (if needed in the future)
# Persist the database to disk
# db.persist()
# db = Chroma(persist_directory="db", embedding_function=embeddings)

checkpoint = "MBZUAI/LaMini-Flan-T5-783M"

# Initialize the tokenizer and base model for text generation
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    device_map="auto",
    torch_dtype=torch.float32
)

pipe = pipeline(
    'text2text-generation',
    model = base_model,
    tokenizer = tokenizer,
    max_length = 512,
    do_sample = True,
    temperature = 0.3,
    top_p= 0.95
)

# Initialize a local language model pipeline
local_llm = HuggingFacePipeline(pipeline=pipe)
# Create a RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type='stuff',
    retriever=db.as_retriever(search_type="similarity", search_kwargs={"k": 2}),
    return_source_documents=True,
)

while True:
  input_query = str(input("Enter your query:"))
  llm_response = qa_chain({"query": input_query})
  print(llm_response['result'])


Saving return.pdf to return.pdf


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Electronic Reservation Slip (ERS)-Normal User

Booked From

To

VIJAYAWADA JN - BZA (VIJAYAWADA) Start Date* 03-Jan-2024

VIJAYAWADA JN (BZA)

Departure* 12:45 03-Jan-2024

LOKMANYATILAK T - LTT (MUMBAI) Arrival* 11:00 04-Jan-2024

PNR 4605324563 Quota Person with disability (HP) Passenger Details # Name 1. G K CHAITANYA Person With Disability Details # Concession Code

Age 19

Train No./Name 17221 / COA LTT EXPRESS Distance 1086 KM

Gender M

Booking Status CNF/B1/1/LOWER

Class THIRD AC (3A) Booking Date 25-Nov-2023 22:11:34 HRS

Current Status CNF /B1/1/LOWER

Registration No.

ID Card Type

ID Card No.

1. HNDCAP

SCBZA20164

Aadhaar ID

262664223324

Acronyms:

RLWL: REMOTE LOCATION WAITLIST

PQWL: POOLED QUOTA WAITLIST

RSWL: ROAD-SIDE WAITLIST

Transaction ID: 100004613511453

IR recovers only 57% of cost of travel on an average. Payment Details

Ticket Fare

₹ 375.00

IRCTC Convenience Fee (Incl. of GST)

₹ 35.40

Travel Insurance Premium (Incl. of GST)

₹ 0.35

Total Fare (all

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


.gitattributes:   0%|          | 0.00/737 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/8.66k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.9k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/860 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Enter your query:Name


  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


The name of the person with disability is G K CHAITANYA.
Enter your query:Booked From  To


  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


The train is booked from to VIJAYAWADA JN - BZA (VIJAYAWADA).
Enter your query:Total Tax:


  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(


The total tax for the train is 17.80.


KeyboardInterrupt: Interrupted by user