In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline, AutoModelForCausalLM, AutoModelForQuestionAnswering, AutoModel
from langchain import HuggingFacePipeline, HuggingFaceHub
from langchain.document_loaders import UnstructuredURLLoader, UnstructuredPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
import pickle

from glob import glob

from dotenv import load_dotenv

In [2]:
# Load settings from a .env file into the environment before any model is created.
# NOTE(review): no cell below reads an environment variable directly; presumably
# this is only needed for the commented-out HuggingFaceHub option — confirm.
load_dotenv()

True

In [3]:
### RetrievalQA With Local HuggingFace Model

In [4]:
# Checkpoint shared by the model and its tokenizer so the two always match.
FLAN_T5_CHECKPOINT = "google/flan-t5-xl"

# Seq2seq model; min_length=10 is forwarded to from_pretrained.
# max_length is deliberately not overridden (max_length=200 is a disabled option).
model = AutoModelForSeq2SeqLM.from_pretrained(FLAN_T5_CHECKPOINT, min_length=10)
tokenizer = AutoTokenizer.from_pretrained(FLAN_T5_CHECKPOINT)

# Wrap model + tokenizer in a transformers pipeline, then expose it to LangChain.
pipe = pipeline(task="text2text-generation", model=model, tokenizer=tokenizer)
llm = HuggingFacePipeline(pipeline=pipe)

# Alternative ways to obtain `llm` (kept for reference, not executed):
# llm = HuggingFaceHub(repo_id="google/flan-t5-xxl")
# llm = HuggingFacePipeline.from_model_id("google/flan-t5-xl", task="text2text-generation")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Web pages to index; append more URLs to this list to grow the knowledge base.
urls = [
    "https://python.langchain.com/docs/get_started/introduction",
]
loader = UnstructuredURLLoader(urls=urls)
documents = loader.load()

# To index local PDFs instead, build `documents` like this (not executed):
# pdf_paths = glob("path/to/pdfs/*")
# documents = []
# for path in pdf_paths:
#     loader = UnstructuredPDFLoader(file_path=path)
#     documents.extend(loader.load())



In [9]:
# How many Document objects the loader returned.
len(documents)

1

In [10]:
# Break the loaded documents into overlapping chunks that will be embedded individually.
# No length_function is passed, so chunk_size / chunk_overlap use the splitter's
# default length measure (presumably characters — confirm against the LangChain docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=75
)
texts = text_splitter.split_documents(documents)

In [8]:
# Number of chunks produced by the splitter.
len(texts)

13

In [9]:
# Peek at the first chunk: its text (page_content) and its source metadata.
texts[0]

Document(page_content='Get started\n\nIntroduction\n\nIntroduction\n\nLangChain is a framework for developing applications powered by language models. It enables applications that are:\n\nData-aware: connect a language model to other sources of data\n\nAgentic: allow a language model to interact with its environment', metadata={'source': 'https://python.langchain.com/docs/get_started/introduction'})

In [10]:
# Load the embedding model used to create vectors from the document chunks.
# model_kwargs asks for the model to be placed on the CPU.
embeddings = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
    model_kwargs={"device":"cpu"}
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [11]:
# Embed every chunk and build the FAISS vector store from the results.
# This can take a while to run; the next cell saves the store to disk so that
# subsequent runs can load it from there instead of rebuilding it.
vector_store = FAISS.from_documents(texts,embeddings)

In [12]:
# Persist the vector store to "vector_store.db" so a later session can reload it
# without re-embedding. Despite the .db extension, the file is a Python pickle.
with open("vector_store.db", "wb") as f:
    pickle.dump(vector_store, f)

In [5]:
# Reload the vector store from the pickle file written by the save cell.
# SECURITY: pickle.load can execute arbitrary code while deserializing. Only load a
# vector_store.db that this notebook itself produced — never an untrusted file.
# NOTE(review): this cell's execution count is lower than the cells above it, which
# suggests it was run in a fresh kernel in place of the build/save cells — confirm.
with open("vector_store.db", "rb") as f:
    vector_store = pickle.load(f)

In [6]:
# Question-answering chain: pairs the LLM with a retriever over the vector store,
# using the "stuff" chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever(),
)

In [7]:
# Question used by the cells below; change it to something relevant to the
# documents stored in the vector store.
query = "What is Langchain?"

In [8]:
# Show the chunks the vector store returns for the query under similarity search.
vector_store.search(query, search_type="similarity")

[Document(page_content='Get started\n\nIntroduction\n\nIntroduction\n\nLangChain is a framework for developing applications powered by language models. It enables applications that are:\n\nData-aware: connect a language model to other sources of data\n\nAgentic: allow a language model to interact with its environment', metadata={'source': 'https://python.langchain.com/docs/get_started/introduction'}),
 Document(page_content='Guides\u200b\n\nLearn best practices for developing with LangChain.\n\nEcosystem\u200b\n\nLangChain is part of a rich ecosystem of tools that integrate with our framework and build on top of it. Check out our growing list of integrations and dependent repos.\n\nAdditional resources\u200b', metadata={'source': 'https://python.langchain.com/docs/get_started/introduction'}),
 Document(page_content='Community\u200b\n\nHead to the Community navigator to find places to ask questions, share feedback, meet other developers, and dream about the future of LLM’s.\n\nAPI refer

In [9]:
# Answer from the RetrievalQA chain, i.e. with the knowledge base (the vector
# store behind the chain's retriever) available to the model.
qa.run(query)



'a framework for developing applications powered by language models'

In [10]:
# Baseline for comparison: the same query sent straight to the LLM, without the
# knowledge base (just raw input/output with the model).
llm.predict(query)

'a chain of arithmetical operations'

In [14]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration
from datasets import load_dataset

# One checkpoint id drives the tokenizer, the retriever and the generator.
RAG_CHECKPOINT = "facebook/rag-token-nq"

# NOTE: `tokenizer` and `model` are rebound here. From this cell on they refer to
# the RAG objects, not to any objects previously stored under the same names.
tokenizer = RagTokenizer.from_pretrained(RAG_CHECKPOINT)
retriever = RagRetriever.from_pretrained(
    RAG_CHECKPOINT,
    index_name="exact",
    use_dummy_dataset=True,
)
model = RagTokenForGeneration.from_pretrained(RAG_CHECKPOINT, retriever=retriever)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [21]:
# Ask the RAG model a single open-domain question and print its answer.
question = "who holds the record in 100m freestyle"

# Call the tokenizer directly: prepare_seq2seq_batch is deprecated in transformers,
# and the tokenizer's own __call__ is its documented replacement.
input_dict = tokenizer(question, return_tensors="pt")

# Only the question's token ids are handed to generate().
generated = model.generate(input_ids=input_dict["input_ids"])

# Decode the first generated sequence (one question was passed in), dropping special tokens.
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])

 michael phelps
