In [2]:
# Enable Git LFS so the large weight files are fetched, then clone the two model
# repositories from the Hugging Face Hub (the log shows them landing in
# 'all-MiniLM-L6-v2' and 'flan-t5-large').
# The flan-t5-large clone is heavy: the captured log reports 11.91 GiB of LFS content.
!git lfs install
!git clone https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
!git clone https://huggingface.co/google/flan-t5-large

Git LFS initialized.
Cloning into 'all-MiniLM-L6-v2'...
remote: Enumerating objects: 46, done.[K
remote: Total 46 (delta 0), reused 0 (delta 0), pack-reused 46[K
Unpacking objects: 100% (46/46), 311.33 KiB | 2.31 MiB/s, done.
Filtering content: 100% (3/3), 260.15 MiB | 48.99 MiB/s, done.
Cloning into 'flan-t5-large'...
remote: Enumerating objects: 110, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 110 (delta 0), reused 0 (delta 0), pack-reused 107[K
Receiving objects: 100% (110/110), 635.37 KiB | 15.88 MiB/s, done.
Resolving deltas: 100% (58/58), done.
Filtering content: 100% (5/5), 11.91 GiB | 50.39 MiB/s, done.


In [3]:
!pip install langchain
!pip install torch
!pip install transformers
!pip install faiss-cpu
!pip install pypdf
!pip install sentence-transformers

Collecting langchain
  Downloading langchain-0.0.312-py3-none-any.whl (1.8 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m5.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.7/1.8 MB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m1.2/1.8 MB[0m [31m11.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m13.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.1-py3-none-any.whl (27 kB)
Collecting jsonpatch<2.0,>=1.33 (

In [4]:
# Read the source PDF into `documents`, which the splitting cell consumes.
from langchain.document_loaders import PyPDFLoader
pdfLoader = PyPDFLoader("/home/20220802-EB-Practical_Data_Mesh.pdf") # path of your pdf file
documents = pdfLoader.load()

In [5]:
# Break the loaded documents into overlapping chunks (chunk_size=1000,
# chunk_overlap=150); `docs` is what gets embedded and indexed further down.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)
docs = text_splitter.split_documents(documents)

In [6]:
# Embedding model: the open-source all-MiniLM-L6-v2 sentence transformer, loaded
# from a local directory and run on CPU without normalising the output vectors.
# NOTE(review): the path presumably points at the clone made in the git cell,
# which assumes the notebook's working directory is /content — confirm.
from langchain.embeddings import HuggingFaceEmbeddings

modelPath = "/content/all-MiniLM-L6-v2"
model_kwargs = dict(device="cpu")
encode_kwargs = dict(normalize_embeddings=False)
embeddings = HuggingFaceEmbeddings(model_name=modelPath, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

In [7]:
# Build the FAISS vector store from the chunk list `docs` using `embeddings`.
# This must run while `docs` still holds the chunks from the splitting cell
# (a later cell rebinds `docs` to search results).
from langchain.vectorstores import FAISS
db = FAISS.from_documents(docs, embeddings)

In [20]:
# Sanity-check retrieval on its own: run a similarity search for the question
# and print the text of the first returned chunk.
question = "What is the principles of data mesh?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

13  The Principles of Data Mesh
Data mesh is based on four main principles: data as a product, domain 
ownership, self-service, and federated governance.  While each of these 
principles is interrelated and plays an essential role, treating data as a 
product  is a fundamental shift in how organizations create, store, and 
communicate important business data. 
Data mesh moves the responsibility of providing reliable and useful access 
to data back to the data’s owner from the centralized data team. Data is no 
longer treated as an application’s byproduct, but instead is promoted as a 
first-class citizen on par with other products created and used within an 
organization. This requires a shift in responsibilities with respect to how 
data is created, modeled, and made available across an organization.


In [9]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,pipeline
from langchain import HuggingFacePipeline

# Load the tokenizer and seq2seq weights from the local flan-t5-large directory.
tokenizer = AutoTokenizer.from_pretrained("/content/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("/content/flan-t5-large")

# Set the generation length on the transformers pipeline itself. Keyword
# arguments given here become the pipeline's default generation parameters,
# whereas `model_kwargs` handed to the LangChain wrapper around an
# already-built pipeline are not guaranteed to reach `generate()`, which would
# leave answers capped at the model's short default length.
pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
)

# Wrap the pipeline so LangChain chains can call it as an LLM. The wrapper
# arguments are kept as they were, so the object's repr is unchanged.
llm = HuggingFacePipeline(
    pipeline = pipe,  # pass the instance of the pipeline, not the function
    model_kwargs={"temperature": 0, "max_length": 512},
)

In [10]:
# Display the wrapper's repr to confirm which pipeline and model_kwargs it holds.
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x7c47327824d0>, model_kwargs={'temperature': 0, 'max_length': 512})

In [14]:
from langchain.chains.question_answering import load_qa_chain

In [15]:
# Question-answering chain over `llm` using the "stuff" chain type.
chain = load_qa_chain(llm, chain_type="stuff")

In [18]:
# Retrieve the chunks most similar to the query; they are passed to the QA chain
# in the next cell.
# NOTE: this rebinds `docs`, which previously held the full list of chunks from
# the splitting cell. Re-running the FAISS index-building cell after this one
# would index only these search hits, so re-run the splitting cell first.
query = "What is the principles of data mesh?"
docs = db.similarity_search(query)

In [19]:
# Answer the query from the retrieved chunks; the returned string is the cell output.
chain.run(input_documents=docs, question=query)

'data as a product, domain ownership, self-service, and federated governance'

In [None]:
from langchain.chains import RetrievalQA
qa_chain = RetrievalQA.from_chain_type(
  llm=llm,
  chain_type="stuff",
  retriever=db.as_retriever(),
  chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)
result = qa_chain ({ "query" : question })
print(result["result"])

In [None]:
from langchain.prompts import PromptTemplate

template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Keep the answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)