# Embedding

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from sentence_transformers import SentenceTransformer
import torch
# BGE English v1.5 checkpoint ids, annotated with their embedding width.
bge_small = "BAAI/bge-small-en-v1.5"  # 384-dimensional
bge_base = "BAAI/bge-base-en-v1.5"  # 768-dimensional
bge_large = "BAAI/bge-large-en-v1.5"  # 1024-dimensional

# Embedder used by the rest of the notebook, built on the base checkpoint.
# NOTE(review): torch_dtype sits at the top level of model_kwargs here, whereas
# the HuggingFaceEmbeddings variant kept below for reference nests it one level
# deeper under its own "model_kwargs" key. Confirm the flat form is what the
# installed langchain_community version expects.
emb_bge_base = HuggingFaceBgeEmbeddings(
    model_name=bge_base,
    model_kwargs={
        "trust_remote_code": True,
        "torch_dtype": torch.bfloat16,
    },
    # Both instruction strings are spelled out for visibility; per the original
    # author's note they are the same as the library defaults.
    query_instruction="Represent this question for searching relevant passages: ",
    embed_instruction="",
    show_progress=True,
)

# The wrapper is expected to expose a SentenceTransformer as `.client`.
assert isinstance(emb_bge_base.client, SentenceTransformer)

# Reference only (not executed): the same setup through HuggingFaceEmbeddings,
# where the dtype goes inside a nested model_kwargs that is passed on to
# SentenceTransformer.
#
# emb_model = HuggingFaceEmbeddings(
#     model_name=bge_base,
#     model_kwargs={
#         "trust_remote_code": True,
#         "model_kwargs": {"torch_dtype": torch.bfloat16},
#     },
#     show_progress=True,
# )
# assert isinstance(emb_model._client, SentenceTransformer)

  emb_bge_base = HuggingFaceBgeEmbeddings(


In [2]:
import numpy as np

# Sanity check only: embed_query / embed_documents are not normally called by
# hand. They take plain text (str / list[str]), not Document objects.
a = emb_bge_base.embed_query("this is my cat")

# Length of the returned vector, shown as a shape tuple.
np.shape(a)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

(768,)

## Load and split and embed documents

In [3]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF (path is relative to the working directory) into `docs`.
loader = PyPDFLoader(file_path="tsla-20221231-gen.pdf")
docs = loader.load()

In [4]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks: size limit 2000, overlap 200.
splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=2000,
)
splitted_docs = splitter.split_documents(docs)

# Number of loaded documents, then number of chunks after splitting.
print(f"{len(docs)} {len(splitted_docs)}")

251 617


In [5]:
# Smoke test: embed the raw text of every chunk directly with the embedder.
chunked_texts = [chunk.page_content for chunk in splitted_docs]
embeddings = emb_bge_base.embed_documents(texts=chunked_texts)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
import numpy as np

# Shape of the embedding matrix: one row per entry in `embeddings`.
np.shape(embeddings)

(617, 768)

## Embedding storage

In [8]:
# Start a local pgvector instance for this notebook with:
# docker run -e POSTGRES_USER=langchain -e POSTGRES_PASSWORD=langchain -e POSTGRES_DB=langchain -p 6024:5432 -d pgvector/pgvector:pg16
import os

from langchain_postgres.vectorstores import PGVector, DistanceStrategy

# The default targets the throwaway docker container above. Set
# PGVECTOR_CONNECTION to point at any other database so real credentials never
# have to be written into the notebook.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://langchain:langchain@localhost:6024/langchain",
)

# Embed every chunk and store it in Postgres, compared by cosine distance.
# No ids are supplied, so each call inserts the chunks as new rows; without
# pre_delete_collection a re-run of this cell would leave duplicate chunks that
# then show up in similarity_search results. A dedicated collection name keeps
# the reset from touching any other data in the same database.
db = PGVector.from_documents(
    splitted_docs,
    emb_bge_base,
    connection=connection,
    collection_name="tsla_10k_2022_bge_base",
    distance_strategy=DistanceStrategy.COSINE,
    pre_delete_collection=True,
)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

In [23]:
# Retrieve the 4 stored chunks ranked most similar to the question.
searched_docs = db.similarity_search(
    query="what is company' business type",
    k=4,
)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
# Show the text of the second-ranked hit (index 1).
second_hit = searched_docs[1]
print(second_hit.page_content)

PART	I
ITEM	1.	
BUSINESS
Overview
We	design,	develop,	manufacture,	sell	and	lease	high-performance	fully	electric	vehicles	and	energy	generation	and	storage	systems,	and	offer
	
services	related	to	our	products.	We	generally	sell	our	products	directly	to	customers,	and	continue	to	grow	our	customer-facing	infrastructure	through
	
a	global	network	of	vehicle	service	centers,	Mobile	Service,	body	shops,	Supercharger	stations	and	Destination	Chargers	to	accelerate	the	widespread
	
adoption	of	our	products.	We	emphasize	performance,	attractive	styling	and	the	safety	of	our	users	and	workforce	in	the	design	and	manufacture	of	our
	
products	and	are	continuing	to	develop	full	self-driving	technology	for	improved	safety.	We	also	strive	to	lower	the	cost	of	ownership	for	our	customers
	
through	continuous	efforts	to	reduce	manufacturing	costs	and	by	offering	financial	and	other	services	tailored	to	our	products.
	
Our	mission	is	to	accelerate	the	world’s	transition	to	sustainable	energy.	We	be

## Advanced
ref: https://huggingface.co/blog/matryoshka

ref: https://huggingface.co/blog/embedding-quantization


# Retrieval
ref:
https://python.langchain.com/docs/how_to/multi_vector/

https://python.langchain.com/docs/concepts/retrievers/

https://python.langchain.com/docs/concepts/retrieval/

https://www.youtube.com/watch?v=gTCU9I6QqCE




## Patterns and topics
1. Search APIs
2. Relational or graph database
3. Vector store
4. Ensemble (Multiple retrievers)
5. Source document retention
   * MultiVectorRetriever: Creates multiple vectors for each document. Each vector can be created in a myriad of ways - examples include summaries of the text and hypothetical questions. Use it if you are able to extract information from documents that you think is more relevant to index than the text itself (tables, images, summaries).
   * ParentDocumentRetriever: Indexes multiple chunks for each document; at query time it finds the chunks that are most similar in embedding space, but retrieves and returns the whole parent document (rather than the individual chunks). Use it when your pages have lots of smaller pieces of distinct information that are best indexed by themselves, but best retrieved all together.

6. RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval
7. ColBERT: Optimizing Embeddings
8.

### Ensemble Retriever

### Multivector Retriever

In [None]:
from langchain.retrievers import MultiVectorRetriever


### ParentDocumentRetriever

In [None]:
from langchain.retrievers import ParentDocumentRetriever

In [None]:
# TODO: finish these imports. As written they name a module but no symbol,
# which is a SyntaxError and stops a Restart & Run All at this cell. They are
# parked as comments until the intended names are filled in.
# from langchain.retrievers import ...
# from langchain_milvus.vectorstores import ...