In [1]:
!pip -q install langchain huggingface_hub openai google-search-results tiktoken chromadb rank_bm25 faiss-cpu

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.0/798.0 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.4/225.4 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m51.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.5/215.5 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 kB[0m [31m3.1 M

In [25]:
import os
os.environ["OPENAI_API_KEY"] = "sk-xxx"

from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document

from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

from langchain.embeddings.openai import OpenAIEmbeddings
embedding = OpenAIEmbeddings()

# Hybrid Search

## BM25 Retriever - Sparser retriever

In [4]:
doc_list = [
    "I like apples",
    "I like oranges",
    "Apples and oranges are fruits",
    "I like computers by Apple",
    "I love fruit juice"
]

In [5]:
bm25_retriever = BM25Retriever.from_texts(doc_list)
bm25_retriever.k = 2

In [10]:
bm25_retriever.dict

<bound method BaseModel.dict of BM25Retriever(vectorizer=<rank_bm25.BM25Okapi object at 0x79e7b0851480>, docs=[Document(page_content='I like apples'), Document(page_content='I like oranges'), Document(page_content='Apples and oranges are fruits'), Document(page_content='I like computers by Apple'), Document(page_content='I love fruit juice')], k=2)>

In [8]:
bm25_retriever.get_relevant_documents("Apple")

[Document(page_content='I like computers by Apple'),
 Document(page_content='I love fruit juice')]

In [9]:
bm25_retriever.get_relevant_documents("a green fruit")

[Document(page_content='I love fruit juice'),
 Document(page_content='I like computers by Apple')]

## Embeddings - Dense retrievers FAISS

In [17]:
faiss_store = FAISS.from_texts(doc_list, embedding)
faiss_retriever = faiss_store.as_retriever(search_kwargs={"k": 2})

In [19]:
faiss_retriever.get_relevant_documents("A green fruit")

[Document(page_content='Apples and oranges are fruits'),
 Document(page_content='I like apples')]

## Ensemble Retriever

In [20]:
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, faiss_retriever], weights=[0.5, 0.5])
docs = ensemble_retriever.get_relevant_documents("A green fruit")
docs

[Document(page_content='I love fruit juice'),
 Document(page_content='Apples and oranges are fruits'),
 Document(page_content='I like computers by Apple'),
 Document(page_content='I like apples')]

In [21]:
docs = ensemble_retriever.get_relevant_documents("Apple Phones")
docs

[Document(page_content='I like computers by Apple'),
 Document(page_content='I love fruit juice'),
 Document(page_content='I like apples')]

In [22]:
faiss_retriever.get_relevant_documents("Apple Phones")

[Document(page_content='I like computers by Apple'),
 Document(page_content='I like apples')]

In [24]:
bm25_retriever.get_relevant_documents("Apple Phones")

[Document(page_content='I like computers by Apple'),
 Document(page_content='I love fruit juice')]