In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
import os
from getpass import getpass

# --- Credentials -----------------------------------------------------------
# Reuse a key that is already exported in the environment so the notebook can
# be re-run top to bottom without an interactive prompt; otherwise ask for it
# with getpass so the value is never echoed or stored in the notebook.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# --- Tunable parameters ----------------------------------------------------
# NOTE(review): absolute, machine-specific path — adjust for your environment.
PDF_PATH = "E:\\langchain_RAG\\data\\baichuan.pdf"
MAX_PAGES = 6      # only the first MAX_PAGES loaded documents are split and indexed
CHUNK_SIZE = 500   # forwarded to the splitter as chunk_size
CHUNK_OVERLAP = 0  # forwarded to the splitter as chunk_overlap

# No key is passed explicitly; it is exported through the environment variable
# set above.
embedding = OpenAIEmbeddings()

# Load pdf
# presumably yields one Document per page (the stored metadata carries a
# 'page' field) — verify against the loader if the slice below matters.
loader = PyPDFLoader(PDF_PATH)
data = loader.load()

# Split
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(data[:MAX_PAGES])


In [6]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever

In [7]:
# Keyword-based retriever built from the same chunks that feed the vector store.
bm25_retriever = BM25Retriever.from_documents(splits)
# Cap on returned documents, set after construction; the same value (4) is
# used for the vector-store retriever.
bm25_retriever.k = 4

In [8]:
# Embedding-based store: index the chunks in Chroma using the OpenAI embeddings.
# NOTE(review): no persist directory or collection name is given; re-running
# this cell in the same kernel may insert the same chunks again — confirm
# against the installed Chroma version.
vectordb = Chroma.from_documents(splits, embedding=embedding)

# Expose the store as a retriever that asks for 4 results per query.
retriever = vectordb.as_retriever(search_kwargs=dict(k=4))

In [9]:
# Hybrid retriever: merges the BM25 and vector-store retrievers, giving each
# the same weight (0.5). The weights list is positional, so it must stay in
# the same order as the retrievers list.
ensemble_retriever = EnsembleRetriever(
    retrievers=[
        bm25_retriever,
        retriever,
    ],
    weights=[
        0.5,
        0.5,
    ],
)

In [10]:
# Run one sample question through the hybrid retriever and display the hits.
# The query text (including its trailing full-width question mark) is kept
# exactly as written, since it is the input that determines the results.
query = "What is baichuan2 ？"
docs = ensemble_retriever.invoke(query)
docs

[Document(page_content='Baichuan 2 excels in vertical domains such\nas medicine and law. We will release all\npre-training model checkpoints to benefit the\nresearch community in better understanding\nthe training dynamics of Baichuan 2.\n1 Introduction\nThe field of large language models has witnessed\npromising and remarkable progress in recent years.\nThe size of language models has grown from\nmillions of parameters, such as ELMo (Peters\net al., 2018), GPT-1 (Radford et al., 2018), to', metadata={'page': 0, 'source': 'E:\\langchain_RAG\\data\\baichuan.pdf'}),
 Document(page_content='daniel@baichuan-inc.com.ChatGPT (OpenAI, 2022) from OpenAI, the power\nof these models to generate human-like text has\ncaptured widespread public attention. ChatGPT\ndemonstrates strong language proficiency across\na variety of domains, from conversing casually to\nexplaining complex concepts. This breakthrough\nhighlights the potential for large language models\nto automate tasks involving natural la