In [13]:
import os

from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import GPT4All
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

In [14]:
# Step 1: Load the PDF Document
# Use load() rather than load_and_split(): load_and_split() already chunks the
# pages with a default splitter, so the explicitly configured splitter in the
# next step would end up re-splitting pre-split chunks. Loading the raw
# documents here leaves chunking to that single, configured splitter.
loader = PyPDFLoader("doc.pdf")
documents = loader.load()

In [15]:
# Step 2: Text Splitting
# Chunk the loaded documents with a recursive splitter configured for a
# chunk size of 1024 and an overlap of 64 between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=64,
    chunk_size=1024,
)
texts = text_splitter.split_documents(documents=documents)

In [16]:
# Pull settings from a local .env file (if present) into the environment,
# then read the OpenAI key that the embeddings step depends on.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')

# Fail fast with an actionable message instead of letting a missing key
# surface later as a less obvious error when the embeddings are created.
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; add it to a .env file or export it "
        "in the environment before running this notebook."
    )

In [17]:

# Step 3: Creating Embeddings
# Pass the key read from the environment explicitly so the dependency of this
# cell on `openai_api_key` is visible (the variable was otherwise unused).
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [18]:
# Step 4: Vector Store
# Embed the chunks and index them in a Chroma collection stored under "db".
db = Chroma.from_documents(texts, embeddings, persist_directory="db")

# Explicitly flush the index to disk: a persist_directory was requested, but
# on Chroma versions that do not auto-persist the data may never be written
# without this call. NOTE(review): presumably redundant on newer Chroma
# releases that persist automatically — confirm against the pinned version.
db.persist()

In [None]:
# Step 5: Load the gpt4all Model
# Local model weights are expected next to the notebook.
model_path = "./ggml-gpt4all-j-v1.3-groovy.bin"

# Instantiate the local LLM from the weights file using the "gptj" backend,
# with n_ctx=1000 and verbose output turned off.
llm = GPT4All(
    model=model_path,
    backend="gptj",
    n_ctx=1000,
    verbose=False,
)