# Installing Packages

In [13]:
!pip -q install pypdf langchain_community chromadb sentence-transformers einops langchain_openai

# Loading PDF

In [14]:
from langchain_community.document_loaders import PyPDFDirectoryLoader

In [15]:
# Directory loader pointed at the local "./articles" folder.
loader = PyPDFDirectoryLoader(path="./articles")

In [16]:
# Run the loader; the result is kept in `docs` and later handed to the text splitter.
docs = loader.load()

In [17]:
# Sanity check: how many items the loader returned.
len(docs)

1

# Split and Tokenize Documents

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer


In [19]:
# Target chunk length; the splitter measures length with the Hugging Face tokenizer below.
chunk_size = 300

bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Neighbouring chunks overlap by a tenth of the chunk size.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    bert_tokenizer,
    chunk_size=chunk_size,
    chunk_overlap=chunk_size // 10,
)

# Split the loaded documents into chunks for embedding.
chunks = text_splitter.split_documents(docs)

# Creating Embeddings and Storing Them in a Vector Store

In [20]:
from typing import List

from langchain_community.embeddings import HuggingFaceEmbeddings


class NomicPrefixedEmbeddings(HuggingFaceEmbeddings):
    """HuggingFaceEmbeddings variant that adds the task prefixes nomic-embed-text expects.

    The nomic-embed-text-v1 model card states that every input must start with a
    task instruction prefix: "search_document: " for texts that are indexed and
    "search_query: " for the question used at retrieval time. Embedding raw text
    still runs, but documents and queries are then not embedded the way the
    model was trained to compare them.
    """

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed texts to be stored in the index, each prefixed as a search document."""
        return super().embed_documents([f"search_document: {text}" for text in texts])

    def embed_query(self, text: str) -> List[float]:
        """Embed a retrieval query, prefixed as a search query."""
        return super().embed_query(f"search_query: {text}")


embed_model_name = "nomic-ai/nomic-embed-text-v1"
# trust_remote_code is required because the model repository ships custom modelling code.
model_kwargs = {"device": "cpu", "trust_remote_code": True}
encode_kwargs = {
    "normalize_embeddings": False,
}

# NOTE: vectors stored before the prefixes were introduced are not comparable with the
# ones produced here; rebuild any previously persisted vector store.
instructor_embeddings = NomicPrefixedEmbeddings(model_name=embed_model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs)

<All keys matched successfully>


In [21]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
import hashlib

persist_directory="./chroma_db"

embedding = instructor_embeddings

# Deterministic ids (chunk position + content hash) make this cell safe to re-run:
# without explicit ids every run gets fresh random ids, so the same chunks are appended
# to the persisted collection again and retrieval starts returning duplicates.
# If the source documents change, ids change too and stale entries are not removed;
# delete the persist directory to rebuild from scratch in that case.
chunk_ids = [
    hashlib.sha256(f"{position}:{chunk.page_content}".encode("utf-8")).hexdigest()
    for position, chunk in enumerate(chunks)
]

vectordb = Chroma.from_documents(documents=chunks,
                                 embedding=embedding,
                                 ids=chunk_ids,
                                 persist_directory=persist_directory)

# No explicit vectordb.persist(): the call is deprecated (it triggers the deprecation
# warning seen in this cell's output) because Chroma >= 0.4 writes to persist_directory
# automatically. NOTE(review): restore the call only if pinned to chromadb < 0.4.

  warn_deprecated(


In [22]:
# Expose the vector store as a retriever that returns the 3 best matches per query.
retriever = vectordb.as_retriever(search_kwargs=dict(k=3))

In [23]:
# User question; used to query the retriever and later inserted into the LLM prompt.
question = "What is yen kai experienced in?"

In [24]:
# Query the retriever with the question; the returned documents become the prompt context.
relevant_docs = retriever.invoke(question)

In [25]:
# Concatenate the text of the retrieved documents, separated by blank lines.
context = "\n\n".join(doc.page_content for doc in relevant_docs)

# OpenAI Model

In [26]:
import os
from getpass import getpass

# The OpenAI client used below needs OPENAI_API_KEY in the environment. Ask for it only
# when it is not already set, so the notebook runs on a fresh kernel and never needs the
# key hard-coded; when the variable is already exported this cell does nothing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [27]:
# import os

# os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [28]:
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

# Prompt text with two placeholders: the retrieved context and the user's question.
template = (
    "You are a QnA chatbot. Answer the question based on the context below.\n"
    "If the answer cannot be answered using the provided context, answer with \"I don't know\"\n"
    "\n"
    "Context: {context}\n"
    "Question: {question} "
)

prompt = PromptTemplate(
    template=template,
    input_variables=["question", "context"],
)

# temperature=0 and a 256-token cap on the completion.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    max_tokens=256,
)

In [29]:
# Pipe the prompt template into the chat model so a single invoke() formats and sends the prompt.
llm_chain = prompt | llm

In [30]:
# Values for the two prompt placeholders.
inputs = dict(question=question, context=context)

# Run the prompt -> model chain once with those values.
generate = llm_chain.invoke(inputs)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [31]:
# Show the text of the model's reply.
generate.content

'Lim Yen Kai is experienced in Python, Tensorflow, Keras, PyTorch, Scikit-Learn, Apache Spark, Databricks, Matlab, LLM, Prompt Engineering, Hugging Face, Azure, GCP, AWS, Ansible, OpenShift, Docker, Kubernetes, Git, Hadoop, SQL, Bootstrap, Django, Flask, Elastic, JavaScript, Java, C#, Frontend, Backend, Full-Stack, TypeScript, Angular, React, Dart, Flutter, Xamarin, DevOps, MLOps, English, Mandarin, Malay, and Cantonese.'