# Create Vector Database with Embeddings from Research Papers

Define directories

In [16]:
import os

# Every working location is resolved against the directory the kernel runs in.
base_dir = os.getcwd()
# 'sample_pdfs' is where the input PDF is read from; 'database' is the
# directory handed to the vector store as its persist location.
sample_pdf_dir, persist_dir = (
    os.path.join(base_dir, folder) for folder in ('sample_pdfs', 'database')
)

Load PDF into arrays

In [17]:
import os
import getpass

# Prompt only when the key is not already available, so re-running this cell
# (or running the notebook with OPENAI_API_KEY exported) does not block on input.
# To switch keys, remove OPENAI_API_KEY from os.environ and re-run this cell.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')

In [41]:
import sys
print(sys.path)
# NOTE(review): machine-specific absolute path — adjust it for your own
# environment, or select the matching kernel so this workaround is not needed.
extra_site_packages = '/Users/ruigekong/anaconda3/lib/python3.11/site-packages'
# Append only when missing so re-running the cell does not stack duplicate entries.
if extra_site_packages not in sys.path:
    sys.path.append(extra_site_packages)

['/Applications/PyCharm.app/Contents/plugins/python/helpers-pro/jupyter_debug', '/Applications/PyCharm.app/Contents/plugins/python/helpers/pydev', '/Users/ruigekong/PycharmProjects/2023BioxAIHackathon/Web Scraping', '/Users/ruigekong/PycharmProjects/2023BioxAIHackathon', '/Users/ruigekong/anaconda3/lib/python311.zip', '/Users/ruigekong/anaconda3/lib/python3.11', '/Users/ruigekong/anaconda3/lib/python3.11/lib-dynload', '', '/Users/ruigekong/anaconda3/lib/python3.11/site-packages', '/Users/ruigekong/anaconda3/lib/python3.11/site-packages/aeosa']


In [45]:
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma

# Embedding object shared by the vector store built here and by the one
# reopened from disk in a later cell.
embedding = OpenAIEmbeddings()

# Load the combined sample PDF with image extraction switched off.
pdf_path = os.path.join(sample_pdf_dir, 'combined_research_pdf_sample.pdf')
loader = PyPDFLoader(pdf_path, extract_images=False)
pdfs = loader.load()
print(f'page 0: {pdfs[0].page_content}')

# Build the Chroma vector store from the loaded documents, pointing it at persist_dir.
db = Chroma.from_documents(
    documents=pdfs,
    embedding=embedding,
    persist_directory=persist_dir,
)

page 0: Resource
Comparative Analysis and Reﬁnement of Human
PSC-Derived Kidney Organoid Differentiation withSingle-Cell Transcriptomics
Graphical Abstract
Highlights
dTwo human kidney organoid protocols were compared by
single-cell transcriptomics
dBoth protocols generated 10%–20% non-renal cells
dKidney organoid cells are immature compared with fetal andadult human kidney
dInhibiting BDNF-NTRK2 signaling reduces off-target celltypes by 90%Authors
Haojia Wu, Kohei Uchimura,Erinn L. Donnelly, Yuhei Kirita,Samantha A. Morris,Benjamin D. Humphreys
Correspondence
humphreysbd@wustl.edu
In Brief
A strong characterization of cell types,lineages, and differentiation statespresent in human PSC-derived kidneyorganoids is critical to improvedifferentiation protocols. Wu et al. usesingle-cell transcriptomics to revealnon-renal cell types, describelineage-speciﬁc expression of regulatorygenes, and report a broadly applicablestrategy to reduce off-target cellpopulations.
Wu et al., 2018, Cell Stem 

We can then persist the database to disk

In [22]:
# Persist the vector store, then drop the in-memory reference so the next
# cell has to reopen it from persist_dir.
db.persist()
db = None

And we can load the persisted database from disk whenever it is needed

In [23]:
# Reopen the store located at persist_dir, reusing the same embedding object.
db = Chroma(embedding_function=embedding, persist_directory=persist_dir)

Example Query

In [24]:
# Run a similarity search for the question and show the text of the first hit.
query = "What is High-throughput single-cell transcriptomics on organoids?"
docs = db.similarity_search(query)
best_match = docs[0]
print(best_match.page_content)

High-throughput  single-cell  transcriptomics  on
organoids
Agnieska  Brazovskaja1,  Barbara  Treutlein1,2,3and  J  Gray  Camp1
Three-dimensional  (3D)  tissues  grown  in  culture  from  human
stem  cells  offer  the  incredible  opportunity  to  analyze  and
manipulate  human  development,  and  to  generate  patient-
speciﬁc  models  of  disease.  Methods  to  sequence  DNA  and
RNA  in  single  cells  are  being  used  to  analyze  these  so-called
‘organoid’  systems  in  high-resolution.  Single-cell
transcriptomics  has  been  used  to  quantitate  the  similarity  of
organoid  cells  to  primary  tissue  counterparts  in  the  brain,
intestine,  liver,  and  kidney,  as  well  as  identify  cell-speciﬁc
responses  to  environmental  variables  and  disease  conditions.
The  merging  of  these  two  technologies,  single-cell  genomics
and  organoids,  will  have  profound  impact  on  personalized
medicine  in  the  near  future.
Addresses
1Max  Planck  Institute  for  Evolutio

We use the OpenAI embeddings. The API key is requested by the `getpass` cell near the top of the notebook, so run that cell first; it prompts us to enter the key.

# Create retriever

In [25]:
# Wrap the vector store as a retriever; it is passed to the QA chain built below.
retriever = db.as_retriever()

# Create a question answering chain

In [36]:
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI

# Chat model used by the chain to produce the answer.
llm = ChatOpenAI(model_name='gpt-4', temperature=0.1)

# Question-answering chain over the retriever. return_source_documents=True
# makes the response carry the documents that process_llm_response lists.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    verbose=True,
)

In [37]:
def process_llm_response(llm_response):
    """Print the chain's answer followed by the source of each supporting document.

    Args:
        llm_response: Mapping returned by the QA chain. It must contain
            'result'. 'source_documents' is optional; when present, each item
            is expected to expose a ``metadata`` mapping.

    Raises:
        KeyError: If 'result' is missing from ``llm_response``.
    """
    print(llm_response['result'])
    print('\n\nSources:')
    # Tolerate a response without source documents, and documents without a
    # 'source' entry, instead of raising KeyError halfway through the output.
    for source in llm_response.get("source_documents") or []:
        print(source.metadata.get('source', '<unknown source>'))


In [38]:
# Question
# Run the QA chain on a sample question, then print the answer and its sources.
query = "What is an organoid?"
llm_response = qa_chain(query)
process_llm_response(llm_response)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
An organoid is a miniaturized and simplified version of an organ produced in vitro in three dimensions that shows realistic micro-anatomy. They are derived from one or a few cells from a tissue, embryonic stem cells or induced pluripotent stem cells, which can self-organize in three-dimensional culture owing to their self-renewal and differentiation capacities. Organoids are used for studying individual variations, conducting longitudinal analyses, and modeling the effects of environmental and genetic factors on development in a controlled setting.


Sources:
/Users/ruigekong/PycharmProjects/2023BioxAIHackathon/Web Scraping/sample_pdfs/combined_research_pdf_sample.pdf
/Users/ruigekong/PycharmProjects/2023BioxAIHackathon/Web Scraping/sample_pdfs/combined_research_pdf_sample.pdf
/Users/ruigekong/PycharmProjects/2023BioxAIHackathon/Web Scraping/sample_pdfs/combined_research_pdf_sample.pdf
/Users/ruigekong/PycharmProj