In [39]:
from langchain_community.llms import Ollama
from langchain_community.document_loaders import CSVLoader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain import hub
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers.string import StrOutputParser

In [7]:
llm = Ollama(model="llama3")
llm

Ollama(model='llama3')

In [27]:
loader = CSVLoader("data/recount3.csv")
docs = loader.load()
len(docs)

18830

In [52]:
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.split_documents(docs)
len(documents)

4

In [53]:
documents[:2]

[Document(page_content='organism: human\nproject_home: data_sources/sra\nproject: SRP179061\nn_samples: 113\nstudy_title: Alzheimer\'s gene expression by cell type - SFG\nstudy_abstract: AD patients all had Braak stages V or VI, and were also pathologically confirmed to have amyloid plaque. The "SAMPLE_ID" sample characteristic is a sample identifier internal to Genentech. The ID of this project in Genentech\'s ExpressionPlot database is PRJ0018621 Overall design: RNA from purified cell types from AD and control post-mortem frozen superior frontal gyrus of AD and control patients.', metadata={'source': 'data/recount3.csv', 'row': 859}),
 Document(page_content='organism: human\nproject_home: data_sources/sra\nproject: SRP100948\nn_samples: 117\nstudy_title: Heterogeneity in neurodegenerative disease\nstudy_abstract: RNA was purified from fusiform gyrus tissue sections of autopsy-confirmed Alzheimer\'\'s cases and neurologically normal age-matched controls. The "SAM.ID" sample characteri

In [47]:
vector_store = FAISS.from_documents(documents, GPT4AllEmbeddings())
retriever = vector_store.as_retriever()
docs = vector_store.similarity_search("I want to find the available data to study Alzheimer's disease.")

len(docs)

4

In [54]:
prompt = hub.pull("rlm/rag-prompt")

In [55]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

In [56]:
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [60]:
rag_chain.invoke("recommend available 3 datasets that sequenced BV-173 cell-line. And Please tell the project id and the number of samples.")

'Based on the provided context, I recommend three available datasets that sequenced BV-173 cell-line. \n\n1. SRP106526 - This dataset has 6 samples from a study titled "Homo sapiens Cell line Raw sequence reads". \n2. SRP017465 - This dataset has 23 samples from a study titled "RNA sequencing results" focused on human cell lines.\n3. SRP028528 - This dataset has 25 samples from a study titled "Homo sapiens Transcriptome or Gene expression", which includes data from chronic phase and blast crisis CML, normal cord blood cells, and transduced lentiviral vectors.\n\nThese datasets are available at the SRA (Sequence Read Archive) under the project IDs mentioned.'

In [70]:
vector_store.save_local("index/recount3.index")