In [42]:
import os
import openai
# import deeplake 
from dotenv import load_dotenv

from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [43]:
load_dotenv(os.getcwd()+'/keys.env')
activeloop_token = os.getenv('ACTIVELOOP_TOKEN')
deeplake_username = os.getenv('DEEPLAKE_USERNAME')
openai.api_key = os.environ.get('OPENAI_API_KEY')

pdfdirname = os.getcwd()+'/../LangChainQueryTexts/ThesisPapers'
pdfdirname = os.path.abspath(pdfdirname)

In [45]:
def load_docs(root_dir):
    docs = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for file in filenames:
            print(file)
            try:
                loader = PyPDFLoader(os.path.join(
                    dirpath, file))
                docs.extend(loader.load_and_split())
            except Exception as e:
                print(e)
                pass
    return docs


def split_docs(docs):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(docs)


def main(root_dir, deep_lake_path):
    docs = load_docs(root_dir)
    texts = split_docs(docs)
    embeddings = OpenAIEmbeddings()
    
    db = DeepLake(dataset_path=deep_lake_path, embedding_function=embeddings)
    
    db.add_documents(texts)

## Load, split, and embed text

In [46]:
docs = load_docs(pdfdirname)
texts = split_docs(docs)
texts

GeneticAlgorithm.pdf
Sensitivity_analysis_of_autonomous_oscillations_ap.pdf
SelfAssemblyOscillations.pdf
Osman.pdf
TunableOscillator.pdf
DesignPrinciplesOscillators.pdf


[Document(page_content='Pušniketal. JournalofBiologicalEngineering           (2019) 13:75 \nhttps://doi.org/10.1186/s13036-019-0205-0\nMETHODOLOGY OpenAccess\nComputationalanalysisofviable\nparameterregionsinmodelsofsynthetic\nbiologicalsystems\nŽigaPušnik,MihaMraz,NikolajZimicandMihaMoškon*\nAbstract\nBackground: Gene regulatorynetworkswithdifferenttopologicaland/ordynamicalpropertiesmightexhibitsimilar\nbehavior.Systemthatislessperceptivefortheperturbationsofitsinternalandexternalfactorsshouldbepreferred.\nMethodsforsensitivityandrobustnessassessmenthavealreadybeendevelopedandcanberoughlydividedinto\nlocalandglobalapproaches.Localmethodsfocusonlyonthelocalareaaroundnominalparametervalues.Thiscan\nbeproblematicwhenparametersexhibitsthedesiredbehavioroveralargerangeofparameterperturbationsor\nwhenparametervaluesareunknown.Globalmethods,ontheotherhand,investigatethewholespaceofparameter\nvaluesandmostlyrelyondifferentsamplingtechniques.Thiscanbecomputationallyinefficient.Toaddressthese\

In [47]:
embeddings = OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='2022-12-01', openai_api_base=None, openai_api_type=None, embedding_ctx_length=8191, openai_api_key=None, openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6)

In [71]:
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

## Create Chroma vectorstore for pdfs

In [74]:
doc_retriever = Chroma.from_documents(texts, embeddings).as_retriever() #initialize vectorstore into retriever

Using embedded DuckDB without persistence: data will be transient


In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor()

## Load chat model

In [75]:
chat = ChatOpenAI(model_name="gpt-4", streaming=True, callbacks=[StreamingStdOutCallbackHandler()])

In [76]:
qa = RetrievalQA.from_chain_type(llm=chat, chain_type="stuff", retriever=doc_retriever, return_source_documents=True)

In [77]:
query = "How does membrane localization effect biochemical kinetics?"
result = qa({"query": query})
# qa.run(query)

Membrane localization affects biochemical kinetics in several ways. One significant influence is on the binding affinities and association/dissociation rates of proteins. As shown in the provided context, protein-lipid affinities (KaPM) are often critical in controlling the overall time-scales of complexation, even driving slowdowns in speeds relative to solution binding. The change in binding strengths, rather than the slower 2D diffusion on the membrane, primarily determines assembly speeds in physiological rate regimes.

Moreover, the change from 3D to 2D interactions on the membrane surface reduces the search space for protein-protein interactions, which can lead to enhanced complex formation and faster assembly. However, the influence of diffusion on reaction rates is rarely a dominant factor in physiological rate regimes.

It is important to note that as spatial dimensions increase, times to diffuse and reach the membrane will influence the overall equilibration times. Additional

In [80]:
display(result["result"])

'Membrane localization affects biochemical kinetics in several ways. One significant influence is on the binding affinities and association/dissociation rates of proteins. As shown in the provided context, protein-lipid affinities (KaPM) are often critical in controlling the overall time-scales of complexation, even driving slowdowns in speeds relative to solution binding. The change in binding strengths, rather than the slower 2D diffusion on the membrane, primarily determines assembly speeds in physiological rate regimes.\n\nMoreover, the change from 3D to 2D interactions on the membrane surface reduces the search space for protein-protein interactions, which can lead to enhanced complex formation and faster assembly. However, the influence of diffusion on reaction rates is rarely a dominant factor in physiological rate regimes.\n\nIt is important to note that as spatial dimensions increase, times to diffuse and reach the membrane will influence the overall equilibration times. Addit

In [69]:
result["source_documents"]

[Document(page_content='Weapply thetheory here tocharacterizing, within aquantitative framework, therole of\nmembrane localization forenhancing 55binding interactions involving 33distinct protein\npair interaction sets(S1Table). Through simulation, wealso move beyond themodel illus-\ntrated inFig1ofonly pairs ofsoluble binding partners toshow how complexation involving\nnon-membrane binding scaffold proteins such asclathrin, orhow formation ofhigher-order\noligomers, which isfunctionally important fordriving membrane remodeling [11, 17,18], can\nalso beregulated bymembrane localization (13additional interaction setsinS2Table). Our\ntheory only applies tothepair interactions illustrated inFig1.Weinclude 22proteins involved\ninCME inboth human and yeast cells, aswell as15proteins involved inlipid regulation, vesi-\ncleformation onendosomes, budding, and morphogenesis inyeast cells (Table 1).Wecol-\nlected concentration and cellular geometry data based oninvivo values tobetter connect to\

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate