In [1]:
import os
import openai
# import deeplake 
from dotenv import load_dotenv

from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
# Credentials are read from a keys.env file located in the current working directory.
load_dotenv(os.getcwd() + '/keys.env')
activeloop_token = os.environ.get('ACTIVELOOP_TOKEN')
deeplake_username = os.environ.get('DEEPLAKE_USERNAME')
openai.api_key = os.getenv('OPENAI_API_KEY')

# Absolute, normalized path of the directory (relative to the working directory) to scan for PDFs.
pdfdirname = os.path.abspath(os.getcwd() + '/../LangChainQueryTexts/ThesisPapers')

In [3]:
def load_docs(root_dir):
    """Load every PDF found under ``root_dir``.

    Walks ``root_dir`` recursively and reads each file whose name ends in
    ``.pdf`` (case-insensitive) with ``PyPDFLoader.load_and_split()``. Other
    files (e.g. ``.DS_Store``, notes) are ignored instead of being handed to the
    PDF parser. Loading is best-effort: a file that cannot be read is reported
    together with its path and skipped, so one bad PDF does not abort the run.

    Args:
        root_dir: Directory to search for PDF files.

    Returns:
        A list containing everything ``load_and_split()`` returned for the
        readable PDFs; empty if no PDF was found or ``root_dir`` does not exist.
    """
    docs = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Sorting in place makes os.walk descend in a stable order, so the
        # resulting document order does not depend on the filesystem.
        dirnames.sort()
        for file in sorted(filenames):
            if not file.lower().endswith('.pdf'):
                continue
            print(file)
            path = os.path.join(dirpath, file)
            try:
                docs.extend(PyPDFLoader(path).load_and_split())
            except Exception as e:
                # Deliberately broad: keep going, but say which file failed.
                print(f"Skipping {path}: {e}")
    return docs


def split_docs(docs, chunk_size=1000, chunk_overlap=0):
    """Split documents into chunks with a ``CharacterTextSplitter``.

    Args:
        docs: Documents to split (as returned by ``load_docs``).
        chunk_size: Value passed to the splitter as ``chunk_size``. Defaults to
            1000, the value this notebook has always used.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 0.

    Returns:
        Whatever ``CharacterTextSplitter.split_documents`` returns for ``docs``.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return text_splitter.split_documents(docs)


def main(root_dir, deep_lake_path):
    """Load the PDFs under ``root_dir``, split them, and add them to a DeepLake store.

    Args:
        root_dir: Directory searched for PDFs (passed to ``load_docs``).
        deep_lake_path: Passed to ``DeepLake`` as ``dataset_path``.

    Returns:
        The ``DeepLake`` vector store after the chunks have been added, so the
        caller can query it. (Previously the store was built and discarded.)
    """
    docs = load_docs(root_dir)
    texts = split_docs(docs)
    embeddings = OpenAIEmbeddings()

    db = DeepLake(dataset_path=deep_lake_path, embedding_function=embeddings)
    db.add_documents(texts)
    return db

## Load, split, and embed text

In [4]:
# Read the PDFs under pdfdirname (file names are printed as they are visited),
# then break the loaded documents into chunks.
docs = load_docs(pdfdirname)
texts = split_docs(docs)

MyDraft1.pdf
SelfAssemblyOscillations.pdf
Osman.pdf
Sensitivity_analysis_of_autonomous_oscillations_ap.pdf
DesignPrinciplesOscillators.pdf
TunableOscillator.pdf
GeneticAlgorithm.pdf


In [5]:
# Embedding client built with library defaults (no arguments are overridden here).
embeddings = OpenAIEmbeddings()
# Bare expression: displays the configured client as the cell output.
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='2022-12-01', openai_api_base=None, openai_api_type=None, embedding_ctx_length=8191, openai_api_key=None, openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6, request_timeout=None)

## Create a Chroma vector store for the PDFs

In [6]:
# Embed all chunks into a Chroma vector store and expose that store as a retriever.
doc_retriever = (
    Chroma.from_documents(texts, embeddings)
    .as_retriever()
)

Using embedded DuckDB without persistence: data will be transient


In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# LLMChainExtractor cannot be created with a bare constructor call: it needs an
# LLM chain, which the from_llm() factory builds from a language model.
# A dedicated non-streaming model is created here so this cell does not depend
# on the `chat` object defined further down the notebook.
compressor_llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
compressor = LLMChainExtractor.from_llm(compressor_llm)

# Wrap the Chroma retriever so retrieved chunks are passed through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=doc_retriever,
)

## Load chat model

In [9]:
# Chat model with streaming enabled; the callback handler is StreamingStdOutCallbackHandler.
chat = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    streaming=True,
    callbacks=[StreamingStdOutCallbackHandler()],
)

In [20]:
# Question-answering chain over the PDF retriever, using the "map_reduce" chain type.
# Source documents are returned alongside the answer.
qa = RetrievalQA.from_chain_type(
    llm=chat,
    chain_type="map_reduce",
    retriever=doc_retriever,
    return_source_documents=True,
    verbose=True,
)

In [21]:
query = "How does membrane localization effect biochemical kinetics?"
# Call the chain with a dict so the whole output dict (answer and source documents) is kept.
result = qa({"query": query})
# Left disabled: run() only supports chains with exactly one output key, and this
# chain was built with return_source_documents=True, which adds a second one.
# qa.run(query)



[1m> Entering new RetrievalQA chain...[0m
Membrane localization can generate nonlinear feedback by selectively accelerating reactions and assembly through enhancing the effective binding affinity of reactants on the membrane. This effect is explained by both a reduced diffusive search space in 2D vs 3D, as well as increased effective concentration with the same reactant copy numbers in the lower dimensional space. The effective binding enhancement has been mathematically modeled in recent work as a function of the volume to surface area ratio (V/A) of the system, which scales the forward kinetics of biomolecular reactions. According to their analysis using a physiological realistic range of cell V/A, bimolecular kinetics can be enhanced by multiple orders of magnitude compared to their counterparts in 3D diffusive space. Membrane localization is not a static binding enhancement, but rather is regulated by the activity of lipid-modifying enzymes, allowing for spatiotemporal control 

In [19]:
# Display the full response dict: 'query', 'result' and 'source_documents'.
result

{'query': 'How does membrane localization effect biochemical kinetics?',
 'result': 'Membrane localization can enhance biochemical kinetics by selectively accelerating reactions and assembly through a reduced diffusive search space in 2D vs 3D, as well as increased effective concentration with the same reactant copy numbers in the lower dimensional space. This enhancement has been mathematically modeled as a function of the volume to surface area ratio (V/A) of the system, which scales the forward kinetics of biomolecular reactions. Membrane localization is not a static binding enhancement, but rather is regulated by the activity of lipid-modifying enzymes, allowing cells to have spatiotemporal control of specific membrane-protein affinities to enable encoding of complex molecular logic and spatiotemporal memory in the lipid membrane.',
 'source_documents': [Document(page_content='2006), there exists the potential for nonlinear feedback mechanisms between enzymatic activity and PIP met

In [18]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

In [19]:
memory=ConversationBufferMemory()

In [20]:
convo_qa = ConversationalRetrievalChain.from_llm(chat, doc_retriever,verbose=True, memory=memory)

In [23]:
convo_qa.run(input=query)

ValueError: Missing some input keys: {'chat_history', 'question'}

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate