In [7]:
import os
import openai
# import deeplake 
from dotenv import load_dotenv

from langchain.document_loaders import TextLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [8]:
load_dotenv(os.getcwd()+'/keys.env')
activeloop_token = os.getenv('ACTIVELOOP_TOKEN')
deeplake_username = os.getenv('DEEPLAKE_USERNAME')
openai.api_key = os.environ.get('OPENAI_API_KEY')

pdfdirname = os.getcwd()+'/../LangChainQueryTexts/ThesisPapers'
pdfdirname = os.path.abspath(pdfdirname)

In [9]:
def load_docs(root_dir):
    docs = []
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for file in filenames:
            print(file)
            try:
                loader = PyPDFLoader(os.path.join(
                    dirpath, file))
                docs.extend(loader.load_and_split())
            except Exception as e:
                print(e)
                pass
    return docs


def split_docs(docs):
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    return text_splitter.split_documents(docs)


def main(root_dir, deep_lake_path):
    docs = load_docs(root_dir)
    texts = split_docs(docs)
    embeddings = OpenAIEmbeddings()
    
    db = DeepLake(dataset_path=deep_lake_path, embedding_function=embeddings)
    
    db.add_documents(texts)

## Load, split, and embed text

In [10]:
docs = load_docs(pdfdirname)
texts = split_docs(docs)

SelfAssemblyOscillations.pdf
Osman.pdf
Sensitivity_analysis_of_autonomous_oscillations_ap.pdf
DesignPrinciplesOscillators.pdf
TunableOscillator.pdf
GeneticAlgorithm.pdf


In [11]:
embeddings = OpenAIEmbeddings()
embeddings

OpenAIEmbeddings(client=<class 'openai.api_resources.embedding.Embedding'>, model='text-embedding-ada-002', deployment='text-embedding-ada-002', openai_api_version='2022-12-01', openai_api_base=None, openai_api_type=None, embedding_ctx_length=8191, openai_api_key=None, openai_organization=None, allowed_special=set(), disallowed_special='all', chunk_size=1000, max_retries=6)

## Create Chroma vectorstore for pdfs

In [12]:
doc_retriever = Chroma.from_documents(texts, embeddings).as_retriever() #initialize vectorstore into retriever

Using embedded DuckDB without persistence: data will be transient


In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor()

## Load chat model

In [13]:
chat = ChatOpenAI(model_name="gpt-4", streaming=True, callbacks=[StreamingStdOutCallbackHandler()])

In [14]:
qa = RetrievalQA.from_chain_type(llm=chat, chain_type="stuff", retriever=doc_retriever, return_source_documents=True)

In [15]:
query = "How does membrane localization effect biochemical kinetics?"
result = qa({"query": query})
# qa.run(query)

Membrane localization affects biochemical kinetics by influencing the time-scales of complexation and assembly of proteins. Protein-lipid affinities (KaPM) are critical in controlling the overall time-scales of complexation, even driving slowdowns in speeds relative to solution binding. Changes in diffusion from solution to the membrane, which are about 100 times slower, affect the magnitude of association and dissociation rates. However, the influence of diffusion on the reaction rates is rarely a dominant factor in physiological rate regimes, indicating that it is the binding strengths rather than slow 2D diffusion that determine assembly speeds. Membrane localization can also increase the sensitivity of complex formation to localization, acting as a trigger for assembly.

In [18]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

In [19]:
memory=ConversationBufferMemory()

In [20]:
convo_qa = ConversationalRetrievalChain.from_llm(chat, doc_retriever,verbose=True, memory=memory)

In [23]:
convo_qa.run(input=query)

ValueError: Missing some input keys: {'chat_history', 'question'}

In [None]:
from langchain import PromptTemplate, LLMChain
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate