In [1]:
import os
import langchain
import openai

api_key_path = os.path.join(
    "C:\\Users", "DXT6", "OneDrive - PETROBRAS",
    "CENPES", "TEO", "keys", "OpenAI", "apikey.txt"
)

with open(api_key_path, "r") as f:
    api_key = f.read()

In [2]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

pdfs_base = os.path.join("..", "pdfs")
chunk_size = 500
chunk_overlap = 50

In [3]:
r_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

In [5]:
from glob import glob

pdf_paths = glob(os.path.join(pdfs_base, "*.pdf"))[:10]

# Load PDF
loaders = [PyPDFLoader(path) for path in pdf_paths]
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [6]:
splits = r_splitter.split_documents(docs)

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings(openai_api_key=api_key)

In [8]:
from langchain.vectorstores import Chroma

persist_directory = os.path.join("..", "..", "data", "chroma")
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_directory
)

Unable to connect optimized C data functions [No module named 'clickhouse_connect.driverc.buffer'], falling back to pure Python
Unable to connect ClickHouse Connect C to Numpy API [No module named 'clickhouse_connect.driverc.npconv'], falling back to pure Python


In [None]:
# from langchain.vectorstores import Chroma
# from langchain.embeddings.openai import OpenAIEmbeddings

# persist_directory = os.path.join("..", "..", "data", "chroma")
# embedding = OpenAIEmbeddings(openai_api_key=api_key)
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [9]:
print(vectordb._collection.count())

953


In [19]:
question = "What are the types of dynamic neural network?"
docs = vectordb.similarity_search(question, k=3)
len(docs)

3

In [27]:
# for doc in docs:
#     print(doc.page_content)
#     print()

In [24]:
vectordb.persist()

In [None]:
texts = [
    """The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).""",
    """A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.""",
    """A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.""",
]
smalldb = Chroma.from_texts(texts, embedding=embedding)
question = "Tell me about all-white mushrooms with large fruiting bodies"
smalldb.similarity_search(question, k=2)
smalldb.max_marginal_relevance_search(question,k=2, fetch_k=3)

In [25]:
question = "mixture of experts"
docs = vectordb.similarity_search(
    question,
    k=3,
    # filter={"source": os.path.join(pdfs_base, "2102.04906.pdf")}
)

In [28]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [33]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The lecture the chunk is from, should talk about early exit neural networks",
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

In [34]:
document_content_description = "Dynamic Neural Network Survey"
llm = OpenAI(openai_api_key=api_key, temperature=0)
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [35]:
question = "What can change outside training in a dynamic neural network?"
docs = retriever.get_relevant_documents(question)
print(docs)



query='What can change outside training in a dynamic neural network?' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='source', value='early exit neural networks') limit=None
[]


In [36]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [37]:
compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=vectordb.as_retriever()
)

In [38]:
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

question = "What can change outside training in a dynamic neural network?"
compressed_docs = compression_retriever.get_relevant_documents(question) # search_type = "mmr"
pretty_print_docs(compressed_docs)






#### Question Answering

In [39]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever()
)

In [40]:
question = "What can change outside training in a dynamic neural network?"
result = qa_chain({"query": question})
result["result"]

' The number of learning equations that the model can deliver can change outside of training in a dynamic neural network.'

##### Templates

In [41]:
from langchain.prompts import PromptTemplate

# Build prompt
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [42]:
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

In [44]:
# question = "What can change outside training in a dynamic neural network?"
question = "What is a mixture of experts?"
result = qa_chain({"query": question})
print(result["result"])
print(result["source_documents"][0])

 A mixture of experts is a type of machine learning model that combines multiple experts to better model the input space. It has been studied for a wide range of research and can be improved by using MLPs instead of linear networks or experts. Thanks for asking!
page_content='experts are combined. The experts are also self-org anize to find a good partitioning of the input \nspace and each expert model has its own subspace, a nd combination of all experts model the input' metadata={'source': '..\\pdfs\\arxiv_1202.3887.pdf', 'page': 2}
