In [None]:
import os
import langchain
import openai

# Where the key file lives on the original author's machine. It is only used as a
# fallback, so the notebook can run elsewhere by exporting OPENAI_API_KEY instead.
api_key_path = os.path.join(
    "C:\\Users", "DXT6", "OneDrive - PETROBRAS",
    "CENPES", "TEO", "keys", "OpenAI", "apikey.txt"
)

# Prefer the environment variable; fall back to the key file when it is unset/blank.
api_key = os.environ.get("OPENAI_API_KEY", "").strip()
if not api_key:
    with open(api_key_path, "r") as f:
        # Strip surrounding whitespace: a trailing newline left by the editor would
        # otherwise become part of the key sent to the API.
        api_key = f.read().strip()

In [None]:
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI

# llm = OpenAI(openai_api_key=api_key)
# chat_model = ChatOpenAI(openai_api_key=api_key)

# text = "What would be a good company name for a company that makes colorful socks?"

# # print(llm.predict(text))
# print(chat_model.predict(text))

In [None]:
from langchain.document_loaders import PyPDFLoader

# Folder that holds the source PDFs, expressed relative to the working directory.
pdfs_base = os.path.join("..", "..", "data", "pdfs")

# Load a single paper; whatever the loader returns is kept in `pages`.
loader = PyPDFLoader(
    os.path.join(pdfs_base, "2102.04906.pdf")
)
pages = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Settings handed to the text splitter as `chunk_size` and `chunk_overlap`.
chunk_size, chunk_overlap = 500, 50

In [None]:
# Splitter configured from the `chunk_size` / `chunk_overlap` values set earlier.
r_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

In [None]:
# Load PDF
# Duplicate documents on purpose - messy data: the same file gets two loaders.
loaders = [
    PyPDFLoader(os.path.join(pdfs_base, "2102.04906.pdf"))
    for _ in range(2)
]

# Concatenate every loader's output into one flat list.
docs = []
for loader in loaders:
    docs += loader.load()

In [None]:
# Break the loaded documents into chunks; `splits` is what gets embedded and stored.
splits = r_splitter.split_documents(documents=docs)

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model shared by both vector stores built in this notebook (`vectordb`
# and `smalldb`); it authenticates with the `api_key` read in the first cell.
embedding = OpenAIEmbeddings(openai_api_key=api_key)

In [None]:
from langchain.vectorstores import Chroma

persist_directory = os.path.join("..", "..", "data", "chroma")

# `Chroma.from_documents` embeds and inserts `splits` every time it runs, so pointing
# it at an already-populated directory on each re-run keeps appending the same chunks
# (and pays for their embeddings again). Reuse the stored index when one is present.
# Delete `persist_directory` to force a rebuild after changing the PDFs or the
# chunking settings.
if os.path.isdir(persist_directory) and os.listdir(persist_directory):
    vectordb = Chroma(
        persist_directory=persist_directory,
        embedding_function=embedding
    )
else:
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embedding,
        persist_directory=persist_directory
    )

In [None]:
# from langchain.vectorstores import Chroma
# from langchain.embeddings.openai import OpenAIEmbeddings

# persist_directory = os.path.join("..", "..", "data", "chroma")
# embedding = OpenAIEmbeddings(openai_api_key=api_key)
# vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

In [None]:
# Report how many entries the underlying collection holds.
# NOTE(review): `_collection` is a private attribute of the vector store wrapper,
# so this may break between library versions.
print(vectordb._collection.count())

In [None]:
question = "What are the types of dynamic neural network?"
# Keep the hits out of `docs`: that name holds the loaded PDF pages consumed by the
# splitting cell, and overwriting it would make re-running that cell split the
# search results instead.
similar_docs = vectordb.similarity_search(question, k=3)
len(similar_docs)

In [None]:
# Ask the vector store to write itself to disk (presumably under the
# `persist_directory` it was created with — confirm against the installed version).
# NOTE(review): newer Chroma releases may persist automatically; verify whether
# this call is still required.
vectordb.persist()

In [None]:
# Three short sentences used to build a tiny demo vector store. The first two say
# nearly the same thing; the third covers a different fact about the same mushroom.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [None]:
# Second, throwaway store built straight from the demo sentences; no
# persist_directory is passed here.
smalldb = Chroma.from_texts(
    texts=texts,
    embedding=embedding,
)

In [None]:
# Plain similarity search on the demo store, asking for the two closest texts.
question = "Tell me about all-white mushrooms with large fruiting bodies"
smalldb.similarity_search(query=question, k=2)

In [None]:
# Same question through max-marginal-relevance search (k=2 results, fetch_k=3).
smalldb.max_marginal_relevance_search(
    query=question,
    k=2,
    fetch_k=3,
)

In [None]:
question = "what did they say about regression in the third lecture?"
# Restrict the search to chunks whose `source` metadata equals this PDF's path.
# The hits are kept out of `docs` (that name holds the loaded PDF pages consumed by
# the splitting cell) and left as the last expression so the cell displays them.
filtered_docs = vectordb.similarity_search(
    question,
    k=3,
    filter={"source": os.path.join(pdfs_base, "2102.04906.pdf")}
)
filtered_docs

In [None]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [None]:
# Schema the self-query retriever shows to the LLM so it can turn a question into a
# metadata filter. Elsewhere in this notebook `source` is filtered by the full joined
# path (`os.path.join(pdfs_base, ...)`), so the description must advertise that exact
# string: a bare file name would lead the LLM to build a filter that matches nothing.
metadata_field_info = [
    AttributeInfo(
        name="source",
        description=(
            "The lecture the chunk is from, should be one of "
            f"`{os.path.join(pdfs_base, '2102.04906.pdf')}`"
        ),
        type="string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the lecture",
        type="integer",
    ),
]

In [None]:
# Completion model (temperature=0); later cells reuse `llm` for the compressor and
# the QA chains.
llm = OpenAI(openai_api_key=api_key, temperature=0)
document_content_description = "Lecture notes"

# Self-query retriever over the main vector store, driven by the metadata schema
# declared in `metadata_field_info`.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
    verbose=True,
)

In [None]:
question = "what is dynamic?"
# Keep the hits out of `docs`: that name holds the loaded PDF pages consumed by the
# splitting cell, and overwriting it would break re-running that cell.
self_query_docs = retriever.get_relevant_documents(question)
print(self_query_docs)

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [None]:
# LLM-backed extractor used as the compressor stage.
compressor = LLMChainExtractor.from_llm(llm=llm)

# Wrap the main store's default retriever so its results pass through `compressor`.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=vectordb.as_retriever(),
    base_compressor=compressor,
)

In [None]:
def pretty_print_docs(docs):
    """Print every document's text under a numbered heading.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.

    Consecutive documents are separated by a line of 100 dashes. Nothing is
    returned; the text goes to standard output.
    """
    separator = "\n" + "-" * 100 + "\n"
    sections = []
    for number, doc in enumerate(docs, start=1):
        sections.append(f"Document {number}:\n\n" + doc.page_content)
    print(separator.join(sections))

# Run the question through the compression retriever and print what comes back.
# NOTE(review): a `search_type = "mmr"` reminder was attached to this call —
# presumably an option to try on the base retriever; confirm intent.
question = "what is dynamic?"
compressed_docs = compression_retriever.get_relevant_documents(query=question)
pretty_print_docs(compressed_docs)

#### Question Answering

In [None]:
from langchain.chains import RetrievalQA

# Question-answering chain over the main vector store's default retriever, with no
# custom prompt or chain options.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=vectordb.as_retriever())

In [None]:
question = "what is dynamic?"
# The chain is called with a dict keyed by "query"; the answer is then read back
# from the "result" key and displayed as the cell's output.
result = qa_chain({"query": question})
result["result"]

##### Templates

In [None]:
from langchain.prompts import PromptTemplate

# Build prompt
# The template exposes two placeholders, {context} and {question}. The text is sent
# to the model verbatim, so edits to wording or whitespace change the prompt.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
# Prompt object later handed to the QA chain through `chain_type_kwargs`.
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [None]:
# Rebuild the QA chain with the custom prompt, and ask it to return the retrieved
# documents alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    retriever=vectordb.as_retriever(),
    return_source_documents=True,
)

In [None]:
question = "Is neural networks a topic?"
result = qa_chain({"query": question})
# "result" is the chain's answer; "source_documents" is available because the chain
# was created with return_source_documents=True.
print(result["result"])
# Show only the first returned source document (assumes at least one was retrieved).
print(result["source_documents"][0])