In [14]:
import os
import sys

import openai

# Make modules two directories above the notebook importable.
sys.path.append('../..')

from dotenv import find_dotenv, load_dotenv

# Read the local .env file so its variables land in os.environ.
dotenv_path = find_dotenv()
_ = load_dotenv(dotenv_path)

# Raises KeyError if OPENAI_API_KEY is not set in the environment.
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
from langchain.document_loaders import PyPDFLoader

# Point a PyPDFLoader at the whitepaper (relative path).
whitepaper_pdf = "data/ai_adoption_framework_whitepaper.pdf"
loader = PyPDFLoader(whitepaper_pdf)

In [3]:
# Load the PDF through the loader created in the previous cell.
pages = loader.load()

In [None]:
# Number of items returned by loader.load()
# (presumably one per PDF page — confirm against the PyPDFLoader docs).
len(pages)

In [None]:
# Take the first loaded item and display it as the cell output.
page = pages[0]
page

In [None]:
# Show the metadata attached to the first loaded item.
page.metadata

In [7]:
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

In [None]:
# Pair a YouTube audio blob loader with the OpenAI Whisper parser and load
# the resulting documents. Audio files are saved under `save_dir`.
url = "https://www.youtube.com/watch?v=rXIYVvoSoDY"
save_dir = "docs/youtube/"

audio_blob_loader = YoutubeAudioLoader([url], save_dir)
loader = GenericLoader(audio_blob_loader, OpenAIWhisperParser())
docs = loader.load()

In [None]:
# Preview the first 500 characters of the first loaded document.
docs[0].page_content[0:500]

In [15]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [16]:
# Deliberately tiny values so the splitting behaviour is easy to see on a toy string.
chunk_size, chunk_overlap = 26, 4

In [17]:
# Both splitters share the same size/overlap settings so their output can be compared.
splitter_kwargs = dict(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

r_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)
c_splitter = CharacterTextSplitter(**splitter_kwargs)

In [18]:
# Three short sentences used as toy input for the splitters.
text1 = (
    'This is a test sentence. '
    'This is another test sentence. '
    'This is a third test sentence.'
)

In [None]:
# Split the toy string with the recursive splitter and display the chunks.
r_splitter.split_text(text1)

In [1]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFLoader
import os
import sys

In [2]:

# Load PDF
# Three distinct source PDFs (the list contains no duplicate entries).
pdf_paths = [
    "data/ai_adoption_framework_whitepaper.pdf",
    "data/profile.pdf",
    "data/prog.pdf",
]
loaders = [PyPDFLoader(pdf_path) for pdf_path in pdf_paths]

# Collect the documents from every loader into one flat list.
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [3]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# 1500-character chunks with a 150-character overlap between neighbours.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [4]:
# Run the splitter over every loaded document.
splits = text_splitter.split_documents(docs)

In [None]:
# Number of chunks produced by the splitter.
len(splits)

In [None]:
# Embedding model built with library defaults: no model_name is passed, so the
# model is whatever langchain_huggingface ships as its default.
# TODO(review): pin model_name explicitly so results are reproducible across versions.
embedding = HuggingFaceEmbeddings()

In [13]:
# Two near-synonymous sentences plus one loosely related sentence,
# used to compare embedding similarity.
sentence1, sentence2, sentence3 = (
    "i like dogs",
    "i like canines",
    "the weather is ugly outside, but walking my dog made tolerable",
)

In [14]:
# Embed the three sentences in order, one embed_query call per sentence.
embedding1, embedding2, embedding3 = (
    embedding.embed_query(sentence)
    for sentence in (sentence1, sentence2, sentence3)
)

In [9]:
import numpy as np

In [None]:
# Dot product of the "dogs" and "canines" embeddings, used as a similarity score.
# NOTE(review): this equals cosine similarity only if the embeddings are unit-normalised — confirm for this model.
np.dot(embedding1, embedding2)

In [None]:
# Dot product of the "dogs" and "weather / walking my dog" embeddings.
np.dot(embedding1, embedding3)

In [None]:
# Dot product of the "canines" and "weather / walking my dog" embeddings.
np.dot(embedding2, embedding3)

In [16]:
from langchain.vectorstores import Chroma

In [17]:
# Directory passed to Chroma as `persist_directory` when the vector store is built.
persist_directory = 'docs/chroma/'

In [None]:
# Remove old database files if any.
# Pure Python instead of the `!rm -rf` shell escape, so the cell also works on
# Windows and outside IPython. ignore_errors=True mirrors `rm -f`: a missing
# directory is not an error.
import shutil

shutil.rmtree(persist_directory, ignore_errors=True)

In [19]:
# Embed every chunk and store the vectors in a Chroma collection
# that is given `persist_directory` as its storage location.
vectordb = Chroma.from_documents(splits, embedding, persist_directory=persist_directory)

In [None]:
# Number of entries in the underlying collection.
# Reaches into the private `_collection` attribute of the vector store.
print(vectordb._collection.count())

In [21]:
# NOTE(review): the next cell reassigns `question` before any search runs,
# so on a top-to-bottom run this value is never used.
question = "is there an email i can ask for help"

In [28]:
# Query used for the similarity search against the PDF vector store.
question = "what is flood insurance?"

In [33]:
# Fetch 2 results for `question` from the PDF vector store.
# Note: this rebinds `docs`, which until now held the loaded PDF documents.
docs = vectordb.similarity_search(question,k=2)

In [None]:
# Number of results returned by the search.
len(docs)

In [None]:
# Text of the first returned document.
docs[0].page_content

In [None]:
# Explicitly persist the vector store.
# NOTE(review): with Chroma 0.4+ persistence is reportedly automatic and this method
# is deprecated in recent LangChain releases — confirm against the installed
# versions before relying on (or removing) this call.
vectordb.persist()

In [None]:
# Print the metadata of each search result, one per line.
for doc in docs:
    print(doc.metadata)

In [None]:
# Collection size again (reads the private `_collection` attribute).
print(vectordb._collection.count())

In [38]:
# Three short mushroom facts: the first two overlap heavily, the third adds new information.
texts = [
    "The Amanita phalloides has a large and imposing epigeous (aboveground) fruiting body (basidiocarp).",
    "A mushroom with a large fruiting body is the Amanita phalloides. Some varieties are all-white.",
    "A. phalloides, a.k.a Death Cap, is one of the most poisonous of all known mushrooms.",
]

In [39]:
# Small separate store built directly from the raw strings; no persist_directory is passed.
smalldb = Chroma.from_texts(texts, embedding=embedding)

In [40]:
# Query for the small mushroom store; rebinds the shared `question` variable.
question = "Tell me about all-white mushrooms with large fruiting bodies"

In [None]:
# Plain similarity search: request 2 results.
smalldb.similarity_search(question, k=2)

In [None]:
# Max-marginal-relevance search: request 2 results with fetch_k=3,
# which matches the number of texts in smalldb.
smalldb.max_marginal_relevance_search(question, k=2, fetch_k=3)

In [50]:
from langchain_ollama.llms import OllamaLLM


In [51]:
# LLM accessed through Ollama.
# Assumes an Ollama server is reachable and the "orca-mini" model is available to it — TODO confirm.
model = OllamaLLM(model="orca-mini")


In [52]:
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [53]:
from langchain.chains import RetrievalQA

In [55]:
# Fetch 2 results for `question` from the PDF vector store.
# NOTE(review): on a top-to-bottom run `question` still holds the mushroom prompt
# from the smalldb demo here; the flood-insurance question is only assigned a few
# cells further down. Confirm which query this search is meant to use.
docs = vectordb.similarity_search(question,k=2)

In [None]:
# Number of results returned by the search.
len(docs)

In [None]:
# Print the first result's text (print renders newlines instead of showing the repr).
print(docs[0].page_content)

In [58]:
# Question for the RetrievalQA chain.
question = "what is flood insurance"

In [60]:
# Retrieval-QA chain with default settings: the Ollama model answers using
# whatever the PDF vector store's retriever returns.
qa_chain = RetrievalQA.from_chain_type(llm=model, retriever=vectordb.as_retriever())

In [None]:
# Run the question through the chain.
# `.invoke()` replaces the deprecated direct call `qa_chain({...})`; the returned
# dict is accessed the same way (see `result["result"]`).
result = qa_chain.invoke({"query": question})

In [None]:
# Display the value stored under the "result" key of the chain output.
result["result"]

In [65]:
# Follow-up question for the QA chain.
# Typo fixed ("insrance" -> "insurance"): this string is embedded and matched
# against the documents, so a misspelt keyword can degrade retrieval.
question = "what is the telephone number of the insurance company?"

In [None]:
# Ask the chain the current question and display its answer.
# `.invoke()` replaces the deprecated direct call `qa_chain({...})`.
result = qa_chain.invoke({"query": question})
result["result"]

In [67]:
from langchain.prompts import PromptTemplate

# Build prompt
# The template has two placeholders: {context} and {question}.
# The literal is whitespace-sensitive, so its text is left exactly as written.
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)


In [68]:
# Run chain
# Rebuild the QA chain with the custom prompt, and ask it to return the
# source documents alongside the answer.
prompt_kwargs = {"prompt": QA_CHAIN_PROMPT}
qa_chain = RetrievalQA.from_chain_type(
    llm=model,
    retriever=vectordb.as_retriever(),
    chain_type_kwargs=prompt_kwargs,
    return_source_documents=True,
)

In [None]:
# Ask the chain the current question and display its answer.
# `.invoke()` replaces the deprecated direct call `qa_chain({...})`.
result = qa_chain.invoke({"query": question})
result["result"]

In [None]:
# First entry under the "source_documents" key of the chain output.
result["source_documents"][0]

In [None]:
# Same retriever and model, but with chain_type="refine" instead of the default.
# NOTE(review): the `_mr` suffix reads like "map-reduce", yet the chain type is
# "refine". The name is kept so any later cell referencing `qa_chain_mr` still works.
qa_chain_mr = RetrievalQA.from_chain_type(
    model,
    retriever=vectordb.as_retriever(),
    chain_type="refine"
)
# `.invoke()` replaces the deprecated direct call `qa_chain_mr({...})`.
result = qa_chain_mr.invoke({"query": question})
result["result"]