In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# User defined libraries
from sitemap_crawler import get_urls_from_sitemap

In [2]:
# Collect the page URLs listed in the blog's sitemap.
urls = get_urls_from_sitemap(
    "https://jayeshmahapatra.github.io/sitemap.xml"
)

In [3]:
# Load websites: read the page URLs from the sitemap, then hand them to a
# WebBaseLoader and load them into `docs`.
website_urls = get_urls_from_sitemap(
    "https://jayeshmahapatra.github.io/sitemap.xml"
)
loader = WebBaseLoader(website_urls)
docs = loader.load()

In [4]:
# Embedding model used to vectorise the document chunks when the FAISS
# index is built further down.
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

In [5]:
# Split the loaded pages into overlapping chunks and index them in FAISS.
#
# Separators go from coarse to fine: paragraph breaks, line breaks, the LaTeX
# math delimiters \[ \] \( \), single spaces, and finally individual
# characters. "\\]" is the closing partner of "\\[" (display math).
separators = ["\n\n", "\n", "\\[", "\\]", "\\(", "\\)", " ", ""]
chunk_size = 512

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_size // 10,  # roughly 10% overlap between neighbouring chunks
    add_start_index=True,  # each chunk's metadata carries a `start_index` entry
    strip_whitespace=True,
    separators=separators,
)
documents = text_splitter.split_documents(docs)

# NOTE(review): the embedding model's name ends in "dot", which suggests it is
# meant for dot-product scoring; FAISS.from_documents is called here with its
# default distance strategy — confirm that metric suits this model.
vector = FAISS.from_documents(documents, embeddings)

In [6]:
# LLM accessed through LangChain's Ollama wrapper, using the
# "mistral:instruct" model tag.
from langchain_community.llms import Ollama
llm = Ollama(model="mistral:instruct")

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

# Question-answering prompt with two placeholders: {input} (the user's
# question) and {context} (the retrieved documents).
# NOTE(review): the template writes <s>/[INST] tags by hand — confirm the
# Ollama model does not also apply its own instruction template on top.
prompt = ChatPromptTemplate.from_template("""<s> [INST] You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. [/INST] </s> 
[INST] Question: {input} 
Context: {context} 
Answer: [/INST]""")

# Chain that combines the prompt with the LLM; it is plugged into the
# retrieval chain in the next cell.
document_chain = create_stuff_documents_chain(llm, prompt)

In [8]:
from langchain.chains import create_retrieval_chain

# Expose the FAISS store as a retriever and combine it with the document
# chain; the resulting chain is called with {"input": <question>}.
retriever = vector.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [9]:
# Non-streaming invocation, kept commented out for reference:
# response = retrieval_chain.invoke({"input": "Why use arcface loss?"})
# print(response["answer"])

# NOTE(review): the next line looks like pasted sample output rather than
# code — confirm and remove if stale.
# LangSmith offers several features that can help with testing:...
# response

In [15]:
# Stream the chain's output for one query. Chunks carrying an "answer" key
# are stored in `chunks` and echoed immediately; every other chunk is set
# aside in `metadata`.
query = "What are RoPE embeddings?"
chunks, metadata = [], []

for chunk in retrieval_chain.stream({"input": query}):
    if "answer" not in chunk:
        metadata.append(chunk)
        continue
    chunks.append(chunk)
    print(chunk["answer"], end="", flush=True)
    


 RoPE embeddings are positional embeddings used in transformer models, applied after the self-attention mechanism. They are computed using the rotary position embedding function `apply_rotary_emb`, with cosine and sine frequencies generated by `precompute_freqs_cis` function. These embeddings help to capture relative positional information.

In [17]:
# Show the non-answer chunks collected while streaming.
metadata

[{'input': 'What are RoPE embeddings?'},
 {'context': [Document(page_content='# QKV\n        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)\n        xq = xq.reshape(bsz, seqlen, self.n_local_heads, self.head_dim)\n        xk = xk.reshape(bsz, seqlen, self.n_local_kv_heads, self.head_dim)\n        xv = xv.reshape(bsz, seqlen, self.n_local_kv_heads, self.head_dim)\n\n        # RoPE relative positional embeddings\n        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)', metadata={'source': 'https://jayeshmahapatra.github.io/2023/12/03/llama2.html', 'title': 'llama2.npy : Implementing Llama2 LLM using just Python and Numpy | Jayesh’s Blog', 'description': 'Large Language Models (LLMs), such as GPT-4, Claude, and Llama2, have reshaped the landscape of Natural Language Processing (NLP), democratizing AI applications. These models often have billions of parameters and are trained on massive datasets of text, often crawled from the internet.', 'language': 'en', 'start_index': 13577})

## Rough work: inspecting the Chroma collection

In [1]:
import chromadb
from constants import CHROMA_DOCS_INDEX_NAME

In [2]:
# Create a Chroma HTTP client pointed at localhost:8000.
chroma_client = chromadb.HttpClient(host='localhost', port=8000)

In [3]:
# Get the docs collection, creating it if it does not exist yet.
collection = chroma_client.get_or_create_collection(CHROMA_DOCS_INDEX_NAME)

In [4]:
# Number of entries currently stored in the collection.
collection.count()

246

In [8]:
# Fetch the collection's contents using the default arguments of get().
collection_data = collection.get()

In [12]:
# List the top-level fields returned by collection.get().
collection_data.keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'data', 'uris'])