In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# User defined libraries
from sitemap_crawler import get_urls_from_sitemap

In [2]:
# Collect the page URLs listed in the blog's sitemap. `urls` is indexed later
# in the notebook to pick a single page for the loader comparison.
urls = get_urls_from_sitemap('https://jayeshmahapatra.github.io/sitemap.xml')

In [3]:
# Load every page listed in the sitemap into LangChain documents.
# The sitemap was already fetched into `urls`, so reuse that result instead of
# downloading and parsing it a second time; the copy keeps the two names
# independent of each other.
website_urls = list(urls)
loader = WebBaseLoader(website_urls)

# Fetch the pages; `docs` is the input to the text splitter below.
docs = loader.load()

In [4]:
# Embedding model used to vectorise the text chunks for the FAISS index.
# NOTE(review): the model name ends in "dot", which suggests it is tuned for
# dot-product similarity; confirm the FAISS index uses a matching metric.
embeddings = HuggingFaceEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

In [5]:
# Split the loaded pages into overlapping chunks and index them in FAISS.
# Separators are listed coarsest first: paragraph breaks, line breaks, the
# LaTeX math delimiters \[ \] \( \), single spaces, and finally the empty
# string.
# Fix: the closing display-math delimiter used to be written as "//]" (two
# forward slashes), which can never match "\]" in the text; it is now "\\]",
# consistent with the other three delimiters.
separators = ["\n\n", "\n", "\\[", "\\]", "\\(", "\\)", " ", ""]
chunk_size = 512

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_size // 10,  # roughly 10% overlap between neighbours
    add_start_index=True,  # shows up as `start_index` in each chunk's metadata
    strip_whitespace=True,
    separators=separators,
)
documents = text_splitter.split_documents(docs)
vector = FAISS.from_documents(documents, embeddings)

In [6]:
from langchain_community.llms import Ollama
# LLM accessed through the Ollama wrapper; it is handed to
# create_stuff_documents_chain as the answer generator.
llm = Ollama(model="mistral:instruct")

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

# RAG prompt with two placeholders: {input} (the user question) and {context}
# (the retrieved documents, filled in by the stuff-documents chain).
# NOTE(review): the template hand-writes [INST]/[/INST] and <s>/</s> markers;
# confirm the model server does not add its own instruction wrapping on top.
prompt = ChatPromptTemplate.from_template("""<s> [INST] You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. [/INST] </s> 
[INST] Question: {input} 
Context: {context} 
Answer: [/INST]""")

# Combine the LLM and the prompt into a chain that answers from supplied documents.
document_chain = create_stuff_documents_chain(llm, prompt)

In [8]:
from langchain.chains import create_retrieval_chain

# Wrap the FAISS index as a retriever (no search options are overridden here)
# and put it in front of the document chain to form the full RAG pipeline.
retriever = vector.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [9]:
# response = retrieval_chain.invoke({"input": "Why use arcface loss?"})
# print(response["answer"])

# LangSmith offers several features that can help with testing:...
# response

In [15]:
# Stream the RAG chain's response to one question: answer fragments are echoed
# as they arrive, everything else is set aside for inspection.
query = "What are RoPE embeddings?"
chunks, metadata = [], []

for chunk in retrieval_chain.stream({"input": query}):
    # Pieces without an "answer" key are not part of the generated text
    # (the recorded run shows the echoed input and the retrieved context).
    if "answer" not in chunk:
        metadata.append(chunk)
        continue

    chunks.append(chunk)
    # No newline and an explicit flush so the fragments read as one
    # continuously growing answer.
    print(chunk["answer"], end="", flush=True)
    


 RoPE embeddings are positional embeddings used in transformer models, applied after the self-attention mechanism. They are computed using the rotary position embedding function `apply_rotary_emb`, with cosine and sine frequencies generated by `precompute_freqs_cis` function. These embeddings help to capture relative positional information.

In [17]:
# Display the non-answer pieces collected while streaming.
metadata

[{'input': 'What are RoPE embeddings?'},
 {'context': [Document(page_content='# QKV\n        xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)\n        xq = xq.reshape(bsz, seqlen, self.n_local_heads, self.head_dim)\n        xk = xk.reshape(bsz, seqlen, self.n_local_kv_heads, self.head_dim)\n        xv = xv.reshape(bsz, seqlen, self.n_local_kv_heads, self.head_dim)\n\n        # RoPE relative positional embeddings\n        xq, xk = apply_rotary_emb(xq, xk, freqs_cos, freqs_sin)', metadata={'source': 'https://jayeshmahapatra.github.io/2023/12/03/llama2.html', 'title': 'llama2.npy : Implementing Llama2 LLM using just Python and Numpy | Jayesh’s Blog', 'description': 'Large Language Models (LLMs), such as GPT-4, Claude, and Llama2, have reshaped the landscape of Natural Language Processing (NLP), democratizing AI applications. These models often have billions of parameters and are trained on massive datasets of text, often crawled from the internet.', 'language': 'en', 'start_index': 13577})

### Unstructured IO

In [18]:
from langchain_community.document_loaders import UnstructuredURLLoader
from langchain_community.document_loaders import WebBaseLoader

In [19]:
# Pick one page (the second sitemap entry) and build both loaders on that
# same URL so their output can be compared side by side.
print(urls[1])
unstructured_loader = UnstructuredURLLoader([urls[1]])
web_loader = WebBaseLoader([urls[1]])

https://jayeshmahapatra.github.io/2023/05/28/triton.html


In [20]:
# Load the same page through each loader.
uns_docs = unstructured_loader.load()
web_docs = web_loader.load()

In [21]:
# Compare the two loaders by printing the documents each one produced.
# NOTE(review): this dumps the full page text twice; printing a slice of
# page_content would keep the saved output readable.
print(uns_docs)
print(web_docs)

[Document(page_content='Beyond FastAPI: Using Nvidia Triton for serving ML models\n\nMay 28, 2023\n      \n      • Jayesh Mahapatra\n\nShare on:\n\nServing Models\n\nIn today’s world, ML engineers are taking on the role of “full stack” professionals, not only developing new models but also deploying them. Python has emerged as the language of choice in the field of Machine Learning, leading engineers to gravitate towards Python-based web frameworks for model deployment. While FastAPI, a modern and high-performance web framework for building APIs in Python, has become a popular choice, it does have certain limitations when it comes to serving ML models effectively.\n\nThe Limitations of FastAPI\n\nFastAPI, with its support for asynchronous request processing and data validation, empowers ML engineers to write REST API endpoints for ML inference. However, it wasn’t explicitly designed to serve as a dedicated inference engine. As a result, it lacks built-in support for ML frameworks and i

In [26]:
# Source URL recorded by the Unstructured loader for the page.
# NOTE(review): the saved output under this cell is a whole metadata dict,
# not a string, so it appears to be stale; re-run the cell before sharing.
uns_docs[0].metadata['source']

{'source': 'https://jayeshmahapatra.github.io/2023/05/28/triton.html'}

In [24]:
# Metadata attached by WebBaseLoader (the recorded run shows source, title,
# description and language).
web_docs[0].metadata

{'source': 'https://jayeshmahapatra.github.io/2023/05/28/triton.html',
 'title': 'Beyond FastAPI: Using Nvidia Triton for serving ML models | Jayesh’s Blog',
 'description': 'Serving Models',
 'language': 'en'}

In [23]:
# Metadata attached by UnstructuredURLLoader (the recorded run shows only the
# source URL).
uns_docs[0].metadata

{'source': 'https://jayeshmahapatra.github.io/2023/05/28/triton.html'}

### Chroma

In [1]:
import chromadb
from chromadb.config import Settings as ChromaSettings
import configparser
import os

In [2]:
from dotenv import load_dotenv
# Load environment variables from keys.env. CHROMA_API_KEY is read from the
# environment when the Chroma client is created; presumably it is defined in
# this file (verify).
load_dotenv('keys.env')


True

In [3]:
# Read project settings; later cells look up the Chroma collection name here.
# ConfigParser.read() skips files it cannot open and returns the list of files
# it did parse, so an empty list displayed here means dev.config was not found.
config = configparser.ConfigParser()
config.read('dev.config')

['dev.config']

In [5]:
# Create Chroma client and vectorstore
chroma_client = chromadb.HttpClient(
    host= "chroma.jayeshdev.com",
    port=80,
    ssl= True,
    settings = ChromaSettings(
    chroma_api_impl="chromadb.api.fastapi.FastAPI",
    chroma_client_auth_provider="chromadb.auth.token.TokenAuthClientProvider",
    chroma_client_auth_credentials=os.environ.get("CHROMA_API_KEY", "not_provided")
    ))

KeyboardInterrupt: 

In [31]:
# Open the collection named in dev.config ([Chroma] collection_name), creating
# it if it is missing, and display how many records it currently holds.
collection = chroma_client.get_or_create_collection(config.get('Chroma', 'collection_name'))
collection.count()

48

In [34]:
# Collect the id of every record in the collection. Only the ids are needed,
# so `include=[]` asks Chroma to leave documents and metadata out of the
# response; ids are returned regardless of `include`.
all_ids = collection.get(include=[])['ids']

In [36]:
# Remove every record listed in `all_ids`. Skip the call when the list is
# empty so the cell can be re-run on an already-emptied collection without
# handing Chroma an empty id list.
if all_ids:
    collection.delete(ids=all_ids)

In [7]:
# Disabled alternative: create the collection if it does not exist.
#collection = chroma_client.get_or_create_collection()

# Drop the collection named in dev.config ([Chroma] collection_name).
# NOTE(review): no existence check is made here; presumably this raises when
# the collection is missing. Confirm, and guard the call if re-runs are expected.
chroma_client.delete_collection(name = config.get('Chroma', 'collection_name'))

In [37]:
# Display the current record count (the recorded run shows 0).
collection.count()

0

In [8]:
# Fetch the collection's contents with Chroma's default `include` settings.
collection_data = collection.get()

In [12]:
# List the fields present in the result returned by collection.get().
collection_data.keys()

dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'data', 'uris'])