In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [2]:
# Load websites
website_urls = ["https://jayeshmahapatra.github.io/2023/06/22/arcface.html",
                "https://jayeshmahapatra.github.io/2023/05/28/triton.html",
                "https://jayeshmahapatra.github.io/2023/12/03/llama2.html"]
loader = WebBaseLoader(website_urls)

docs = loader.load()

In [3]:
docs

[Document(page_content="\n\n\n\n\nEnhancing Embedding Separation with ArcFace Loss | Jayesh’s Blog\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJayesh's Blog\n\n\n\n\n\n\n\n\nBlog Archive\n\n\n\n\n\n\n\n\n\n\nEnhancing Embedding Separation with ArcFace Loss\n\nJun 22, 2023\n      \n      • Jayesh Mahapatra\n\n\n\nShare on: \n\n\n\n\n\n\n\n\n\n\n\n\n\nEmbeddings play a crucial role in Machine Learning by capturing and representing relationships between objects.\nEmbeddings can be obtained from Neural Networks trained with traditional classification losses. However, these losses do not explicitly optimize cosine distances to achieve both inter-class separability and intra-class compactness.\nIn this article, we will delve into ArcFace loss and its suitability for tasks that require high degree of inter-class separability while still having high intra-class variance, such as face recognition.\n\nEmbeddings & Separability\nBefore we start discussing losses, let’s take a refresher on what embed

In [4]:
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [5]:
separators = ["\n\n", "\n", "\\[", "//]", "\\(", '\\)',  " ", ""]
chunk_size = 256

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=int(chunk_size/10),
    add_start_index=True,
    strip_whitespace=True,
    separators=separators,
)
documents = text_splitter.split_documents(docs)
vector = FAISS.from_documents(documents, embeddings)

In [8]:
# See the chunks
len(documents[0].page_content)

3917

In [6]:
from langchain_community.llms import Ollama
llm = Ollama(model="mistral:instruct")

In [7]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

prompt = ChatPromptTemplate.from_template("""<s> [INST] You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. [/INST] </s> 
[INST] Question: {input} 
Context: {context} 
Answer: [/INST]""")

document_chain = create_stuff_documents_chain(llm, prompt)

In [8]:
from langchain.chains import create_retrieval_chain

retriever = vector.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

In [9]:
# response = retrieval_chain.invoke({"input": "Why use arcface loss?"})
# print(response["answer"])

# LangSmith offers several features that can help with testing:...
# response

In [10]:
query = "Why use arcface loss?"
chunks = []
metadata = []

for chunk in retrieval_chain.stream({"input": query}):
    print(chunk)
    # if "answer" in chunk:
    #     chunks.append(chunk)
    #     print(chunk['answer'], end="", flush=True)
    # else:
    #     metadata.append(chunk)
    


{'input': 'Why use arcface loss?'}
{'context': [Document(page_content="\\[L_3 = -log \\frac{e^{s \\cos (\\theta_{y_i} + m) }}{e^{s\\cos (\\theta_{y_i} + m)} + \\sum^{N}_{j=1,j\\neq y_i} e^{s \\cos \\theta_j}}\\]\n\nThis is the ArcFace Loss that is widely used in training face recognition systems.\n\nExperimental Results\nIn order to demonstrate the difference in class separation when using arcface loss vs classic softmax, I have created a github repository called ArcFace-Embedding-Visualization.\nThis repository contains contains code for visualizing how embeddings evolve when trained using ArcFace vs Softmax Loss, as shown below:\n\nVisualization of Embedding Separation across Training Epochs\n\n\n\n\n\n            ArcFace 3D embeddings during training.\n         \n\n\n\n            Softmax 3D embeddings during training.\n         \n\n\n\nAs we can see that the arcface loss results in embedding clusters that are more cleanly separated as well as are more compact than the embeddings tr

KeyboardInterrupt: 

In [None]:
# print(metadata[1])