In [1]:
from typing import Annotated
from typing_extensions import TypedDict

from langgraph.graph import END, StateGraph, START
from langgraph.graph.message import add_messages

In [2]:
class ChatState(TypedDict):
    """State dictionary passed between the graph's nodes.

    ``Annotated`` metadata on a state key is where a reducer is attached (as
    ``add_messages`` is for ``messages``). It cannot express a default value,
    and ``TypedDict`` has no defaults either, so ``faq_answerable`` and
    ``retrieved_faq_docs`` must be supplied explicitly (e.g. ``True`` and
    ``[]``) in the initial state or by a node's update.
    """

    # Conversation history; updates to this key go through add_messages.
    messages: Annotated[list, add_messages]
    # Presumably: whether the FAQ lookup can answer the latest question -- TODO confirm.
    faq_answerable: bool
    # Presumably: FAQ entries retrieved for the latest question -- TODO confirm.
    retrieved_faq_docs: list

In [3]:

def faq_node(state: ChatState):
    """Graph node stub that picks the newest message out of the state.

    The message is looked up but not used yet, and the function returns
    ``None``. Raises ``KeyError`` if the state has no ``messages`` key and
    ``IndexError`` if the message list is empty.
    """
    # TODO: turn the newest message into a state update; nothing is returned yet.
    conversation = state["messages"]
    last_message = conversation[-1]

In [4]:
from fuzzywuzzy import fuzz
import json



def answer_faq(question: str, faq_data_path: str, threshold: int = 80):
    """Return the FAQ entry that best fuzzy-matches ``question``.

    Every entry under the ``"faqs"`` key of the JSON file is scored with
    ``fuzz.partial_ratio`` against its canonical question (``"q"``) and any
    alternates (``"q_variants"``); an entry's score is the highest of those.
    The best-scoring entry wins, with ties going to the entry that appears
    first in the file, so a merely-good-enough early entry can no longer
    shadow a better match later in the file.

    Args:
        question: The user's question text.
        faq_data_path: Path to a JSON file shaped like
            ``{"faqs": [{"q": ..., "q_variants": [...], "a_md": ..., "source": ...}]}``.
        threshold: Minimum score the best entry must reach to count as a match.

    Returns:
        ``{"answer": <a_md>, "source": <source>}`` for the best entry, or
        ``None`` when no entry reaches ``threshold``.
    """
    # Explicit encoding so the file reads the same on every platform.
    with open(faq_data_path, "r", encoding="utf-8") as f:
        faq_entries = json.load(f)["faqs"]

    best_doc = None
    best_score = -1
    for doc in faq_entries:
        # "q_variants" is treated as optional; a missing or null value means no alternates.
        candidates = [doc["q"], *(doc.get("q_variants") or [])]
        score = max(fuzz.partial_ratio(question, q) for q in candidates)
        # Strict ">" keeps the earliest entry when scores tie.
        if score > best_score:
            best_doc, best_score = doc, score

    if best_doc is None or best_score < threshold:
        return None
    return {
        "answer": best_doc["a_md"],
        "source": best_doc["source"],
    }
            

# Quick manual check of the matcher with a deliberately low threshold.
answer_faq("SOC compliance", faq_data_path="data/faq_data.json", threshold=50)

from langchain_text_splitters import RecursiveCharacterTextSplitter
import json

# Read the whole knowledge base into memory as one string.
with open("data/knowledge_base.txt", "r", encoding="utf-8") as f:
    data = f.read()

# Split into overlapping chunks (chunk_size=700, chunk_overlap=150).
splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=150)
chunks = splitter.split_text(data)

# One independent metadata dict per chunk, all pointing at the same source file.
metadatas = [dict(source="knowledge_base.txt") for _ in chunks]


# The two counts should be equal.
(len(chunks), len(metadatas))

import torch
from langchain_huggingface import HuggingFaceEmbeddings

def embed_chunks(chunks: list, model_name: str = "sentence-transformers/all-mpnet-base-v2"):
    """Embed ``chunks`` with a HuggingFace embedding model.

    Uses CUDA when ``torch.cuda.is_available()`` reports it, otherwise the
    CPU, and prints which device was chosen. A new ``HuggingFaceEmbeddings``
    instance is built on every call.

    Args:
        chunks: Texts to embed.
        model_name: Model identifier handed to ``HuggingFaceEmbeddings``.

    Returns:
        Whatever ``HuggingFaceEmbeddings.embed_documents`` returns for ``chunks``.
    """
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    print(f"Using device: {device}")

    hf_embedder = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={"device": device},
    )
    return hf_embedder.embed_documents(chunks)

# Embed every chunk with the default model.
embeddings = embed_chunks(chunks)


# The three counts should be equal: one vector and one metadata dict per chunk.
print(
    "\n"
    f"embeddings length: {len(embeddings)}\n"
    f"number of chunks: {len(chunks)}\n"
    f"number of metadatas: {len(metadatas)}\n"
)

# Length of the first embedding vector (raises IndexError if nothing was embedded).
len(embeddings[0])

from langchain_community.vectorstores import FAISS

# zip() silently truncates to the shortest input, so fail loudly on a mismatch.
assert len(chunks) == len(embeddings) == len(metadatas), "chunks, embeddings and metadatas must align"

# Pair each chunk with its precomputed vector, the shape from_embeddings expects.
embeddings_and_text = list(zip(chunks, embeddings))

# from_embeddings takes the Embeddings object as its second positional
# argument; it is what the store uses to embed queries later. It must be the
# same model that produced `embeddings` (the embed_chunks default).
query_embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# metadatas goes in by keyword so each chunk keeps its metadata dict.
vs = FAISS.from_embeddings(embeddings_and_text, query_embedder, metadatas=metadatas)
vs.save_local("faiss_store")

In [5]:
from utils.vector_store import VectorStore

# Question used to probe the store.
query = "What are the api rate limits for the free plan?"

# Project-local wrapper around the "faiss_store" directory; presumably it loads
# the saved index -- confirm in utils/vector_store.py.
vs = VectorStore("faiss_store")
# vs.ingest_data("data/knowledge_base.txt")

# The second argument is presumably the number of hits to return -- TODO confirm.
context = vs.vector_search(query, 3)


In [6]:
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate

# Chat model backed by Ollama's "llama3.2".
model = ChatOllama(model="llama3.2")

# Bare template: the question followed by the retrieved context, with no
# additional instructions to the model.
prompt = ChatPromptTemplate.from_template(
    """
    {query}
    {context}
    """
)

# Pipe the rendered prompt straight into the model.
chain = prompt | model

# `context` is interpolated as-is, so the model sees its string form.
response = chain.invoke(dict(context=context, query=query))
print(response.content)

The API rate limits for the free plan are:

* 600 requests per minute per organization
* Up to 50 concurrent jobs per workspace

This means that you can make a certain number of requests within a given time frame (600 requests per minute) and have up to a certain number of concurrent jobs running at the same time (50 jobs).


# Raw search result, shown as returned by vector_search.
context

# Print only the text of each hit; the first element of every item carries it.
for item in context:
    hit = item[0]
    print(hit.page_content)