In [None]:
# ingest.py
import os, re, glob, json
import numpy as np, faiss
from openai import OpenAI
import streamlit as st 
from typing import List
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

DATA_DIR = "data"
VECTOR_DIR = "vectorstore"
EMBED_MODEL = "togethercomputer/m2-bert-80M-32k-retrieval"
BATCH_SIZE = 32

client = OpenAI(api_key=st.secrets["OPENAI_API_KEY"],
                base_url="https://api.together.xyz/v1")

# ARTICLE_RE = re.compile(r"(?mi)^\s*(?:Article|Straipsnis)\s+(\d+[A-Za-z\-]*)\.?\s*(.*)$")
ARTICLE_RE = re.compile(r"(?i)\b(?:Article|Straipsnis)\s+(\d+[A-Za-z\-]*)\.?\s*([^0-9\n]*)")

def clean_text(t): return re.sub(r"\s+", " ", t).strip()

# def load_docs():
#     texts = []
#     for p in glob.glob(os.path.join(DATA_DIR, "**/*"), recursive=True):
#         if p.lower().endswith(".txt"):
#             texts.append((open(p, encoding="utf-8").read(), {"source": p}))
#     if not texts:
#         raise RuntimeError("No docs in ./data")
#     return [(clean_text(t), m) for t,m in texts]
def load_docs() -> List:
    docs = []
    for path in glob.glob(os.path.join(DATA_DIR, "**/*"), recursive=True):
        if path.lower().endswith(".pdf"):
            docs.extend(PyPDFLoader(path).load())
        elif path.lower().endswith(".txt") or path.lower().endswith(".md"):
            docs.extend(TextLoader(path, encoding="utf-8").load())
    if not docs:
        raise RuntimeError("No documents found in ./data. Add PDFs or txt/md files.")
    return docs



def split_articles(docs):
    chunks = []
    for txt, meta in docs:
        matches = list(ARTICLE_RE.finditer(txt))
        if not matches:
            chunks.append({"text": txt, "meta": {**meta, "article_id":"ALL","chunk_id":0}})
        for i,m in enumerate(matches):
            start, end = m.start(), matches[i+1].start() if i+1<len(matches) else len(txt)
            art_id, art_title = m.group(1), m.group(2) or f"Article {m.group(1)}"
            body = txt[start:end]
            chunks.append({"text": body, "meta": {**meta, "article_id":art_id, "article_title":art_title,"chunk_id":0}})
    return chunks

def embed_texts(texts):
    vecs=[]
    for i in range(0,len(texts),BATCH_SIZE):
        batch=texts[i:i+BATCH_SIZE]
        res=client.embeddings.create(model=EMBED_MODEL, input=batch)
        vecs.extend([d.embedding for d in res.data])
    return vecs

# def main():
#     os.makedirs(VECTOR_DIR,exist_ok=True)
#     docs = load_docs()
#     articles = split_articles(docs)
#     texts = [c["text"] for c in articles]
#     vecs = embed_texts(texts)

#     index=faiss.IndexFlatL2(len(vecs[0]))
#     index.add(np.array(vecs).astype("float32"))

#     faiss.write_index(index, os.path.join(VECTOR_DIR,"index.faiss"))
#     with open(os.path.join(VECTOR_DIR,"docstore.json"),"w",encoding="utf-8") as f:
#         json.dump(articles,f,ensure_ascii=False)
#     print(f"✅ Built FAISS index with {len(articles)} articles")

# if __name__=="__main__":
#     main()


In [16]:
os.makedirs(VECTOR_DIR,exist_ok=True)
docs = load_docs()
articles = split_articles(docs)
texts = [c["text"] for c in articles]
vecs = embed_texts(texts)

index=faiss.IndexFlatL2(len(vecs[0]))
index.add(np.array(vecs).astype("float32"))

faiss.write_index(index, os.path.join(VECTOR_DIR,"index.faiss"))
with open(os.path.join(VECTOR_DIR,"docstore.json"),"w",encoding="utf-8") as f:
    json.dump(articles,f,ensure_ascii=False)
print(f"✅ Built FAISS index with {len(articles)} articles")

ValueError: too many values to unpack (expected 2)

In [17]:
articles

[{'text': 'streamlit>=1.35 langchain>=0.2.10 langchain-core>=0.2.10 langchain-openai>=0.1.7 langchain-community>=0.2.10 langgraph>=0.2.21 faiss-cpu>=1.8.0 pydantic>=2.7 pypdf>=4.2.0 tiktoken>=0.7.0',
  'meta': {'source': 'requirements.txt', 'article_id': 'ALL', 'chunk_id': 0}}]

In [None]:
# ingest.py
import os, re, glob, json
import numpy as np, faiss
from openai import OpenAI
from typing import List
import streamlit as st 
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_core.documents import Document

# ==== Config ====
DATA_DIR = "data"
VECTOR_DIR = "vectorstore"
EMBED_MODEL = "togethercomputer/m2-bert-80M-32k-retrieval"
BATCH_SIZE = 32
CHUNK_SIZE = 1200   # child chunk size
CHUNK_OVERLAP = 150

# Together client
client = OpenAI(
    api_key=st.secrets["OPENAI_API_KEY"],
    base_url="https://api.together.xyz/v1"
)

ARTICLE_RE = re.compile(r"(?i)\b(?:Article|Straipsnis)\s+(\d+[A-Za-z\-]*)\.?\s*([^0-9\n]*)")


def clean_text(t: str) -> str:
    return re.sub(r"\s+", " ", t).strip()

# ==== 1. Load PDFs/TXT into Documents ====
def load_docs() -> List[Document]:
    docs: List[Document] = []
    for path in glob.glob(os.path.join(DATA_DIR, "**/*"), recursive=True):
        lp = path.lower()
        if lp.endswith(".pdf"):
            docs.extend(PyPDFLoader(path).load())
        elif lp.endswith(".txt") or lp.endswith(".md"):
            docs.extend(TextLoader(path, encoding="utf-8").load())
    if not docs:
        raise RuntimeError("❌ No documents found in ./data")
    return docs

# ==== 2. Split into articles ====
def split_articles(docs: List[Document]):
    chunks = []
    for d in docs:
        txt = clean_text(d.page_content)
        meta = d.metadata
        matches = list(ARTICLE_RE.finditer(txt))

        if not matches:
            chunks.append({
                "text": txt,
                "meta": {**meta, "article_id": "ALL", "article_title": "FULL", "chunk_id": 0}
            })
            continue

        for i, m in enumerate(matches):
            start = m.start()
            end = matches[i+1].start() if i+1 < len(matches) else len(txt)
            art_id = m.group(1)
            art_title = (m.group(2) or f"Article {m.group(1)}").strip()
            body = txt[start:end]
            chunks.append({
                "text": body,
                "meta": {**meta, "article_id": art_id, "article_title": art_title, "chunk_id": 0}
            })
    return chunks

# ==== 3. Split long articles into child chunks ====
def split_children(parents, chunk_size=CHUNK_SIZE, overlap=CHUNK_OVERLAP):
    children = []
    for p in parents:
        text = p["text"]
        meta = p["meta"]
        start, idx = 0, 0
        while start < len(text):
            end = min(len(text), start + chunk_size)
            sub = text[start:end]
            child_meta = {**meta, "chunk_id": idx}
            children.append({"text": sub, "meta": child_meta})
            if end == len(text):
                break
            start = end - overlap
            idx += 1
    return children

# ==== 4. Embed ====
def embed_texts(texts: List[str]):
    vecs = []
    for i in range(0, len(texts), BATCH_SIZE):
        batch = texts[i:i+BATCH_SIZE]
        res = client.embeddings.create(model=EMBED_MODEL, input=batch)
        vecs.extend([d.embedding for d in res.data])
    return vecs

# ==== 5. Main ====

os.makedirs(VECTOR_DIR, exist_ok=True)
docs = load_docs()
articles = split_articles(docs)
children = split_children(articles, CHUNK_SIZE, CHUNK_OVERLAP)

# texts = [c["text"] for c in children]
# vecs = embed_texts(texts)

# index = faiss.IndexFlatL2(len(vecs[0]))
# index.add(np.array(vecs).astype("float32"))

# faiss.write_index(index, os.path.join(VECTOR_DIR, "index.faiss"))
# with open(os.path.join(VECTOR_DIR, "docstore.json"), "w", encoding="utf-8") as f:
#     json.dump(children, f, ensure_ascii=False)

# print(f"✅ Built FAISS index with {len(children)} child chunks from {len(articles)} articles")

# # if __name__ == "__main__":
# #     main()


In [21]:
def clean_page_text(text: str) -> str:
    # Remove URLs, timestamps, “Printed from” etc
    text = re.sub(r"https?://\S+", "", text)
    text = re.sub(r"\d+/\d+/\d+,\s*\d+:\d+\s*(AM|PM)?", "", text)
    text = re.sub(r"Atspausdinta.*?lt", "", text, flags=re.I)
    return re.sub(r"\s+", " ", text).strip()
def merge_docs(docs):
    return "\n".join([clean_page_text(d.page_content) for d in docs])
d1 = merge_docs(docs)
def split_articles(docs):
    full_text = merge_docs(docs)
    matches = list(ARTICLE_RE.finditer(full_text))
    chunks = []

    if not matches:
        chunks.append({
            "text": full_text,
            "meta": {"article_id": "ALL", "article_title": "FULL", "chunk_id": 0}
        })
        return chunks

    for i, m in enumerate(matches):
        start = m.start()
        end = matches[i+1].start() if i+1 < len(matches) else len(full_text)
        art_id = m.group(1)
        art_title = (m.group(2) or f"Article {art_id}").strip()
        body = full_text[start:end]
        chunks.append({
            "text": body,
            "meta": {"article_id": art_id, "article_title": art_title, "chunk_id": 0}
        })
    return chunks


articles = split_articles(d1)

AttributeError: 'str' object has no attribute 'page_content'

In [22]:
import json, numpy as np, faiss
from openai import OpenAI

VECTOR_DIR = "vectorstore"
EMBED_MODEL = "togethercomputer/m2-bert-80M-32k-retrieval"

index = faiss.read_index(f"{VECTOR_DIR}/index.faiss")
docstore = json.load(open(f"{VECTOR_DIR}/docstore.json", encoding="utf-8"))

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"], base_url="https://api.together.xyz/v1")

query = "What are the rules for residence permits in Lithuania?"
q_vec = client.embeddings.create(model=EMBED_MODEL, input=query).data[0].embedding

D, I = index.search(np.array([q_vec], dtype="float32"), k=3)
for idx, dist in zip(I[0], D[0]):
    chunk = docstore[idx]
    print("----")
    print("Article:", chunk["meta"].get("article_id"), chunk["meta"].get("article_title"))
    print("Page:", chunk["meta"].get("page"), "Dist:", dist)
    print(chunk["text"][:300], "...")


----
Article: ALL FULL
Page: 31 Dist: 0.07813991
idity of the travel document and, after the foreigner produces a new valid travel document, may be re-personalised for the remaining duration of validity of the temporary residence permit. 5. A foreigner being in possession of a temporary residence permit must, in the event of changes in the circums ...
----
Article: ALL FULL
Page: 31 Dist: 0.082836345
descent 1. A foreigner of Lithuanian descent may be issued a temporary residence permit if he provides proof of his Lithuanian descent. 2. A foreigner of Lithuanian descent shall be issued a temporary residence permit for five years. 8/23/25, 12:33 PM Republic of Lithuania Law on the Legal Status of ...
----
Article: ALL FULL
Page: 6 Dist: 0.09762402
 objective reasons is unable to obtain travel documents from his country of origin, where such a document grants him the right to leave and return to the Republic of Lithuania for the duration of validity of the document. 291. Resettlement/rel

In [26]:
for idx, dist in zip(I[0], D[0]):
    chunk = docstore[idx]
    print("----")
    print("Article:", chunk["meta"].get("article_id"), chunk["meta"].get("article_title"))
    print("Page:", chunk["meta"].get("page"), "Dist:", dist)
    print(chunk["text"], "...")

----
Article: ALL FULL
Page: 31 Dist: 0.07813991
idity of the travel document and, after the foreigner produces a new valid travel document, may be re-personalised for the remaining duration of validity of the temporary residence permit. 5. A foreigner being in possession of a temporary residence permit must, in the event of changes in the circumstances due to which the permit has been issued, obtain a new temporary residence permit. 6. A family member who enters the Republic of Lithuania for residence accompanying a foreigner who applies for the issue of a temporary residence permit on the ground laid down in point 41, 42, 51, 13 or 16 of paragraph 1 of this Article or on the ground laid down in point 14 of paragraph 1 of this Article and fulfils the condition set out in Article 43(6)(3) of this Law or on the grounds laid down in points 1 to 22 of Article 45(1) of this Law or who accompanies a foreigner referred to in point 4 or 5 of Article 43(6) of this Law may apply for the issue o

MCP

In [None]:
# Looks like your MCP version doesn't export `tool` directly.
# In the latest `fastmcp`, tools are declared via decorators inside `FastMCP`.
# Let's fix rag_mcp.py accordingly.
import streamlit as st  
import os, json, faiss, numpy as np
from openai import OpenAI
from mcp.server.fastmcp import FastMCP

VECTOR_DIR = "vectorstore"
INDEX_PATH = os.path.join(VECTOR_DIR, "index.faiss")
DOCSTORE_PATH = os.path.join(VECTOR_DIR, "docstore.json")
EMBED_MODEL = "togethercomputer/m2-bert-80M-32k-retrieval"

# Load FAISS + docstore
index = faiss.read_index(INDEX_PATH)
docstore = json.load(open(DOCSTORE_PATH, encoding="utf-8"))

# Together client
client = OpenAI(
    api_key=st.secrets["OPENAI_API_KEY"],
    base_url="https://api.together.xyz/v1"
)

# Define MCP server
mcp = FastMCP("rag-mcp")

@mcp.tool()
def embed_and_search(query: str, k: int = 5):
    """Embed a query and return top-k FAISS matches (IDs + distances)."""
    q_vec = client.embeddings.create(model=EMBED_MODEL, input=query).data[0].embedding
    D, I = index.search(np.array([q_vec], dtype="float32"), k)
    return {"hits": [{"id": int(i), "dist": float(d)} for i, d in zip(I[0], D[0])]} 

@mcp.tool()
def get_context(doc_ids: list, window: int = 1, max_chars: int = 5000):
    """Return merged context for given doc IDs, expanding neighbors in same article."""
    expanded = []
    for i in doc_ids:
        base = docstore[int(i)]
        art_id = base["meta"].get("article_id")
        # expand all chunks from the same article
        article_chunks = [c for c in docstore if c["meta"].get("article_id") == art_id]
        expanded.extend(article_chunks)

    # Merge into one big context (truncate if too long)
    text = "\n".join([c["text"] for c in expanded])[:max_chars]
    return {"contexts": [text]}

if __name__ == "__main__":
    import asyncio
    asyncio.run(mcp.run_async())
    # await(mcp.run_async())


  def _is_wrapper(f):


AttributeError: 'FastMCP' object has no attribute 'run_async'

In [3]:
# !pip install nest_asyncio
import nest_asyncio, asyncio
nest_asyncio.apply()
# simple_graph.py
import asyncio
from langchain_openai import ChatOpenAI
from langgraph.graph import StateGraph, END
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.messages import HumanMessage, AIMessage

from mcp.client.stdio import stdio_client
from mcp import ClientSession, StdioServerParameters

# ---- State ----
from typing import TypedDict, List
class State(TypedDict):
    messages: List

# ---- Connect to MCP server ----
async def get_mcp_tools():
    server = StdioServerParameters(command="python", args=["simple_mcp.py"])
    read, write = await stdio_client(server)
    session = ClientSession(read, write)
    await session.initialize()

    tools = await session.list_tools()
    return session, tools.tools

# ---- Build agent ----
def build_graph(tools):
    g = StateGraph(State)
    llm = ChatOpenAI(
        model="meta-llama/Llama-3-8b-chat-hf",
        api_key="your_together_api_key",
        base_url="https://api.together.xyz/v1",
        temperature=0
    ).bind_tools(tools)

    def step(state: State):
        if not state["messages"]:
            return state
        ai = llm.invoke(state["messages"])
        state["messages"].append(ai)
        return state

    g.add_node("step", step)
    g.set_entry_point("step")
    g.add_edge("step", END)
    return g.compile()

# ---- Run ----
async def main():
    session, tools = await get_mcp_tools()
    graph = build_graph(tools)
    state = {"messages": [HumanMessage(content="Please greet Alice and add 2 + 3.")]}
    out = graph.invoke(state)
    print(out["messages"][-1].content)

if __name__ == "__main__":
    asyncio.run(main())
    # await main()


TypeError: object _AsyncGeneratorContextManager can't be used in 'await' expression