In [11]:
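# If needed, run once per environment. Prefer %pip over !pip so packages install into this kernel's environment.
# %pip install -q langchain langchain-openai langchain-community faiss-cpu python-dotenv tiktoken markdown-it-py unstructured pypdf docx2txt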

import glob
import os
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

from dotenv import load_dotenv

# LangChain core
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# OpenAI integrations
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Vector store
from langchain_community.vectorstores import FAISS

# Document loaders / splitters
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [12]:
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast when no key is configured: both clients below are built with it.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("OPENAI_API_KEY is not set")

# ---- Model knobs (each one can be overridden through the environment) ----
# OPENAI_MODEL: chat model name; defaults to "gpt-5" (set it to "gpt-4o" if you lack access).
# OPENAI_TEMPERATURE: sampling temperature, parsed as a float; defaults to 0.2.
MODEL_NAME = os.environ.get("OPENAI_MODEL", "gpt-5")
TEMPERATURE = float(os.environ.get("OPENAI_TEMPERATURE", "0.2"))

# Shared clients used by the rest of the notebook: embeddings for indexing and
# retrieval, the chat model for answering.
embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)
llm = ChatOpenAI(
    model=MODEL_NAME,
    temperature=TEMPERATURE,
    api_key=OPENAI_API_KEY,
)


In [13]:
@dataclass
class NotesConfig:
    """Settings for locating and chunking the notes corpus.

    Attributes:
        notes_dir: Folder that holds the note files; change to your path.
        glob_pattern: Glob for selecting note files (recursive Markdown by
            default).
        chunk_size: Value passed to the text splitter as ``chunk_size``.
        chunk_overlap: Value passed to the text splitter as ``chunk_overlap``.
    """

    notes_dir: str = "./notes"
    glob_pattern: str = "**/*.md"
    chunk_size: int = 1200
    chunk_overlap: int = 150

# Active configuration for this notebook; notes_dir is the folder scanned for notes.
config = NotesConfig(notes_dir="./notes")  # <-- set your folder
# Create the folder (and any missing parents) if absent; exist_ok makes re-running this cell harmless.
Path(config.notes_dir).mkdir(parents=True, exist_ok=True)  # ensure exists

# Example: drop your 3 files into ./notes first, then run this
def load_any_notes(notes_dir: Path) -> list:
    """Recursively load every supported note file found under ``notes_dir``.

    Supported extensions are .md and .txt (read as UTF-8 text with TextLoader),
    .pdf (PyPDFLoader) and .docx (Docx2txtLoader).

    Args:
        notes_dir: Root folder to scan. A plain string path is accepted as well
            as a ``Path``.

    Returns:
        A flat list of everything the loaders returned, grouped by file type in
        the order listed above and, within each type, sorted by path so that
        repeated runs yield the same ordering. Empty when nothing matches.
    """
    root = Path(notes_dir)  # tolerate str input such as config.notes_dir

    # (glob pattern, factory building the loader for one file path).
    # Factories are lambdas so a loader class is only looked up when a file of
    # that type is actually present.
    loaders_by_pattern = (
        ("*.md", lambda path: TextLoader(path, encoding="utf-8")),
        ("*.txt", lambda path: TextLoader(path, encoding="utf-8")),
        ("*.pdf", lambda path: PyPDFLoader(path)),
        ("*.docx", lambda path: Docx2txtLoader(path)),  # optional format
    )

    docs = []
    for pattern, make_loader in loaders_by_pattern:
        # rglob yields entries in filesystem order; sort for reproducible output.
        for fp in sorted(root.rglob(pattern)):
            if not fp.is_file():
                continue  # e.g. a directory that happens to be named "x.md"
            docs.extend(make_loader(str(fp)).load())

    return docs

# Load the notes here: without this assignment `raw_docs` exists only if it
# leaked from an earlier kernel session, so Restart & Run All would raise NameError.
raw_docs = load_any_notes(Path(config.notes_dir))

# Report what was found; the loader scans .md, .txt, .pdf and .docx files.
if not raw_docs:
    print(f"No supported note files (.md, .txt, .pdf, .docx) found under: {config.notes_dir}. Add some notes and re-run this cell.")
else:
    print(f"Loaded {len(raw_docs)} documents from {config.notes_dir}")


Loaded 3 documents from ./notes


In [6]:
# Split the loaded documents into overlapping chunks sized by the config.
# Separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=config.chunk_size,
    chunk_overlap=config.chunk_overlap,
    separators=["\n\n", "\n", " ", ""],
)

docs = text_splitter.split_documents(raw_docs)
print(f"Created {len(docs)} chunks")

# Stop with an actionable message when there is nothing to embed, instead of
# letting the vector store fail on an empty input further down.
if not docs:
    raise ValueError(
        f"No chunks to index: add notes under {config.notes_dir} and re-run the loading cell."
    )

# Embed every chunk and build the in-memory FAISS index.
vectorstore = FAISS.from_documents(docs, embeddings)
# Retriever handing back the 4 best-matching chunks per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 4})

Created 63 chunks


In [8]:
def format_docs(docs):
    """Render retrieved documents as a single context string for the prompt.

    Each document becomes a ``[SOURCE: <file name>]`` header line followed by
    its text, and entries are separated by a ``---`` rule. The file name is the
    last path component of ``metadata["source"]``; ``unknown.md`` is used when
    that key is absent.
    """

    def _render(doc):
        # Only the bare file name is shown, not the full path.
        file_name = Path(doc.metadata.get("source", "unknown.md")).name
        return f"[SOURCE: {file_name}]\n{doc.page_content}"

    return "\n\n---\n\n".join(_render(doc) for doc in docs)

# Prompt for the notes assistant. It has two placeholders: {question} (the
# user's question) and {context} (the retrieved text produced by format_docs).
# The model is told to answer from the context only and to cite sources using
# the same [SOURCE: ...] tag that format_docs puts in front of each chunk.
RAG_PROMPT = PromptTemplate.from_template(
    """You are a helpful notes assistant. Use only the provided context to answer the question.
Cite sources like [SOURCE: filename.md] when relevant.

Question:
{question}

Context:
{context}

Answer succinctly, with bullet points when helpful, and list sources at the end."""
)

# Retrieval-augmented pipeline. The input question feeds both keys of the dict:
#   "context":  retriever output rendered to text by format_docs
#   "question": the question itself, passed through unchanged
# The dict fills RAG_PROMPT, the prompt goes to the chat model, and
# StrOutputParser reduces the model reply to a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | RAG_PROMPT
    | llm
    | StrOutputParser()
)

def ask(question: str) -> str:
    """Answer ``question`` from the indexed notes using the RAG chain.

    The question is handed unchanged to ``rag_chain.invoke`` and the chain's
    result is returned as-is.
    """
    answer = rag_chain.invoke(question)
    return answer


In [9]:
# Smoke test: ask one broad question across the notes and print the answer text.
print(ask("What are the main takeaways across these files?"))



- App structure: React app with a clear component/layout organization (Layout.jsx, Navbar.jsx, Sidebar.jsx), route-based pages, and separation of concerns via hooks, lib, and utils directories. [SOURCE: Design Doc.md]
- Caching and infra: Redis is central for performance and reliability, handling query caching, session storage, and rate limiting for features like user data, dashboards, alerts, analytics, and data sources. [SOURCE: Design Doc.md]
- State management: 
  - Local: useState/useReducer; shared state via Context. 
  - Data fetching: custom hooks, React Query-style caching, optimistic updates. 
  - Auth: Supabase Auth provider, JWT handling, and protected routes. [SOURCE: Design Doc.md]
- Data pipeline (Extract phase): Multiple intake paths—CSV uploads (validation, preview, schema detection), API integrations (OAuth, rate limits, incremental sync), and manual entry (copy/paste, forms). [SOURCE: Design Doc.md]
- Probabilistic modeling: Hidden Markov Models support key inference

In [10]:
import os
from dotenv import load_dotenv

# Re-read .env so LANGCHAIN_API_KEY is picked up if you keep it there.
load_dotenv()

# NOTE(review): this cell comes after the first ask() call in the notebook;
# presumably it must run before the chain is invoked for that call to be traced.

# REQUIRED: turn tracing on.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# If the key is not already set in your shell/.env, set it here:
# os.environ["LANGCHAIN_API_KEY"] = "<your-langsmith-api-key>"

# OPTIONAL but recommended: keep runs grouped per app/session.
# An existing LANGCHAIN_PROJECT value wins; otherwise "Notes QA" is used.
os.environ.setdefault("LANGCHAIN_PROJECT", "Notes QA")

# OPTIONAL: custom endpoint (default is fine)
# os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
