In [401]:
from parse_code import *
from parse_files import *
from print_contents import *

import json
import asyncio

from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
from langchain.chains.base import Chain
from langchain_core.runnables import RunnableLambda
from langchain_core.callbacks import BaseCallbackHandler
from langchain_postgres import PGVector


from pydantic import Field
from typing import List, Optional, Sequence

import faiss
import numpy as np
from pathlib import Path
import torch
import subprocess
import os

import dotenv
#torch.cuda.is_available()
dotenv.load_dotenv(".env.private")

True

In [338]:
#some helper functions
# count tokens in the results
def count_tokens(text, tokenizer):
    tokens = tokenizer.encode(text)
    return len(tokens)

def count_all_tokens(texts, tokenizer):
    total_tokens = 0
    for text in texts:
        tokens = tokenizer.encode(text)
        total_tokens += len(tokens)
    return total_tokens

def join_metadata(metadata):
    key, value = list(metadata.items())[0]
    return f"{key}: {value}"

def run_command(command):
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    assert result.returncode == 0, f"Command '{command}' failed with error: {result.stderr}"

def format_document(doc):
    # Join all metadata key-value pairs as "key: value"
    meta_str = "\n".join(f"{k}: {v}" for k, v in doc.metadata.items())
    # Combine metadata and content for the prompt
    return f"{meta_str}\n{doc.page_content}"
def format_documents(docs):
    return "\n\n".join(format_document(doc) for doc in docs)

### Parsing the source material for retrieval

These are for parsing the source code and the documentation

In [None]:
# system calls -- I wrote some cmdline scripts to do the parsing
call_1 = "uv run parse_code.py ../App/python-server/ --replace-source ../App --replace-target ./App -o ./data/backend_code.txt"
call_2 = "uv run print_contents.py -d ../ -r --exclude-dirs exclude_dirs.txt --exclude-files exclude_files.txt -o ./data/project_structure.txt"
call_3 = "uv run parse_files.py --exclude-dirs exclude_dirs.txt --exclude-files exclude_files.txt -o ./data/project_files.txt ../"

In [24]:
run_command(call_1)
run_command(call_2)
run_command(call_3)

Saving the results

In [156]:
# This requires that raw_project_documents.json already exists
with open("./data/raw_project_documents.json", "r", encoding="utf-8") as f:
    file_dict = json.load(f)

documents_json = []
for file_path, content in file_dict.items():
    doc_type = "markdown document" if file_path.endswith(".md") else "document"
    documents_json.append({
        "file": file_path,
        "type": doc_type,
        "content": content
    })

with open("./data/project_documents.json", "w", encoding="utf-8") as f:
    json.dump(documents_json, f, ensure_ascii=False, indent=2)

In [157]:
# load the documents
with open("./data/project_documents.json", "r", encoding="utf-8") as f:
    documents_json = json.load(f)

In [158]:
# split to documents and markdown documents
documents_dicts = []
markdown_documents_dicts = []
for doc in documents_json:
    if doc["type"] == "document":
        documents_dicts.append(doc)
    elif doc["type"] == "markdown document":
        markdown_documents_dicts.append(doc)
    else:
        raise ValueError(f"Unknown document type: {doc['type']}")

In [404]:
with open('./data/backend_code.txt', 'r', encoding='utf-8') as f:
    code_data = json.load(f)

In [412]:
def format_code_entry(entry):
    meta = [
        f"file: {entry.get('file', '')}",
        f"type: {entry.get('type', '')}",
        f"name: {entry.get('name', '')}",
        f"start_line: {entry.get('start_line', '')}"
    ]
    docstring = entry.get('docstring', '')
    code = entry.get('code', '')
    meta_str = ", ".join(meta)
    docstring_str = f"\nDocstring:\n{docstring}" if docstring else ""
    return f"{meta_str}\n{docstring_str}\ncontent:\n{code}"

In [418]:
formatted_code_entries = [format_code_entry(entry) for entry in code_data]

In [152]:
len(documents_dicts), len(markdown_documents_dicts)

(10, 12)

In [403]:
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [103]:
print(f"Model's maximum sequence length: {SentenceTransformer('BAAI/bge-large-en-v1.5').max_seq_length}")

Model's maximum sequence length: 512


In [104]:
print(f"Model's embedding dimensionality: {len(bge_embeddings.embed_query('some random query'))}")

Model's embedding dimensionality: 1024


In [105]:
# Define which headers to split on and their metadata keys
headers_to_split_on = [
    ("#", "Heading 1"),
    ("##", "Sub heading"),
    ("###", "Sub-sub heading"),
]

# Initialize the Markdown splitter
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [160]:
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")

text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=512,
    chunk_overlap=50,
)

In [184]:
markdown_documents_dicts[0]

{'file': './App/README.md',
 'type': 'markdown document',
 'content': '# Eprice App\n\nThe Eprice App is a containerized application that allows users to view the market price of electricity in Finland, both current and historical. The app is built with a modern tech stack, including a Svelte frontend, a FastAPI backend, a PostgreSQL database, and various tools for testing and data management.\n\n## Features\n\n- **Electricity Price Viewer**: View current and historical electricity prices in Finland.\n- **Svelte Frontend**: A modern, responsive UI built with Svelte and Vite.\n- **FastAPI Backend**: A Python-based backend for handling API requests and business logic.\n- **PostgreSQL Database**: A robust database for storing electricity price data.\n- **Flyway Migrations**: Manage database schema changes with ease.\n- **Testing**: End-to-end tests with Playwright and backend API tests with Pytest.\n- **Chat Engine**: A chat-based interface for interacting with the app.\n- **Data Loading*

In [185]:
documents_dicts[0]

{'file': './App/python-server/requirements.txt',
 'type': 'document',
 'content': 'fastapi==0.115.12\nasyncpg==0.30.0\npasslib[bcrypt]==1.7.4\npython-jose==3.4.0\nuvicorn==0.34.2\npydantic[email]==2.11.4\ndotenv==0.9.9\nhttpx==0.28.1\nasyncio==3.4.3\napscheduler==3.11.0\nrequests==2.32.3\nfastapi-mail==1.4.2\n'}

In [213]:
# Load the model
model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
# Create the embeddings object
embedding_model = HuggingFaceEmbeddings(
    model_name="BAAI/bge-large-en-v1.5",
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [197]:
all_documents = documents_dicts + markdown_documents_dicts
data = []
for doc in all_documents:
    if doc["type"] == "document":
        chunks = text_splitter.split_text(doc["content"])
        for chunk in chunks:
            embedding = embedding_model.embed_query(chunk)
            data.append({
                "file": doc["file"],
                "type": doc["type"],
                "content": chunk,
                "metadata": None,
                "embedding": embedding
            })
    elif doc["type"] == "markdown document":
        chunks = markdown_splitter.split_text(doc["content"])
        for chunk in chunks:
            subchunks = text_splitter.split_text(chunk.page_content)
            for subchunk in subchunks:
                embedding = embedding_model.embed_query(subchunk)
                data.append({
                    "file": doc["file"],
                    "type": doc["type"],
                    "content": subchunk,
                    "metadata": join_metadata(chunk.metadata),
                    "embedding": embedding_model.embed_query(subchunk)
                })



In [None]:
# Chunk and embed code files, matching doc structure
code_chunks = []
for entry in code_data:
    # Combine docstring and code for context, or just use code
    docstring = entry.get("docstring", "")
    code_text = entry.get("code", "")
    full_text = f"{docstring}\n{code_text}" if docstring else code_text

    # Chunk the code
    chunks = text_splitter.split_text(full_text)
    for chunk in chunks:
        # Prepare metadata: include all keys except file, type, code, docstring, and start_line
        metadata = {k: v for k, v in entry.items() if k not in ["file", "type", "code", "docstring", "start_line"]}
        doc = Document(
            page_content=chunk,
            metadata={
                "file": entry.get("file", ""),
                "type": entry.get("type", ""),
                "metadata": metadata if metadata else None
            }
        )
        embedding = embedding_model.embed_query(chunk)
        code_chunks.append((doc, embedding))

# Unpack for later use
code_documents = [doc for doc, _ in code_chunks]
code_embeddings_list = [np.array(emb, dtype=np.float32) for _, emb in code_chunks]
code_embeddings_matrix = np.vstack(code_embeddings_list)

In [424]:
# Convert to Documents
documents = []
embeddings_list = []
for item in data:
    doc = Document(
        page_content=item['content'],
        metadata={
            'file': item['file'],
            'type': item['type'],
            'heading': item['metadata'],
        }
    )
    documents.append(doc)
    embeddings_list.append(np.array(item['embedding'], dtype=np.float32))

embeddings_matrix = np.vstack(embeddings_list)

In [None]:
# combine code and other documents
documents.extend(code_documents)
embeddings_matrix = np.vstack((embeddings_matrix, code_embeddings_matrix))

In [430]:
dimension = 1024#embeddings_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)  # or another index type

# Add embeddings to the index
index.add(embeddings_matrix)

# Create docstore dict mapping index IDs to Document objects
# Create InMemoryDocstore wrapping your documents dict
docstore = InMemoryDocstore({i: doc for i, doc in enumerate(documents)})
index_to_docstore_id = {i: i for i in range(len(documents))}

# Correct FAISS vector store initialization
vector_store = FAISS(
    embedding_function=None,  # embeddings already computed
    index=index,              # FAISS index object here
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [None]:
connection_string = "postgresql+psycopg://username:password@localhost:5432/database"
collection_name = "document_collection"
vector_store = PGVector(
    embeddings=None,
    collection_name=collection_name,
    connection=connection_string,
    use_jsonb=True,  # optional, for metadata storage
)


In [None]:
vector_store.add_documents(
    documents=documents,
    embeddings=embeddings_matrix.tolist(),  # convert to list for JSON serialization
    metadata=[doc.metadata for doc in documents],
    ids=[str(i) for i in range(len(documents))],
)

['64937226-3a4a-4c6d-bc66-6954d4691630',
 'abd1bcbd-a5bb-4a8a-be1d-165c84458e59',
 'a2f8fd40-146d-4405-a8b7-1992dd3bf339',
 'ecad18ff-23d9-47a0-8f27-7fda01fa0327',
 'b7cbe391-3fe8-486e-a5a8-22f0531eae3f',
 '1f3d045a-705a-4572-9e1e-e7e136fc513b',
 '281a3a14-77b0-40b3-a3da-b5a312e06677',
 'd9953411-133f-4a5f-aedc-bee5d7cec853',
 'a47dd161-8dde-41c9-8ff7-736925d5cf40',
 '2c1c9d8a-6535-4bbb-8f43-d28bff7b530a',
 '5840af5b-b2cf-4a40-be44-91b4e2e810bd',
 'b585513c-c515-4bb4-9d88-2df5a1404e01',
 'e226328c-f5fa-4e10-8a65-ac294a964020',
 '0b97a414-cf46-4d7b-a874-d4639c096681',
 'b2dc8a08-2585-40fc-be6e-50243bf9e991',
 'fe085651-51a1-4d01-942c-c79460418409',
 '01f5b98a-fec9-4e60-b1a9-2e7a24b0dd05',
 '40a25806-4bd6-4dab-954a-26d7f2e2ddb3',
 '14964b9b-7bfb-4e21-a839-368ba44cff51',
 '2aa0ced8-5635-4a17-9dfb-6758e887ec6e',
 'a062f75b-8651-4d72-95df-62e47bf5cacf',
 '7e9880cd-9e5c-460b-97bb-0344b0da2bd7',
 '0f436ce7-d891-4cc7-8269-b1bf7d810842',
 '0ae2d888-a678-4d75-9503-f196088a527a',
 'b283ce30-a78f-

In [256]:
query = "Can you describe the project"
query_embedding = embedding_model.embed_query(query)  # get embedding for query

In [257]:
results = vector_store.similarity_search_by_vector(query_embedding, k=5)

for doc in results:
    print(f"File: {doc.metadata['file']}")
    print(f"Heading: {doc.metadata['heading']}")
    print(f"Content: {doc.page_content[:200]}...\n")

File: ./App/README.md
Heading: Heading 1: Eprice App
Content: This project us under MIT license: https://mit-license.org/...

File: ./Documents/diagrams/sources/use_case.wsd
Heading: None
Content: @startuml
title Use Case Diagram for Electricity Market App

actor "Signed-in User" as User
actor "Visitor" as Visitor

package "Frontend (Svelte)" {
    usecase "View Current Electricity Prices" as V...

File: ./Documents/diagrams/sources/authentication_use_case.wsd
Heading: None
Content: @startuml
title Use Case Diagram for authentication in Electricity Market App

actor "Signed-in User" as User
actor "Visitor" as Visitor

package "Frontend (Svelte)" {
    usecase "Register" as Regist...

File: ./Documents/project_description.md
Heading: Heading 1: System description
Content: This document describes the basic topology of the system (as defined in the compose.yaml file). The system consists of multiple services that work together to provide functionality, including a databa...

File: ./App/R

In [441]:
retriever = vector_store.as_retriever(search_kwargs={"k": 10})  # retrieve top 20 docs

In [442]:
class LLMReranker(BaseDocumentCompressor):
    llm_chain: object = Field(LLMChain, description="LLM chain to rerank documents")
    document_variable_name: str = "document"

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        *,
        callbacks: Optional[list] = None,
    ) -> List[Document]:
        
        scored_docs = []
        for doc in documents:
            inputs = {
                "query": query,
                self.document_variable_name: doc.page_content,
            }
            output = self.llm_chain.run(inputs)
            try:
                score = int(output.strip())
            except Exception:
                score = 0
            scored_docs.append((doc, score))
        scored_docs.sort(key=lambda x: x[[1]], reverse=True)
        return [doc for doc, score in scored_docs]

In [443]:
# Initialize OpenAI chat model -- this is used for reranking
llm_reranker = ChatOpenAI(model="gpt-4o-nano", temperature=0)

# Define a template for reranking
rerank_prompt = PromptTemplate(
    input_variables=["query", "document"],
    template=(
        "Given the query:\n{query}\n\n"
        "Rate the relevance of the following document to the query on a scale from 1 to 10:\n"
        "{document}\n\n"
        "Only output the score as an integer."
    ),
)
class MyStreamingHandler(BaseCallbackHandler):
    def on_llm_new_token(self, token: str, **kwargs):
        print(token, end="", flush=True)  # or handle token as you want

llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    streaming=True,
    callbacks=[MyStreamingHandler()],
)

In [444]:
# Create an LLMChain for reranking
rerank_chain = LLMChain(llm=llm_reranker, prompt=rerank_prompt)
# Instantiate your reranker
reranker = LLMReranker(llm_chain=rerank_chain)

# Wrap your base retriever with ContextualCompressionRetriever using the reranker
reranking_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=reranker,
)

In [None]:
prompt_template = ChatPromptTemplate.from_template(
    "You are an expert assistant. Use the following project documents to answer the user's question."
    "If the answer is in the documents, provide it and reference the document(s) used."
    "If the answer is not in the documents, provide a general answer based on your knowledge,"
    "and state that the documents do not contain the answer.\n\n"
    "Context:\n"
    "{context}\n\n"
    "Question:\n"
    "{question}\n\n"
)
def full_chain(query: str):
    query_embedding = embedding_model.embed_query(query)
    reranked_docs = reranking_retriever.base_retriever.vectorstore.similarity_search_by_vector(query_embedding, k=5)
    context = format_documents(reranked_docs)
    response = llm.invoke(prompt_template.format(context=context, question=query))
    return response.content


In [445]:
async def full_chain_stream(query: str):
    # Embed query and retrieve reranked docs as before
    query_embedding = embedding_model.embed_query(query)
    reranked_docs = reranking_retriever.base_retriever.vectorstore.similarity_search_by_vector(query_embedding, k=5)
    context = format_documents(reranked_docs)

    # Prepare prompt input
    prompt_text = prompt_template.format(context=context, question=query)

    # Stream tokens asynchronously from the LLM
    async for token in llm.astream(prompt_text):
        yield token.content  # yield each token as it arrives

In [447]:
resp = ""
async for token in full_chain_stream("at what layer or level are exceptions handled in the backend?"):
    respo += token
    print(resp, end="", flush=True)  # Print each token as it arrives    

Exceptions in the backend are primarily handled at the **Controller** level. The controller is responsible for catching exceptions and returning valid JSON responses to the frontend, including appropriate error messages while ensuring that sensitive information is not leaked. This is part of the overall design pattern that emphasizes separation of concerns, where each layer has a single responsibility.

This information is referenced from the document titled "backend_design.md."