In [53]:
from parse_code import *
from parse_files import *
from print_contents import *

import json
import asyncio

from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

#from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors.base import BaseDocumentCompressor
from langchain.chains.base import Chain
from langchain_core.runnables import RunnableLambda
from langchain_core.callbacks import BaseCallbackHandler
from langchain_postgres import PGVector
from langchain_core.chat_history import InMemoryChatMessageHistory
from langchain_core.messages import HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import SystemMessage
from langchain_openai import ChatOpenAI

from pydantic import Field
from typing import List, Optional, Sequence

import faiss
import numpy as np
from pathlib import Path
import torch
import subprocess
import os

import dotenv
#torch.cuda.is_available()
dotenv.load_dotenv(".env.private")

True

In [2]:
#some helper functions
# count tokens in the results
def count_tokens(text, tokenizer):
    tokens = tokenizer.encode(text)
    return len(tokens)

def count_all_tokens(texts, tokenizer):
    total_tokens = 0
    for text in texts:
        tokens = tokenizer.encode(text)
        total_tokens += len(tokens)
    return total_tokens

def join_metadata(metadata):
    key, value = list(metadata.items())[0]
    return f"{key}: {value}"

def run_command(command):
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    assert result.returncode == 0, f"Command '{command}' failed with error: {result.stderr}"

def format_document(doc):
    # Join all metadata key-value pairs as "key: value"
    meta_str = "\n".join(f"{k}: {v}" for k, v in doc.metadata.items())
    # Combine metadata and content for the prompt
    return f"{meta_str}\n{doc.page_content}"
def format_documents(docs):
    return "\n\n".join(format_document(doc) for doc in docs)

def format_code_entry(entry):
    meta = [
        f"file: {entry.get('file', '')}",
        f"type: {entry.get('type', '')}",
        f"name: {entry.get('name', '')}",
        f"start_line: {entry.get('start_line', '')}"
    ]
    docstring = entry.get('docstring', '')
    code = entry.get('code', '')
    meta_str = ", ".join(meta)
    docstring_str = f"\nDocstring:\n{docstring}" if docstring else ""
    return f"{meta_str}\n{docstring_str}\ncontent:\n{code}"

### Parsing the source material for retrieval

These are for parsing the source code and the documentation

In [567]:
# system calls -- I wrote some cmdline scripts to do the parsing
call_1 = "uv run parse_code.py ../App/python-server/ --replace-source ../App --replace-target ./App -o ./data/backend_code.txt"
call_2 = "uv run print_contents.py -d ../ -r --exclude-dirs exclude_dirs.txt --exclude-files exclude_files.txt -o ./data/project_structure.txt"
call_3 = "uv run parse_files.py --exclude-dirs exclude_dirs.txt --exclude-files exclude_files.txt -o ./data/project_files.txt ../"

In [568]:
run_command(call_1)
run_command(call_2)
run_command(call_3)

Saving the results

In [24]:
# read project files, and only keep md, txt, wsd files
# the file is in key-value format, where the key is the file name and the value is the content
with open("./data/project_files.txt", "r") as f:
    file_dict = json.load(f)

for key in list(file_dict.keys()):
    if key.endswith(('requirements.txt', 'ohjeistusta.md')):
        del file_dict[key]
    if not key.endswith(('.md', '.txt', '.wsd')):
        del file_dict[key]

with open("./data/raw_project_documents.json", "w") as f:
    json.dump(file_dict, f, indent=4)

In [25]:
# This requires that raw_project_documents.json already exists
with open("./data/raw_project_documents.json", "r", encoding="utf-8") as f:
    file_dict = json.load(f)

documents_json = []
for file_path, content in file_dict.items():
    doc_type = "markdown document" if file_path.endswith(".md") else "document"
    documents_json.append({
        "file": file_path,
        "type": doc_type,
        "content": content
    })

with open("./data/project_documents.json", "w", encoding="utf-8") as f:
    json.dump(documents_json, f, ensure_ascii=False, indent=2)

In [26]:
# load the documents
with open("./data/project_documents.json", "r", encoding="utf-8") as f:
    documents_json = json.load(f)

In [27]:
# split to documents and markdown documents
documents_dicts = []
markdown_documents_dicts = []
for doc in documents_json:
    if doc["type"] == "document":
        documents_dicts.append(doc)
    elif doc["type"] == "markdown document":
        markdown_documents_dicts.append(doc)
    else:
        raise ValueError(f"Unknown document type: {doc['type']}")

In [28]:
with open('./data/backend_code.txt', 'r', encoding='utf-8') as f:
    code_data = json.load(f)

In [418]:
formatted_code_entries = [format_code_entry(entry) for entry in code_data]

In [29]:
len(documents_dicts), len(markdown_documents_dicts)

(9, 12)

In [576]:
markdown_documents_dicts[0]

{'file': './App/README.md',
 'type': 'markdown document',
 'content': '# Eprice App\n\nThe Eprice App is a containerized application that allows users to view the market price of electricity in Finland, both current and historical. The app is built with a modern tech stack, including a Svelte frontend, a FastAPI backend, a PostgreSQL database, and various tools for testing and data management.\n\n## Features\n\n- **Electricity Price Viewer**: View current and historical electricity prices in Finland.\n- **Svelte Frontend**: A modern, responsive UI built with Svelte and Vite.\n- **FastAPI Backend**: A Python-based backend for handling API requests and business logic.\n- **PostgreSQL Database**: A robust database for storing electricity price data.\n- **Flyway Migrations**: Manage database schema changes with ease.\n- **Testing**: End-to-end tests with Playwright and backend API tests with Pytest.\n- **Chat Engine**: A chat-based interface for interacting with the app.\n- **Data Loading*

In [577]:
documents_dicts[0]

{'file': './App/python-server/serving_over_net.txt',
 'type': 'document',
 'content': 'PUBLIC_API_URL=http://80.221.17.169:8000\n\n"http://192.168.10.46:5173",\n"http://80.221.17.169:5173",'}

In [31]:
# Load the model
model_name = "BAAI/bge-large-en-v1.5" #"BAAI/bge-small-en"
model = AutoModel.from_pretrained("BAAI/bge-large-en-v1.5")
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
# Create the embeddings object
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [32]:
print(f"Model's maximum sequence length: {SentenceTransformer(model_name).max_seq_length}")

Model's maximum sequence length: 512


In [33]:
print(f"Model's embedding dimensionality: {len(embedding_model.embed_query('some random query'))}")

Model's embedding dimensionality: 1024


In [34]:
# Define which headers to split on and their metadata keys
headers_to_split_on = [
    ("#", "Heading 1"),
    ("##", "Sub heading"),
    ("###", "Sub-sub heading"),
]

# Initialize the Markdown splitter
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=512,
    chunk_overlap=50,
)

In [35]:
all_documents = documents_dicts + markdown_documents_dicts
data = []
for doc in all_documents:
    if doc["type"] == "document":
        chunks = text_splitter.split_text(doc["content"])
        for chunk in chunks:
            embedding = embedding_model.embed_query(chunk)
            data.append({
                "file": doc["file"],
                "type": doc["type"],
                "content": chunk,
                "metadata": None,
                "embedding": embedding
            })
    elif doc["type"] == "markdown document":
        chunks = markdown_splitter.split_text(doc["content"])
        for chunk in chunks:
            subchunks = text_splitter.split_text(chunk.page_content)
            for subchunk in subchunks:
                embedding = embedding_model.embed_query(subchunk)
                data.append({
                    "file": doc["file"],
                    "type": doc["type"],
                    "content": subchunk,
                    "metadata": join_metadata(chunk.metadata),
                    "embedding": embedding_model.embed_query(subchunk)
                })



In [36]:
# Convert to Documents
documents = []
embeddings_list = []
for item in data:
    doc = Document(
        page_content=item['content'],
        metadata={
            'file': item['file'],
            'type': item['type'],
            'heading': item['metadata'],
        }
    )
    documents.append(doc)
    embeddings_list.append(np.array(item['embedding'], dtype=np.float32))

embeddings_matrix = np.vstack(embeddings_list)

In [37]:
# Chunk and embed code files, matching doc structure
code_chunks = []
for entry in code_data:
    # Combine docstring and code for context, or just use code
    docstring = entry.get("docstring", "")
    code_text = entry.get("code", "")
    full_text = f"{docstring}\n{code_text}" if docstring else code_text

    # Chunk the code
    chunks = text_splitter.split_text(full_text)
    for chunk in chunks:
        # Prepare metadata: include all keys except file, type, code, docstring, and start_line
        metadata = {k: v for k, v in entry.items() if k not in ["file", "type", "code", "docstring", "start_line"]}
        doc = Document(
            page_content=chunk,
            metadata={
                "file": entry.get("file", ""),
                "type": entry.get("type", ""),
                "metadata": metadata if metadata else None
            }
        )
        embedding = embedding_model.embed_query(chunk)
        code_chunks.append((doc, embedding))

# Unpack for later use
code_documents = [doc for doc, _ in code_chunks]
code_embeddings_list = [np.array(emb, dtype=np.float32) for _, emb in code_chunks]
code_embeddings_matrix = np.vstack(code_embeddings_list)

In [38]:
# combine code and other documents
documents.extend(code_documents)
embeddings_matrix = np.vstack((embeddings_matrix, code_embeddings_matrix))

In [39]:
dimension = 1024#embeddings_matrix.shape[1]
index = faiss.IndexFlatL2(dimension)  # or another index type

# Add embeddings to the index
index.add(embeddings_matrix)

# Create docstore dict mapping index IDs to Document objects
# Create InMemoryDocstore wrapping your documents dict
docstore = InMemoryDocstore({i: doc for i, doc in enumerate(documents)})
index_to_docstore_id = {i: i for i in range(len(documents))}

# Correct FAISS vector store initialization
vector_store = FAISS(
    embedding_function=None,  # embeddings already computed
    index=index,              # FAISS index object here
    docstore=docstore,
    index_to_docstore_id=index_to_docstore_id,
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [None]:
connection_string = "postgresql+psycopg://username:password@localhost:5432/database"
collection_name = "document_collection"
vector_store = PGVector(
    embeddings=embedding_model,
    collection_name=collection_name,
    connection=connection_string,
)
vector_store.add_documents(documents=documents)

In [42]:
query = "Can you describe the App"
query_embedding = embedding_model.embed_query(query)  # get embedding for query

In [44]:
results = vector_store.similarity_search_by_vector(query_embedding, k=5)

for doc in results:
    print(f"File: {doc.metadata['file']}")
    if "Heading" in doc.metadata:
        print(f"Heading: {doc.metadata['heading']}")
    print(f"Content: {doc.page_content}...\n")

File: ./Documents/diagrams/sources/use_case.wsd
Content: @startuml
title Use Case Diagram for Electricity Market App

actor "Signed-in User" as User
actor "Visitor" as Visitor

package "Frontend (Svelte)" {
    usecase "View Current Electricity Prices" as ViewPrices
    usecase "Request Historical/predicted Data" as RequestHistorical
    usecase "Chat with LLM" as ChatWithLLM
}

package "Database" {
    usecase "Store and Retrieve Cached Data" as CacheDB
    usecase "Store and Retrieve User Data\n(Auth service only)" as UserDB
    usecase "Text chunks and\nvector embeddings" as VectorDB
}

package "LLM" {
    usecase "Embed Query and Search Vector DB" as EmbedSearch
    usecase "Format prompt with query\nand context" as PromptLLM
    usecase "LLM engine" as LLMengine
}

package "Backend (FastAPI)" {
    usecase "Fetch Current Prices" as FetchPrices
    usecase "Fetch Data" as FetchData
    usecase "Check Cache" as CheckCache
    usecase "Retrieve Data from External APIs\nand chache" as

In [45]:
retriever = vector_store.as_retriever(search_kwargs={"k": 10})  # retrieve top 20 docs

In [None]:
class MyStreamingHandler(BaseCallbackHandler):
    def on_llm_new_token(self, token: str, **kwargs):
        print(token, end="", flush=True)  # or handle token as you want

class LLMReranker(BaseDocumentCompressor):
    llm_chain: object = Field(LLMChain, description="LLM chain to rerank documents")
    document_variable_name: str = "document"

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        *,
        callbacks: Optional[list] = None,
    ) -> List[Document]:
        
        scored_docs = []
        for doc in documents:
            inputs = {
                "query": query,
                self.document_variable_name: doc.page_content,
            }
            output = self.llm_chain.run(inputs)
            try:
                score = int(output.strip())
            except Exception:
                score = 0
            scored_docs.append((doc, score))
        scored_docs.sort(key=lambda x: x[[1]], reverse=True)
        return [doc for doc, score in scored_docs]
    
class LLMRerankerBatched(BaseDocumentCompressor):
    """ LLM Reranker that uses LLMChain to rerank documents in batches.
    It passes the documents to the LLM in a single call and expects the LLM to return a list of scores.
    """
    llm_chain: object = Field(LLMChain, description="LLM chain to rerank documents")
    document_variable_name: str = "document"

    def compress_documents(
        self,
        documents: Sequence[Document],
        query: str,
        *,
        callbacks: Optional[list] = None,
    ) -> List[Document]:
        
        scored_docs = []
        

In [None]:
# Initialize OpenAI chat model -- this is used for reranking
llm_reranker = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Define a template for reranking
rerank_prompt = PromptTemplate(
    input_variables=["query", "document"],
    template=(
        "Given the query:\n{query}\n\n"
        "Rate the relevance of the following document to the query on a scale from 1 to 10:\n"
        "{document}\n\n"
        "Only output the score as an integer."
    ),
)
rerank_batch_prompt = PromptTemplate(
    input_variables=["query", "documents"],
    template=(
        "Given the query:\n{query}\n\n"
        "Rate the relevance of the following documents to the query on a scale from 1 to 10:\n"
        "{documents}\n\n"
        "Only output the scores as a list of integers."
    ),
)

llm_streaming = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    streaming=True,
    callbacks=[MyStreamingHandler()],
)
llm = ChatOpenAI(
    model="gpt-4o-mini",
    temperature=0.5,
    streaming=False
)

In [None]:
# Create an LLMChain for reranking
#rerank_chain = LLMChain(llm=llm_reranker, prompt=rerank_prompt)
# Instantiate your reranker
#reranker = LLMReranker(llm_chain=rerank_chain)

rerank_chain = LLMChain(llm=llm_streaming, prompt=rerank_batch_prompt)
reranker = LLMRerankerBatched(llm_chain=rerank_chain)

# Wrap your base retriever with ContextualCompressionRetriever using the reranker
reranking_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=reranker,
)

ValidationError: 1 validation error for ContextualCompressionRetriever
base_compressor
  Input should be a valid dictionary or instance of BaseDocumentCompressor [type=model_type, input_value=<function <lambda> at 0x70e8e02b1620>, input_type=function]
    For further information visit https://errors.pydantic.dev/2.11/v/model_type

In [63]:
rerank_chain.invoke({
    "query": "What is the purpose of the App?",
    "document": "This is a sample document content."
})

{'query': 'What is the purpose of the App?',
 'document': 'This is a sample document content.',
 'text': '1'}

In [60]:
prompt_template = ChatPromptTemplate.from_template(
    "You are an expert assistant. Use the following project documents to answer the user's question."
    "If the answer is in the documents, provide it and reference the document(s) used."
    "If the answer is not in the documents, provide a general answer based on your knowledge,"
    "and state that the documents do not contain the answer.\n\n"
    "Context:\n"
    "{context}\n\n"
    "Question:\n"
    "{question}\n\n"
)
def full_chain(query: str):
    query_embedding = embedding_model.embed_query(query)
    reranked_docs = reranking_retriever.base_retriever.vectorstore.similarity_search_by_vector(query_embedding, k=5)
    context = format_documents(reranked_docs)
    response = llm.invoke(prompt_template.format(context=context, question=query))
    return response.content


In [56]:
prompt_template = ChatPromptTemplate.from_template(
    "You are an expert assistant. Use the following project documents to answer the user's question."
    "If the answer is in the documents, provide it and reference the document(s) used."
    "If the answer is not in the documents, provide a general answer based on your knowledge,"
    "and state that the documents do not contain the answer.\n\n"
    "Context:\n"
    "{context}\n\n"
    "Question:\n"
    "{question}\n\n"
)
async def full_chain_stream(query: str):
    # Embed query and retrieve reranked docs as before
    query_embedding = embedding_model.embed_query(query)
    reranked_docs = reranking_retriever.base_retriever.vectorstore.similarity_search_by_vector(query_embedding, k=5)
    context = format_documents(reranked_docs)

    # Prepare prompt input
    prompt_text = prompt_template.format(context=context, question=query)

    # Stream tokens asynchronously from the LLM
    async for token in llm.astream(prompt_text):
        yield token.content  # yield each token as it arrives

In [None]:
message_history = InMemoryChatMessageHistory()

prompt_template = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                "You are an expert assistant. Use the following project documents to answer the user's question. "
                "If the answer is in the documents, provide it and reference the document(s) used. "
                "If the answer is not in the documents, provide a general answer based on your knowledge, "
                "and state that the documents do not contain the answer."
            )
        ),
        MessagesPlaceholder(variable_name="history"),
        ("user", "{question}"),
    ]
)
def full_chain_with_history(query: str):
    # Embed and retrieve documents as before
    query_embedding = embedding_model.embed_query(query)
    reranked_docs = reranking_retriever.base_retriever.vectorstore.similarity_search_by_vector(query_embedding, k=5)
    context = format_documents(reranked_docs)

    # Add the context as a system message or part of the prompt
    # Here we prepend context as a system message for clarity
    system_msg = SystemMessage(content=f"Context:\n{context}")

    # Get current history messages and append the system context message
    history_msgs = message_history.messages + [system_msg]

    # Add the current user query as a HumanMessage
    user_msg = HumanMessage(content=query)

    # Prepare messages for the LLM: history + current user message
    messages = history_msgs + [user_msg]

    # Invoke the LLM with the messages
    response = llm.invoke(messages)

    # Add user and AI messages to history
    message_history.add_user_message(query)
    message_history.add_ai_message(response.content)

    return response.content

async def full_chain_with_history_stream(query: str):
    # Embed and retrieve documents as before
    query_embedding = embedding_model.embed_query(query)
    reranked_docs = reranking_retriever.base_retriever.vectorstore.similarity_search_by_vector(query_embedding, k=5)
    context = format_documents(reranked_docs)

    # Add the context as a system message or part of the prompt
    system_msg = SystemMessage(content=f"Context:\n{context}")

    # Get current history messages and append the system context message
    history_msgs = message_history.messages + [system_msg]

    # Add the current user query as a HumanMessage
    user_msg = HumanMessage(content=query)

    # Prepare messages for the LLM: history + current user message
    messages = history_msgs + [user_msg]

    # Stream tokens asynchronously from the LLM
    async for token in llm.astream(messages):
        yield token.content  # yield each token as it arrives

In [61]:
full_chain("Can you describe the app?")

'The Eprice App is a containerized application designed to allow users to view both current and historical market prices of electricity in Finland. It features a modern tech stack that includes:\n\n- **Frontend**: Built with Svelte and Vite, providing a responsive user interface.\n- **Backend**: Developed using FastAPI, which handles API requests and business logic.\n- **Database**: Utilizes PostgreSQL for robust data storage of electricity price information.\n- **Flyway Migrations**: Facilitates easy management of database schema changes.\n- **Testing**: Incorporates end-to-end testing using Playwright and backend API testing with Pytest.\n- **Chat Engine**: Offers a chat-based interface for user interaction.\n- **Data Loading**: Includes scripts for loading and updating electricity price data into the database.\n\nOverall, the app combines various components to deliver a comprehensive platform for monitoring electricity prices in Finland. \n\nThis information is derived from the READ

In [62]:
response = ""
async for resp in full_chain_stream("Can you describe the App?"):
    response += resp
    print(resp, end="", flush=True)  # Print each token as it arrives    

The Eprice App is a containerized application designed to allow users to view both current and historical market prices of electricity in Finland. It utilizes a modern tech stack that includes:

- **Frontend**: Built with Svelte and Vite, providing a modern and responsive user interface.
- **Backend**: Developed using FastAPI, which handles API requests and business logic.
- **Database**: Utilizes PostgreSQL for robust storage of electricity price data.
- **Data Management**: Employs Flyway migrations to manage database schema changes easily.
- **Testing**: Incorporates end-to-end testing with Playwright and backend API testing with Pytest.
- **Chat Engine**: Features a chat-based interface for user interaction.
- **Data Loading**: Includes scripts for loading and updating electricity price data into the database using a dedicated container.

Overall, the app provides a comprehensive solution for monitoring electricity prices, leveraging modern technologies for a seamless user experien

In [59]:
response

'The purpose of the App, as outlined in the project documents, includes several functionalities:\n\n1. It can be used independently to retrieve or update data, saving data to a location that is available to migrations and the database.\n2. It runs end-to-end tests for the system.\n3. It handles database schema migrations.\n4. It stores application data, including user information and embeddings for retrieval.\n\nThese purposes are detailed in the document titled "Application Overview" found in the project description.'