In [1]:
from parse_code import *
from parse_files import *
from parse_contents import *
import psycopg
import dotenv

import json
import os
from helpers import *

from transformers import AutoModel, AutoTokenizer

from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_core.documents import Document
from langchain_postgres import PGVector

import numpy as np
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
# Load environment variables from .env file
dotenv.load_dotenv("../.env.private")

Using device: cuda


True

### Parsing the project

This is done using a bash script `parse_project.sh` in `Eprice/Notebooks/document_parsing`. Alternatively, you can also uncomment the 2 blocks after this, and do the parsing directly here.

**NOTE: The subsequent code presumes that there is a folder `../data/` with the following files:**

* `backend_code.txt` (all python-server code parsed with LSP or similar -- here we use `jedi` and `ast`)
* `project_files.txt` (all project files as they are, with filename and file content)
* `project_structure.txt` (unix style printout of the project's directory structure)

In [4]:
# system calls -- I wrote some cmdline scripts to do the parsing
# call_1 = "uv run parse_code.py ../../App/python-server/ --replace-source ../../App --replace-target ./App -o ../data/backend_code.txt -e ../data/exclude_imports.txt"
# call_2 = "uv run parse_contents.py -d ../../ -r --exclude-dirs ../data/exclude_dirs.txt --exclude-files ../data/exclude_files.txt -o ../data/project_structure.txt"
# call_3 = "uv run parse_files.py --exclude-dirs ../data/exclude_dirs.txt --exclude-files ../data/exclude_files.txt -o ../data/project_files.txt ../../"

In [None]:
# run_command(call_1)
# run_command(call_2)
# run_command(call_3)

**Read project files, and only keep md, txt, wsd files.**

In [5]:
with open("../data/project_files.txt", "r") as f:
    file_dict = json.load(f)
    all_docs = file_dict.copy()

for key in list(file_dict.keys()):
    if not key.endswith(('.md', '.txt', '.wsd')):
        del file_dict[key]

In [None]:
# save for debugging
#with open("./data/raw_project_documents.json", "w") as f:
#    json.dump(file_dict, f, indent=4)

In [6]:
# load data from file
# with open("./data/raw_project_documents.json", "r", encoding="utf-8") as f:
#     file_dict = json.load(f)


**Add generic type -- this is for compatibility with another project, and not of any real use here**

In [6]:
all_docs_json = []
for key, value in all_docs.items():
    all_docs_json.append({
        "type": "complete_file",
        "name": key,
        "content": value
    })

In [7]:
documents_json = []
for file_path, content in file_dict.items():
    doc_type = "markdown document" if file_path.endswith(".md") else "document"
    documents_json.append({
        "file": file_path,
        "type": doc_type,
        "content": content
    })

In [None]:
# save for debugging
#with open("./data/project_documents.json", "w", encoding="utf-8") as f:
#    json.dump(documents_json, f, ensure_ascii=False, indent=2)

# load the documents
# with open("./data/project_documents.json", "r", encoding="utf-8") as f:
#     documents_json = json.load(f)

**Split to documents and markdown documents.**

In [8]:
documents_dicts = []
markdown_documents_dicts = []
for doc in documents_json:
    if doc["type"] == "document":
        documents_dicts.append(doc)
    elif doc["type"] == "markdown document":
        markdown_documents_dicts.append(doc)
    else:
        raise ValueError(f"Unknown document type: {doc['type']}")

**Read the code data.**

In [9]:
with open('../data/backend_code.txt', 'r', encoding='utf-8') as f:
    code_data = json.load(f)

In [12]:
# formatted_code_entries = [format_code_entry(entry) for entry in code_data]
# formatted_code_entries

In [10]:
len(documents_dicts), len(markdown_documents_dicts), len(code_data), len(all_docs_json)

(10, 15, 120, 123)

### Truncate the existing tables inthe database

The existing tables we are going to update are:

* `documents` (docs with embeddings)
* `code` (code with embeddings)
* `files` (all files as they are)

We want to ***truncate these tables, not drop them***. In addition, there are langchain collections that need to be dropped (with cascade):

* `langchain_pg_collection`
* `langchain_pg_embedding`

In [13]:
def truncate_table(table_name):
    """
    Truncate (remove all rows from) a table.
    Args:
        table_name (str): Name of the table to truncate.
    """
    conn = psycopg.connect(
        dbname=os.getenv("PGDATABASE"),
        user=os.getenv("PGUSER"),
        password=os.getenv("PGPASSWORD"),
        host=os.getenv("PGHOST"),
        port=os.getenv("PGPORT")
    )
    cur = conn.cursor()
    cur.execute(f"TRUNCATE TABLE {table_name} RESTART IDENTITY CASCADE;")
    conn.commit()
    cur.close()
    conn.close()

def drop_table(table_name, cascade=False):
    """
    Drop a table from the database.
    Args:
        table_name (str): Name of the table to drop.
        cascade (bool): Whether to use CASCADE option.
    """
    conn = psycopg.connect(
        dbname=os.getenv("PGDATABASE"),
        user=os.getenv("PGUSER"),
        password=os.getenv("PGPASSWORD"),
        host=os.getenv("PGHOST"),
        port=os.getenv("PGPORT")
    )
    cur = conn.cursor()
    sql = f"DROP TABLE IF EXISTS {table_name} {'CASCADE' if cascade else ''};"
    cur.execute(sql)
    conn.commit()
    cur.close()
    conn.close()

In [14]:
truncate_table("documents")
truncate_table("files")
truncate_table("code")

In [16]:
drop_table("langchain_pg_collection", cascade=True)
drop_table("langchain_pg_embedding")

### Prepare the model for embedding the code

I am using a small model for docstring embeddings.

In [17]:
# Load the model
model_name = "BAAI/bge-small-en" # "BAAI/bge-large-en-v1.5"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Create the embeddings object
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [None]:
#print(f"Model's maximum sequence length: {SentenceTransformer(model_name).max_seq_length}")
#print(f"Model's embedding dimensionality: {len(embedding_model.embed_query('some random query'))}")

Model's maximum sequence length: 512
Model's embedding dimensionality: 384


In [None]:
# save_code_to_db(code_data, embedding_model)

In [None]:
# all_documents = documents_dicts + markdown_documents_dicts
# save_documents_to_db(all_documents, embedding_model)

In [None]:
# save_files_to_db(all_docs_json)

## Splitting the documents for embedding

In [22]:
# define text splitters
headers_to_split_on = [
    ("#", "Heading 1"),
    ("##", "Sub heading"),
    ("###", "Sub-sub heading"),
]

# Initialize the Markdown splitter
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=512,
    chunk_overlap=50,
)

In [23]:
# chunk and embed the documents
all_documents = documents_dicts + markdown_documents_dicts
data = []
for doc in all_documents:
    if doc["type"] == "document":
        chunks = text_splitter.split_text(doc["content"])
        for chunk in chunks:
            embedding = embedding_model.embed_query(chunk)
            data.append({
                "file": doc["file"],
                "type": doc["type"],
                "content": chunk,
                "metadata": None,
                "embedding": embedding
            })
    elif doc["type"] == "markdown document":
        chunks = markdown_splitter.split_text(doc["content"])
        for chunk in chunks:
            subchunks = text_splitter.split_text(chunk.page_content)
            for subchunk in subchunks:
                embedding = embedding_model.embed_query(subchunk)
                data.append({
                    "file": doc["file"],
                    "type": doc["type"],
                    "content": subchunk,
                    "metadata": join_metadata(chunk.metadata),
                    "embedding": embedding_model.embed_query(subchunk)
                })

# Convert to Documents
documents = []
embeddings_list = []
for item in data:
    doc = Document(
        page_content=item['content'],
        metadata={
            'file': item['file'],
            'type': item['type'],
            'heading': item['metadata'],
        }
    )
    documents.append(doc)
    embeddings_list.append(np.array(item['embedding'], dtype=np.float32))

embeddings_matrix = np.vstack(embeddings_list)

In [24]:
# Chunk and embed code files, matching doc structure
code_chunks = []
for entry in code_data:
    # Combine docstring and code for context, or just use code
    docstring = entry.get("docstring", "")
    code_text = entry.get("code", "")
    full_text = f"{docstring}\n{code_text}" if docstring else code_text

    # Chunk the code
    chunks = text_splitter.split_text(full_text)
    for chunk in chunks:
        # Prepare metadata: include all keys except file, type, code, docstring, and start_line
        metadata = {k: v for k, v in entry.items() if k not in ["file", "type", "code", "docstring", "start_line"]}
        doc = Document(
            page_content=chunk,
            metadata={
                "file": entry.get("file", ""),
                "type": entry.get("type", ""),
                "metadata": metadata if metadata else None
            }
        )
        embedding = embedding_model.embed_query(chunk)
        code_chunks.append((doc, embedding))

# Unpack for later use
code_documents = [doc for doc, _ in code_chunks]
code_embeddings_list = [np.array(emb, dtype=np.float32) for _, emb in code_chunks]
code_embeddings_matrix = np.vstack(code_embeddings_list)

The embeddings are pre-calculated only if we choose to use something like FAISS. If we use our own postgres database, we calculate the embeddings at the same time we save the docs.

In [25]:
# combine code and other documents
documents.extend(code_documents)
embeddings_matrix = np.vstack((embeddings_matrix, code_embeddings_matrix))

In [26]:
len(documents)

249

In [27]:
len(code_documents),len(documents)

(144, 249)

#### the database has to be up and running for this to work

In [28]:
connection_string = "postgresql+psycopg://username:password@localhost:5432/database"
collection_name = "project_documents"
vector_store = PGVector(
    embeddings=embedding_model,
    collection_name=collection_name,
    connection=connection_string,
)


In [30]:
# add documents to the vector store (if not already added)
# vector_store.add_documents(documents=documents)

### Test how the vector store works

In [23]:
query = "Frontend implementation?"
query_embedding = embedding_model.embed_query(query)  # get embedding for query
results = vector_store.similarity_search_by_vector(query_embedding, k=10)

for doc in results:
    print(f"File: {doc.metadata['file']}")
    if "Heading" in doc.metadata:
        print(f"Heading: {doc.metadata['heading']}")
    print(f"Content: {doc.page_content}...\n")

File: ./Documents/backend_design.md
Content: A Svelte frontend can use form actions and server-side calls for sensitive or restricted operations. When a user submits a form (such as login or register), the request is sent to the FastAPI backend, which processes it through the controller-service-repository pipeline. This ensures data is validated, business rules are enforced, and database operations are performed securely....

File: ./Documents/backend_design.md
Content: ```plantuml
@startuml
[Svelte Form] --> [Controller (FastAPI Route)]
[Controller (FastAPI Route)] --> [Service (Business Logic)]
[Service (Business Logic)] --> [Repository (Database Access)]
[Repository (Database Access)] --> [Database]
@enduml
```  
This pattern helps build robust, maintainable, and secure web applications, especially when handling authentication and other sensitive operations....

File: ./App/README.md
Content: 1. Frontend (Client)
Built with Svelte and Vite.
Located in the client/ directory.
See clie

# ------------------------------------------------------------------------------

### Next

Now we can simply load the vector store in the chat engine. We can also define a separate interface to retrieve code (whether by embeddings or by name)

In [None]:
# Load the model
model_name = "BAAI/bge-small-en" # "BAAI/bge-large-en-v1.5"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name) # maybe needed for token counting
# Create the embeddings object
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": "cuda" if torch.cuda.is_available() else "cpu"},
    encode_kwargs={"normalize_embeddings": True},
)

In [None]:
connection_string = "postgresql+psycopg://username:password@localhost:5432/database"
collection_name = "project_documents"
vector_store = PGVector(
    embeddings=embedding_model,
    collection_name=collection_name,
    connection=connection_string,
)


In [26]:
all_files = get_all_files()

In [30]:
docs=load_documents_from_db()

In [33]:
res=load_code_from_db()

In [38]:
class CodeEmbeddingRetriever:
    def __init__(self, n=5):
        self.n = n

    def retrieve(self, query_embedding, n=None):
        """
        Retrieve the n most similar code entries from the database using an embedded query vector.
        Args:
            query_embedding (list or np.ndarray): The query embedding vector.
            n (int, optional): Number of results to return. Defaults to self.n.
        Returns:
            List[dict]: List of code entries with metadata and code.
        """
        if not isinstance(query_embedding, list):
            query_embedding = list(map(float, np.array(query_embedding).flatten()))
        if n is None:
            n = self.n

        conn = psycopg.connect(
            dbname=os.getenv("PGDATABASE"),
            user=os.getenv("PGUSER"),
            password=os.getenv("PGPASSWORD"),
            host=os.getenv("PGHOST"),
            port=os.getenv("PGPORT")
        )
        cur = conn.cursor()
        cur.execute("""
            SELECT file, type, name, docstring, code
            FROM code
            ORDER BY embedding <-> %s::vector
            LIMIT %s
        """, (query_embedding, n))

        rows = cur.fetchall()
        code_data = []
        for row in rows:
            code_data.append({
                "file": row[0],
                "type": row[1],
                "name": row[2],
                "docstring": row[3],
                "code": row[4],
            })

        cur.close()
        conn.close()
        return code_data

    def __call__(self, query_embedding, n=None):
        return self.retrieve(query_embedding, n)

    def invoke(self, query_embedding, n=None):
        return self.retrieve(query_embedding, n)

# Aliases for convenience
CodeEmbeddingRetriever.__invoke__ = CodeEmbeddingRetriever.invoke
CodeEmbeddingRetriever.__call__ = CodeEmbeddingRetriever.__call__

In [33]:
get_file_by_name("./Documents/openapi_endpoint_descriptions.md")

{'name': './Documents/openapi_endpoint_descriptions.md',
 'type': 'complete_file',
 'content': '# Eprice API Endpoint Overview\n\nThis document provides a concise technical overview of the main API endpoints exposed by the Eprice backend, as described in the OpenAPI specification.  \nThe API is organized into **public data endpoints** (for electricity, etc.) and **authentication endpoints**.  \nAll other endpoints require authentication via JWT.\n\nThe API is designed for both public and authenticated use. Public endpoints provide enough data for basic electricity price queries and user authentication, while authenticated endpoints allow access to more detailed or user-specific data.\n\n- The use of POST for range queries (instead of GET with query parameters) allows for more complex request bodies and easier extension in the future.\n- The API is well-structured for integration with frontend applications and external systems.\n\n---\n\n## Public Endpoints\n\nThese endpoints are access

In [34]:
query = "how to get porssisahko data from the database"
retrieve_similar_code(embedding_model.embed_query(query), n=5)

[{'file': './App/python-server/scheduled_tasks/porssisahko_scheduler.py',
  'type': 'function',
  'name': 'scheduled_tasks.porssisahko_scheduler.fetch_and_insert_porssisahko_data_sync',
  'docstring': 'Synchronous wrapper to run fetch_and_insert_porssisahko_data in an event loop.',
  'start_line': 111,
  'code': 'def fetch_and_insert_porssisahko_data_sync():\n    """\n    Synchronous wrapper to run fetch_and_insert_porssisahko_data in an event loop.\n    """\n    asyncio.run(fetch_and_insert_porssisahko_data())',
  'embedding': '[-0.0541071,0.0184408,-0.018578207,-0.005205048,-0.014615599,-0.014795278,0.0021631205,-0.029739203,0.02542764,-0.0024358905,0.003369776,-0.02468088,0.032465126,0.00014488037,0.0018276252,0.0008100931,-0.032298207,0.0343139,-0.026042046,0.018603347,0.0075262203,-0.02725544,-0.03283639,-0.0594523,-0.030710492,0.040958803,-0.04205632,-0.04524875,-0.020730289,-0.17762847,0.003269431,-0.04496451,-0.0045276033,-0.013060949,0.04402135,-0.019571584,-0.016268227,0.0577

In [39]:
retriever = CodeEmbeddingRetriever(n=5)

In [40]:
retriever.retrieve(embedding_model.embed_query("database access"), n=5)

[{'file': './App/python-server/repositories/user_repository.py',
  'type': 'function',
  'name': 'repositories.user_repository.UserRepository.__init__',
  'docstring': 'Initialize the UserRepository with a database connection URL.\n\nArgs:\n    database_url (str): The database connection URL.',
  'code': '    def __init__(self, database_url: str):\n        """\n        Initialize the UserRepository with a database connection URL.\n\n        Args:\n            database_url (str): The database connection URL.\n        """\n        self.database_url = database_url'},
 {'file': './App/python-server/repositories/user_repository.py',
  'type': 'class',
  'name': 'repositories.user_repository.UserRepository',
  'docstring': 'Repository class for user-related database operations in the Eprice backend.\n\nProvides asynchronous methods for retrieving, creating, and updating user records,\nas well as verifying user email addresses. Interacts directly with the PostgreSQL\ndatabase using asyncpg.\n