In [33]:
import json
import sys
sys.path.append("../")
from utils.db_calls import save_code_to_db, save_documents_to_db, save_code_to_db
from format_documents import filter_files_by_extension, format_all_docs_json, format_documents_json

from format_code import load_code_data, format_code_chunks
from langchain_core.documents import Document  # if you want Document objects
from langchain_text_splitters import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

from transformers import AutoModel, AutoTokenizer
import torch

In [21]:
# Load the previously exported project-file mapping (JSON stored in a .txt file).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# default locale encoding (e.g. cp1252 on Windows), which could otherwise fail
# or mis-decode non-ASCII characters in the file contents.
with open("../data/project_files.txt", "r", encoding="utf-8") as f:
    file_dict = json.load(f)

# Filter files
# NOTE(review): the filtering criteria live in format_documents.filter_files_by_extension.
filtered_files = filter_files_by_extension(file_dict)

# Format for all docs (complete files) -- built from the unfiltered file_dict
all_docs_json = format_all_docs_json(file_dict)

# Format for documents (for embedding/splitting) -- built from the filtered subset
documents_json = format_documents_json(filtered_files)

# Load code data
code_data = load_code_data("../data/backend_code.txt")

In [None]:
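# NOTE: Saving these to the database makes them available for retrieval via tool calls.
# The calls below are intentionally left disabled. Before enabling them, make sure
# `embedding_model` has been created, and that `save_files_to_db` and `all_documents`
# are imported/defined -- they are not set up in the cells above.
#save_files_to_db(all_docs_json)
#save_documents_to_db(all_documents, embedding_model)
#save_code_to_db(code_data, embedding_model)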

In [None]:
# Checkpoint shared by the tokenizer, the raw model, and the embeddings wrapper.
# Larger alternative: "BAAI/bge-large-en-v1.5"
model_name = "BAAI/bge-small-en"

# Raw Hugging Face model and its tokenizer.
# NOTE(review): `model` is not referenced by the cells visible here; the
# embeddings wrapper below is given `model_name`, not this object.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run embeddings on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    embedding_device = "cuda"
else:
    embedding_device = "cpu"

# LangChain embeddings object; normalization is requested at encode time.
embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs={"device": embedding_device},
    encode_kwargs={"normalize_embeddings": True},
)

In [34]:
# Markdown heading markers to split on, each paired with the label that is
# passed to the header splitter for that heading level.
headers_to_split_on = [("#", "Heading 1"), ("##", "Sub heading"), ("###", "Sub-sub heading")]

# Splitter that breaks Markdown text at the heading levels listed above.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Recursive splitter whose chunk size and overlap (512 / 50) are measured with
# the Hugging Face tokenizer loaded for the embedding checkpoint.
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer, chunk_size=512, chunk_overlap=50
)

In [29]:
# Chunk the loaded code data with the text splitter and embed each chunk.
# Per the original note, the helper returns a list of (Document, embedding)
# tuples; `doc_class=Document` asks for LangChain Document objects.
code_chunks = format_code_chunks(code_data, text_splitter, embedding_model, doc_class=Document)

# Separate the pairs into two parallel lists (same order as `code_chunks`).
code_documents = [document for document, _embedding in code_chunks]
code_embeddings = [embedding for _document, embedding in code_chunks]

In [None]:
# The code and documents can be saved to the database for retrieval,
# but doing so requires some additional parsing to match the format.