In [2]:
# GitHub repository to index, in "owner/name" form; passed to load_docs below.
repo_name = "hima12-awny/read-csv-dataframe-cpp"

In [13]:
from llama_index.core import SimpleDirectoryReader
import os
import json
from repo_cloner import RepoCloner


def load_docs(repo_name):
    """Load a cloned repository's supported files as documents with GitHub links.

    Args:
        repo_name: Repository identifier handed to ``RepoCloner.get_repo_info``
            and used as the sub-path under ``./repos`` (e.g. ``"owner/repo"``).

    Returns:
        The documents produced by ``SimpleDirectoryReader``. Each document gets
        two extra metadata entries: ``file_url`` (link to the file at the
        commit reported by ``RepoCloner``) and ``last_updated``.
    """
    repo_info = RepoCloner.get_repo_info(repo_name=repo_name)
    repo_url = repo_info['repo_url']
    last_commit_hash = repo_info['commit_hash']
    last_updated = repo_info['last_updated']

    # File extensions the reader is allowed to pick up.
    with open("supported_files.json", 'r', encoding="utf-8") as sf:
        supported_files_types = json.load(sf)

    repo_path = os.path.join(".", "repos", repo_name)
    # Resolve once so relative paths can be computed against the same root
    # regardless of how the reader reports absolute file paths.
    repo_root = os.path.realpath(repo_path)

    reader = SimpleDirectoryReader(
        input_dir=repo_path,
        recursive=True,
        required_exts=supported_files_types
    )

    docs = reader.load_data(num_workers=4)

    for doc in docs:
        # Compute the path relative to the repo root instead of splitting the
        # string on `repo_name`: splitting picks the wrong tail when the repo
        # itself contains a directory path equal to the repo name.
        rel_path = os.path.relpath(
            os.path.realpath(doc.metadata['file_path']), repo_root)
        rel_path = rel_path.replace('\\', '/')

        doc.metadata['file_url'] = f'{repo_url}/blob/{last_commit_hash}/{rel_path}'
        doc.metadata['last_updated'] = last_updated

    return docs


# Load every supported file of the repository as a document with link metadata.
docs = load_docs(repo_name)

In [14]:
# Pick one sample document to inspect (index 20 was tests.cpp in the recorded run).
doc = docs[20]

In [15]:
# Show the sample's metadata, including the file_url / last_updated keys set by load_docs.
doc.metadata

{'file_path': 'k:\\python\\streamlit\\RAG_Chatbots\\documentation_chatbot\\repos\\hima12-awny\\read-csv-dataframe-cpp\\tests.cpp',
 'file_name': 'tests.cpp',
 'file_type': 'text/plain',
 'file_size': 18375,
 'creation_date': '2025-03-10',
 'last_modified_date': '2025-03-10',
 'file_url': 'https://github.com/hima12-awny/read-csv-dataframe-cpp/blob/32aa37edc5a2f93015e8088651f225d6dc997b24/tests.cpp',
 'last_updated': '2025-03-11T21:36:51.534078'}

In [54]:
# Scratch check of the repo-relative path expression used inside load_docs.
# NOTE(review): the recorded output below names column.cpp while `doc` above is
# tests.cpp, and the execution count is out of order -- the output looks stale.
doc.metadata['file_path'].replace(
    '\\', '/').split(repo_name)[-1]

'/pandascpp/columnDs/column.cpp'

In [21]:
# Dump the full structure of the first document.
# NOTE(review): `documents` is not defined in any cell shown in this notebook;
# presumably left over from an earlier kernel session -- confirm, or use `docs`.
documents[0].model_dump()

{'id_': '19eefe3d-7908-416f-b865-eed2392d6fa9',
 'embedding': None,
 'metadata': {'file_path': 'k:\\python\\streamlit\\RAG_Chatbots\\documentation_chatbot\\repos\\hima12-awny\\read-csv-dataframe-cpp\\pandascpp\\columnDs\\column.cpp',
  'file_name': 'column.cpp',
  'file_type': 'text/plain',
  'file_size': 45278,
  'creation_date': '2025-03-10',
  'last_modified_date': '2025-03-10'},
 'excluded_embed_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'excluded_llm_metadata_keys': ['file_name',
  'file_type',
  'file_size',
  'creation_date',
  'last_modified_date',
  'last_accessed_date'],
 'relationships': {},
 'metadata_template': '{key}: {value}',
 'metadata_separator': '\n',
 'text_resource': {'embeddings': None,
  'text': '#include "../columnDs/column.h"\r\n#include "../tableclass/Table.h"\r\n\r\n\r\n// Getter/Setter Attributes\r\n\r\nvoid column::set_name(string name)\r\n{\r\n\tthis->name = name;\r\n\tin

In [9]:
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

In [None]:
# Split every loaded document into chunks, keeping the originating file name.

# Extension -> langchain `Language` value. Presumably the same mapping that
# load_docs' JSON file holds (e.g. {".cpp": "cpp"}) -- TODO confirm the schema.
with open("supported_files_names.json", 'r', encoding="utf-8") as names_file:
    supported_files_names = json.load(names_file)

CHUNK_SIZE = 1024
CHUNK_OVERLAP = 50

splitted_docs = []
for doc in docs:
    file_name = doc.metadata['file_name']
    # splitext returns '' for extension-less names (e.g. "Makefile") instead of
    # fabricating a bogus ".Makefile" extension.
    file_ex = os.path.splitext(file_name)[1]
    code_lang = supported_files_names.get(file_ex)

    if file_ex == '.txt' or code_lang is None:
        # Plain text, or an extension without a language mapping: use the
        # generic separators rather than failing with a KeyError.
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    else:
        # Language-aware separators for source code.
        splitter = RecursiveCharacterTextSplitter.from_language(
            language=Language(code_lang),
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
        )

    file_chunks = splitter.create_documents([doc.text])

    # One record per chunk; `chunk` is named distinctly so it does not shadow
    # the outer loop's `doc`.
    splitted_docs.extend(
        dict(
            file_name=file_name,
            doc=chunk.page_content
        )
        for chunk in file_chunks
    )

In [1]:
from vecdb_modules.vecdbv2 import VecDB

# Repository ("owner/name") that this vector database instance is bound to.
repo_name = "hima12-awny/read-csv-dataframe-cpp"
vecdb = VecDB(repo_name=repo_name)
# vecdb.load_vecdb()

In [2]:
# Build the vector database; the recorded run parsed nodes, generated embeddings,
# and printed "VecDB Storing Done.".
vecdb.vectorize_db()

Parsing nodes:   0%|          | 0/217 [00:00<?, ?it/s]

Generating embeddings: 100%|██████████| 22/22 [00:06<00:00,  3.16it/s]


VecDB Storing Done.


In [None]:
# Sample natural-language question sent to the vector database below.
query = "what the difference between append and appendPushBack in column data structure?"

In [4]:
# Run the query, then reduce each returned node to its metadata and text.
res = vecdb.query(query)
res = [
    {'metadata': hit.metadata, 'context': hit.text}
    for hit in res
]

VecDB Loading Done.


In [5]:
# First retrieved result for the query.
res[0]

{'metadata': {'source': 'column.cpp',
  'source_url': 'https://github.com/hima12-awny/read-csv-dataframe-cpp/blob/32aa37edc5a2f93015e8088651f225d6dc997b24/pandascpp/columnDs/column.cpp',
  'source_last_updated': '2025-03-11 22:56:19'},
 'context': 'void column::appendPushBack(Object val)\r\n{\r\n\tDtype tempType = val.type;\r\n\r\n\tif (size == 0) {\r\n\t\tvalues.push_back(val);\r\n\t\ttype = tempType;\r\n\t\tsecType = val.secType;\r\n\t}\r\n\telse {\r\n\t\tif (tempType == Dtype::NA) {\r\n\t\t\tvalues.push_back(val);\r\n\t\t}\r\n\t\telse if (mixed_type) {\r\n\t\t\ttype = std::max(tempType, type);\r\n\t\t\tsecType = Object::get_sec_type(type);\r\n\r\n\t\t\tvalues.push_back(val);\r\n\t\t}\r\n\t\telse if (\r\n\t\t\t(tempType != type && type == Dtype::NA) ||\r\n\t\t\t(tempType != type && Object::get_sec_type(tempType) == Dtype::NUMBER)\r\n\t\t\t) {\r\n\r\n\t\t\tvalues.push_back(val);\r\n\r\n\t\t\tif (tempType > type) {\r\n\t\t\t\ttype = tempType;\r\n\t\t\t\tsecType = val.secType;\r\n\t\t\t

In [9]:
# Second retrieved result for the query.
res[1]

{'metadata': {'source': 'column.h',
  'source_url': 'https://github.com/hima12-awny/read-csv-dataframe-cpp/blob/32aa37edc5a2f93015e8088651f225d6dc997b24/pandascpp/columnDs/column.h',
  'source_last_updated': '2025-03-11T21:36:51.534078'},
 'context': 'void append(const vector<Object>& vals);\r\n\r\n\t\t// Append Object Value to the end of the values (Supports Mixed Dtype Value Columns).\r\n\t\tvoid appendPushBack(Object val);\r\n\r\n\t\t// Append Multiples Value from vector to The Values Like in Python (Extend)\r\n\t\tvoid appendPushBack(const vector<Object>& vals);\r\n\r\n\t#pragma endregion\r\n\r\n\t#pragma region Print Function\r\n\r\n\t\t// Print The Value Row useful in Printing the Table.\r\n\t\tvoid printAt(int idx);\r\n\r\n\t\t// Print The Ranges Value from index to another.\r\n\t\tvoid p(int from = -1, int to = -1);\r\n\r\n\t\t// Print where NA values exists as Index.\r\n\t\tvoid print_naidxs();\r\n\r\n\t\t// Print The Head Data Range first value to Limits \r\n\t\tvoid phead(in