In [1]:
from app.agent.multi_agent import MultiAgent
from app.agent.agent import ContextTypeAgent, QuestionTypeAgent
from app.prompts.prompts import SIMPLE_VS_COMPLEX, GENERAL_VS_PARTICULAR_CONTEXT
from app.prompts.prompt import Prompt

import asyncio
import nest_asyncio
nest_asyncio.apply()

# Question-type classifier: "simple" (0-1 subjects) vs "complex" (more than 1 subject)
simple_vs_complex_prompt = Prompt(prompt=SIMPLE_VS_COMPLEX)
simple_vs_complex_agent = QuestionTypeAgent(instruction=simple_vs_complex_prompt)

# Context-type classifier: "general" vs "particular"
general_vs_particular_prompt = Prompt(GENERAL_VS_PARTICULAR_CONTEXT)
general_vs_particular_agent = ContextTypeAgent(instruction=general_vs_particular_prompt)

# Both classifiers are handed to a single MultiAgent, question-type agent first
multi_agent = MultiAgent(
    agents=[simple_vs_complex_agent, general_vs_particular_agent],
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from app.database.base import get_db
from sqlalchemy.orm import Session

# Take the first value produced by get_db() and keep it as the notebook-wide `db`
# (presumably a SQLAlchemy Session, since it is passed as `db: Session` later — TODO confirm).
# NOTE(review): the iterator returned by get_db() is never resumed, so if get_db has
# cleanup code after its yield, it is not triggered from this cell.
db = next(get_db())

In [3]:
from app.retrievers.general_retriever import GeneralRetriever
from app.retrievers.similarity_retriever import SimilarityRetriever
from app.retrievers.relationships_retriever import RelationShipRetriever


async def tool_pipeline(agent: MultiAgent, query: str, db: Session=db):
    """Let the agent choose a retriever for ``query`` and return that retriever's result.

    Args:
        agent: Multi-agent whose ``pipeline`` coroutine returns a ``(tool, output)`` pair.
        query: Question forwarded to both the agent and the retriever.
        db: Database session used to instantiate the retriever; defaults to the
            notebook-level ``db`` that exists when this cell is executed.

    Returns:
        Whatever ``query_database`` of the chosen retriever returns.
    """
    retriever_cls, agent_output = await agent.pipeline(query=query)
    # The agent returns the retriever class; it is instantiated here with the session
    retriever: GeneralRetriever | SimilarityRetriever = retriever_cls(db=db)
    return retriever.query_database(query=query, subjects=agent_output.subject)

In [4]:
from app.agent.llama_client import LlamaClient
from app.printer import Printer
from app.prompts.prompts import PROMPT_TO_ANSWER_QUESTIONS

# Default value of the `threshold` argument of query_pipeline
THRESHOLD = 0.25

printer = Printer()
llm = LlamaClient()

# Example question; later cells assign their own `query` before running the pipeline
query = "How does the function _create_file_node depend on the function _create_nodes_of_file?"

def format_answer(answer: str, max_words: int) -> str:
    """Re-wrap ``answer`` so that no output line holds more than ``max_words`` words.

    Each ``"\\n\\n"`` in ``answer`` is first replaced by a single ``"\\n"``. Every
    remaining line is split on single spaces and emitted in chunks of at most
    ``max_words`` words, one chunk per output line.

    Args:
        answer: Text to re-wrap.
        max_words: Maximum number of space-separated words per output line;
            must be at least 1.

    Returns:
        A string that starts with a single space, followed by ``"\\n" + chunk``
        for every chunk (so the first printed line contains only a space).

    Raises:
        ValueError: If ``max_words`` is smaller than 1. Without this check a
            negative value would silently drop the whole text.
    """
    if max_words < 1:
        raise ValueError(f"max_words must be a positive integer, got {max_words}")

    chunks = []
    for line in answer.replace("\n\n", "\n").split("\n"):
        words = line.split(" ")
        chunks.extend(
            " ".join(words[start:start + max_words])
            for start in range(0, len(words), max_words)
        )
    # Single join instead of repeated string concatenation; the leading " " is
    # kept so the output for valid input is unchanged.
    return " " + "".join("\n" + chunk for chunk in chunks)

async def query_pipeline(agent: MultiAgent, 
                         query: str, 
                         llm: LlamaClient, 
                         db: Session, 
                         threshold: float=THRESHOLD) -> tuple:
    """Answer ``query`` with the LLM, using retrieved nodes and filtered relationships as context.

    The retrieval step is delegated to ``tool_pipeline``. The retrieved
    relationships (if any) are filtered with
    ``RelationShipRetriever.filter_relationships``; the texts of the nodes and
    of the filtered relationship nodes are joined with newlines and formatted
    into ``PROMPT_TO_ANSWER_QUESTIONS`` before calling ``llm.acall``.

    Args:
        agent: Multi-agent that selects the retriever for the query.
        query: Question to answer.
        llm: Client whose ``acall`` coroutine produces the answer.
        db: Database session handed to the retriever.
        threshold: Value forwarded to ``filter_relationships``.

    Returns:
        A 3-tuple ``(answer, relationships, filtered_relationships)``:
        the value returned by ``llm.acall``, the relationships as returned by
        the retriever, and the filtered relationships (an empty dict when the
        retriever returned no relationships).
    """
    nodes, nodes_with_score, relationships = await tool_pipeline(agent=agent, query=query, db=db)

    for node_with_score in nodes_with_score:
        printer.print_blue(f"Score: {node_with_score.score} for text: \n{node_with_score.node.text[:300]}\n")

    filtered_relationships = {}
    if relationships:
        for relation, relation_nodes in relationships.items():
            for position, rel_node in enumerate(relation_nodes, start=1):
                printer.print_blue(f"\tRelationship {position} for node --> {relation}: \n{rel_node.text[:150]}\n")
        relationship_retriever = RelationShipRetriever(query=query, nodes=nodes, relationships=relationships)
        filtered_relationships = relationship_retriever.filter_relationships(threshold=threshold)

    # Collect every text first and join once, so the last node text and the
    # first relationship text are separated by a newline instead of running together.
    context_texts = [node.text for node in nodes]
    context_texts.extend(
        rel_node.text
        for relation_nodes in filtered_relationships.values()
        for rel_node in relation_nodes
    )
    context = "\n".join(context_texts)

    prompt = Prompt.format_prompt(prompt=PROMPT_TO_ANSWER_QUESTIONS, context=context, query=query)
    answer = await llm.acall(query=prompt)
    return answer, relationships, filtered_relationships

In [5]:
# Question about a single method; `threshold` is not passed, so query_pipeline uses its default
query = "If I change the parameter depth of the method: __retrieve_relationship_nodes what effect will that have?"
answer, relationships, filterd_relationships = asyncio.run(
    query_pipeline(agent=multi_agent, query=query, llm=llm, db=db)
)

[34mAgent reasoning response: The subject is the method __retrieve_relationship_nodes therefore the answer is simple because there is only one subject[0m
[34mAgent answer: simple 
[0m
[34mAgent reasoning response: The question involves an unspecified effect, which is different than the subject, therefore the answer is general.[0m
[34mAgent answer: general 
[0m
[32mTool decided by the agent: GeneralRetriever[0m




[34m	Exact match of subject: {'__retrieve_relationship_nodes'} in the database. --> 1[0m
[34mScore: 1 for text: 
    def __retrieve_relationship_nodes(self, base_id: str, node: Node, depth: int):
        if base_id == str(node.id) or depth == 0: 
            return [node.id]
        relations = []
        node_relationships = node.node_relationships
        if not node_relationships or not len(node_relationshi
[0m
[34m	Relationship 1 for node --> 3e63f0f0-81cc-4e00-bfe1-bef785b07b94: 
class Node(Base):

    __tablename__ = "node"
    id = Column(UUID(as_uuid=True), primary_key=True, index=True, default=uuid.uuid4)
    node_type = Co
[0m




[34m	Filtered relationships -> Before filtering: 1. After: 1[0m


In [6]:
# Store the wrapped text under its own name: `answer` keeps the raw LLM output,
# so re-running this cell does not re-wrap text that was already formatted.
formatted_answer = format_answer(answer=answer, max_words=15)
print(formatted_answer)

 
Let's dive into it!
The `depth` parameter in the `__retrieve_relationship_nodes` method controls how many levels of relationships to retrieve.

If you increase the value of `depth`, this method will recursively traverse more levels of
relationships. For example, if you set `depth=1`, it will only retrieve direct child nodes of
the input node. If you set `depth=2`, it will also retrieve the children's children (grandchildren),
and so on.
On the other hand, decreasing the value of `depth` will limit the depth of the
relationship traversal. For instance, if you set `depth=0`, it will only return the ID of
the input node itself, without any relationships. If you set `depth=-1`, it will raise a
`ValueError`.
Here's an example to illustrate this:
```
def __retrieve_relationship_nodes(self, base_id: str, node: Node, depth: int):
    # ...
    if depth == 0:
        return [node.id]  # only the input
node itself
    for id, _ in node_relationships.items():
        node_ = self._db.get(Node, 

In [8]:
# Question that names several methods; `threshold` is not passed, so query_pipeline uses its default
query = """
how do the methods: _check_common_parent_nodes, _check_relationships_of_retrieved_nodes, return_nodes_after_apply_threshold_filter and return_nodes_with_score_after_apply_threshold_filter work together 
to improve the result of the function query_vector_database?
"""

answer, relationships, filterd_relationships = asyncio.run(
    query_pipeline(agent=multi_agent, query=query, llm=llm, db=db)
)

[34mAgent reasoning response: The subject is the function query_vector_database therefore the answer is simple because there is only one subject.[0m
[34mAgent answer: simple 
[0m
[34mAgent reasoning response: The question is not about a specific method, but how they work together to improve the result of the function query_vector_database, which refers to the subject in a general way.[0m
[34mAgent answer: general 
[0m
[32mTool decided by the agent: GeneralRetriever[0m
[34m	Exact match of subject: {'query_vector_database'} in the database. --> 1[0m
[34mScore: 1 for text: 
async def query_vector_database(request: Request, db: Session = Depends(get_db)):
    
    import psycopg2
    from pgvector.psycopg2 import register_vector
    from .database.base import SQLALCHEMY_DATABASE_URL
    import numpy as np
    
    body = await request.json()
    code = body['code']
    
[0m
[34m	Relationship 1 for node --> 3bce3521-3592-4e3f-9b8b-af9b587a424b: 
class NodeWithScore:
    
    

In [11]:
# Store the wrapped text under its own name: `answer` keeps the raw LLM output,
# so re-running this cell does not re-wrap text that was already formatted.
formatted_answer = format_answer(answer=answer, max_words=15)
print(formatted_answer)

 
 
A Python expert!
Let's break down how these methods work together to improve the result of the `query_vector_database`
function:
**Overview**
The `NodePostProccesor` class is designed to process and filter nodes from a database. The four
methods you mentioned are part of this processing pipeline.
**Method 1: `_check_common_parent_nodes`**
This method takes the retrieved nodes and checks for common parent nodes (methods) and files.
It returns a list of tuples containing the parent node IDs and their frequencies.
**Method 2: `_check_relationships_of_retrieved_nodes`**
This method recursively explores the relationships between nodes, starting from each node in the `retrieved_nodes`
list. For each node, it checks its relationships (i.e., child nodes) and adds them to
a list. The recursion depth is controlled by the `depth` parameter.
**Method 3: `return_nodes_after_apply_threshold_filter`**
This method simply returns the filtered nodes (`_retrieved_nodes`) that passed the threshold sco

In [12]:
# Print the full text of every relationship node that survived the filtering.
# Iterating over .values() avoids binding `_` at notebook level, which would
# shadow IPython's "last output" variable.
for rel_nodes in filterd_relationships.values():
    for node in rel_nodes:
        print(node.text)

class File(Base):

    __tablename__ = "file"
    id = Column(UUID(as_uuid=True), primary_key=True, index=True, default=uuid.uuid4)
    hash = Column(String, nullable=False, index=True, unique=True)
    path = Column(String, nullable=False, index=True, unique=True)
    created_at = Column(DateTime, nullable=False, server_default=sqlalchemy.func.now(), onupdate=sqlalchemy.func.now())
    updated_at = Column(DateTime, nullable=False, server_default=sqlalchemy.func.now(), onupdate=sqlalchemy.func.now())

    nodes = relationship("Node", back_populates="file")

class Node(Base):

    __tablename__ = "node"
    id = Column(UUID(as_uuid=True), primary_key=True, index=True, default=uuid.uuid4)
    node_type = Column(Enum(NodeType), nullable=False)
    file_id = Column(UUID, ForeignKey("file.id", ondelete='CASCADE'), nullable=False)
    parent_node_id = Column(UUID(as_uuid=True), ForeignKey("node.id"), nullable=True)
    text = Column(Text, nullable=False)
    embedding_text_1536 = Column(Vect

As we can see, the `File`, `Node` and `NodePostProcessor` classes were added to the context as additional relationships, giving the LLM better context.

In [5]:
# Baseline run with threshold=1: in the recorded output all 8 retrieved relationships
# were filtered out, so the answer is produced without relationship context.
query = """
How does the function upload_file_zip works?
"""

answer_without_relationships, relationships, filterd_relationships = asyncio.run(
    query_pipeline(agent=multi_agent, query=query, llm=llm, db=db, threshold=1)
)

[34mAgent reasoning response: The subject is the function upload_file_zip therefore the answer is simple because there is only one subject.[0m
[34mAgent answer: simple 
[0m
[34mAgent reasoning response: The question is regarding the function upload_file_zip itself, therefore the answer is particular[0m
[34mAgent answer: particular 
[0m
[32mTool decided by the agent: SimilarityRetriever[0m




[34mOriginal Query: 
How does the function upload_file_zip works?
[0m
[34m	Exact match of subject: upload_file_zip in the database. --> 1[0m
[34mRelationships of node retrieve: 8[0m
[34mScore: 1 for text: 
async def upload_file_zip(file: UploadFile = File(...), db: Session = Depends(get_db)):

    extract_dir = os.environ['USER_CODE_DIRECTORY']
    if not os.path.exists(extract_dir):
        # shutil.rmtree(extract_dir)
        
        os.makedirs(extract_dir, exist_ok=True)

        with open(f"{ext
[0m
[34m	Relationship 1 for node --> ffcb964e-0fd7-471a-9642-a3b20ad0ad4e: 
def _create_file_node(path: str, db: Session):
    updated_files = []
    for root, _, files in os.walk(path):
        for file in files:
            
[0m
[34m	Relationship 2 for node --> ffcb964e-0fd7-471a-9642-a3b20ad0ad4e: 
async def upload_file_zip(file: UploadFile = File(...), db: Session = Depends(get_db)):

    extract_dir = os.environ['USER_CODE_DIRECTORY']
    if no
[0m
[34m	Relationship 3 fo



[34m	Filtered relationships -> Before filtering: 8. After: 0[0m


In [6]:
# Wrap the answer of the threshold=1 run to 15 words per line for display
formated_answer_without_rel = format_answer(
    answer=answer_without_relationships,
    max_words=15,
)
print(formated_answer_without_rel)

 
I'd be happy to explain how the `upload_file_zip` function works.
The function is designed to handle uploading a ZIP file and extracting its contents to
a specific directory, which is stored in an environment variable called `USER_CODE_DIRECTORY`.
Here's a step-by-step breakdown of what the function does:
1. It checks if the specified directory exists. If it doesn't, it creates it.
2. It writes the uploaded ZIP file to disk using the `file.read()` method and then
extracts its contents using the `ZipFile` class from Python's standard library.
3. After extraction, it removes the original ZIP file from disk.
Once the contents of the ZIP file are extracted, the function does some additional processing:
1. It finds all Python files (`*.py`) in the extracted directory and adds their paths
to a list called `py_files`.
2. For any other files (not `.py`), it removes them.
3. Finally, it calls two helper functions `_create_file_node` and `_create_node_relationships_file` to perform some unknow

In [16]:
# Same question as the threshold=1 run, this time filtering relationships with threshold=0.3
query = """
How does the function upload_file_zip works?
"""

answer_with_relationships, relationships, filterd_relationships = asyncio.run(
    query_pipeline(agent=multi_agent, query=query, llm=llm, db=db, threshold=0.3)
)

[34mAgent reasoning response: The subject is the function upload_file_zip therefore the answer is simple because there is only one subject[0m
[34mAgent answer: simple 
[0m
[34mAgent reasoning response: The question is regarding the function itself, therefore the answer is particular[0m
[34mAgent answer: particular 
[0m
[32mTool decided by the agent: SimilarityRetriever[0m




[34mOriginal Query: 
How does the function upload_file_zip works?
[0m
[34m	Exact match of subject: upload_file_zip in the database. --> 1[0m
[34mRelationships of node retrieve: 8[0m
[34mScore: 1 for text: 
async def upload_file_zip(file: UploadFile = File(...), db: Session = Depends(get_db)):

    extract_dir = os.environ['USER_CODE_DIRECTORY']
    if not os.path.exists(extract_dir):
        # shutil.rmtree(extract_dir)
        
        os.makedirs(extract_dir, exist_ok=True)

        with open(f"{ext
[0m
[34m	Relationship 1 for node --> ffcb964e-0fd7-471a-9642-a3b20ad0ad4e: 
def _create_file_node(path: str, db: Session):
    updated_files = []
    for root, _, files in os.walk(path):
        for file in files:
            
[0m
[34m	Relationship 2 for node --> ffcb964e-0fd7-471a-9642-a3b20ad0ad4e: 
async def upload_file_zip(file: UploadFile = File(...), db: Session = Depends(get_db)):

    extract_dir = os.environ['USER_CODE_DIRECTORY']
    if no
[0m
[34m	Relationship 3 fo

In [17]:
# Wrap the answer of the threshold=0.3 run to 15 words per line for display
formated_answer_with_rel = format_answer(
    answer=answer_with_relationships,
    max_words=15,
)
print(formated_answer_with_rel)

 
Let's dive into the code and explain how `upload_file_zip` works.
The `upload_file_zip` function takes two parameters:
- `file`: This is an UploadFile object, which seems to be a file uploaded through
some kind of API or form. It has a `filename` attribute.
- `db`: This is a database session (more on this later). The `Depends(get_db)` part suggests
that this function relies on the result of calling `get_db()`.
Here's what the function does:
1. It extracts an environment variable called `USER_CODE_DIRECTORY`. If this directory doesn't exist, it creates
it.
2. It writes the uploaded file to a local file in this directory.
3. It then extracts a ZIP archive from the uploaded file (assuming it's a ZIP
file) and places its contents into the same directory.
4. Finally, it removes the original uploaded ZIP file and some other files/directories within that
directory.
Let me break down what's happening here:
- `os.environ['USER_CODE_DIRECTORY']`: This line is accessing an environment variable 

In this case the LLM has probably run out of context, so the quality of the answer is lower. That is why it is so important to correctly filter the relationships we want to use.