In [1]:
# First install dependencies
!pip install faiss-cpu cohere==2.9.1 GitPython==3.1.31 numpy tqdm
!pip install langchain_cohere

Collecting cohere==2.9.1
  Using cached cohere-2.9.1-cp310-cp310-linux_x86_64.whl
Installing collected packages: cohere
  Attempting uninstall: cohere
    Found existing installation: cohere 4.36
    Uninstalling cohere-4.36:
      Successfully uninstalled cohere-4.36
Successfully installed cohere-2.9.1


In [19]:
import os
from git import Repo
from tqdm import tqdm
import shutil
import numpy as np
import faiss
from typing import List, Dict, Any
from google.colab import userdata
from langchain_cohere import CohereEmbeddings
from langchain.llms import Cohere

class SecureAgentRAG:
    def __init__(self, cohere_api_key: str):
        """Initialize the RAG system"""
        self.embeddings = CohereEmbeddings(
            model="embed-english-v3.0",
            cohere_api_key=cohere_api_key
            # input_type="search_document"
        )

        self.query_embeddings = CohereEmbeddings(
            model="embed-english-v3.0",
            cohere_api_key=cohere_api_key
            # input_type="search_query"
        )

        self.llm = Cohere(cohere_api_key=cohere_api_key)
        self.dimension = 1024
        self.index = faiss.IndexFlatL2(self.dimension)
        self.texts = []
        self.metadata = []

    def clone_repository(self, local_path: str = "./SecureAgent"):
        """Clone the SecureAgent repository"""
        repo_url = "https://github.com/CoderAgent/SecureAgent.git"
        try:
            if os.path.exists(local_path):
                shutil.rmtree(local_path)
            Repo.clone_from(repo_url, local_path)
            return local_path
        except Exception as e:
            raise Exception(f"Failed to clone SecureAgent: {str(e)}")

    def get_file_content(self, file_path: str) -> str:
        """Read and return file content"""
        encodings = ['utf-8', 'latin-1', 'cp1252']

        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    return file.read()
            except UnicodeDecodeError:
                continue
        return ""

    def process_files(
        self,
        local_path: str,
        ignored_dirs: List[str] = ['.git', '__pycache__', 'venv', 'env'],
        file_extensions: List[str] = ['.py', '.md', '.txt', '.json', '.yaml', '.yml']
    ) -> List[Dict[str, str]]:
        """Process repository files"""
        documents = []

        for root, dirs, files in os.walk(local_path):
            dirs[:] = [d for d in dirs if d not in ignored_dirs]

            for file in files:
                if any(file.endswith(ext) for ext in file_extensions):
                    file_path = os.path.join(root, file)
                    relative_path = os.path.relpath(file_path, local_path)
                    content = self.get_file_content(file_path)

                    if content:
                        documents.append({
                            "content": content,
                            "metadata": {
                                "file_path": relative_path,
                                "file_type": os.path.splitext(file)[1]
                            }
                        })

        return documents

    def create_chunks(self, documents: List[Dict[str, str]], chunk_size: int = 1000) -> List[Dict[str, Any]]:
        """Split documents into chunks"""
        chunks = []

        for doc in documents:
            content = doc["content"]
            lines = content.split('\n')
            current_chunk = []
            current_length = 0

            for line in lines:
                if current_length + len(line) > chunk_size and current_chunk:
                    chunk_text = '\n'.join(current_chunk)
                    chunks.append({
                        "text": chunk_text,
                        "metadata": doc["metadata"]
                    })
                    current_chunk = []
                    current_length = 0

                current_chunk.append(line)
                current_length += len(line)

            if current_chunk:
                chunk_text = '\n'.join(current_chunk)
                chunks.append({
                    "text": chunk_text,
                    "metadata": doc["metadata"]
                })

        return chunks

    def add_to_index(self, chunks: List[Dict[str, Any]]):
        """Create embeddings and add to FAISS index"""
        batch_size = 50

        for i in tqdm(range(0, len(chunks), batch_size)):
            batch = chunks[i:i + batch_size]
            texts = [chunk["text"] for chunk in batch]

            try:
                embeddings = self.embeddings.embed_documents(texts)

                self.index.add(np.array(embeddings))
                self.texts.extend(texts)
                self.metadata.extend([chunk["metadata"] for chunk in batch])

            except Exception as e:
                print(f"Error processing batch {i}: {str(e)}")

    def query(self, question: str, top_k: int = 3) -> str:
        """Query the RAG system"""
        try:
            query_embedding = self.query_embeddings.embed_query(question)

            if len(self.texts) == 0:
                return "No indexed content available to search."

            D, I = self.index.search(np.array([query_embedding]), min(top_k, len(self.texts)))

            context = "\n\n".join([
                f"[From {self.metadata[i]['file_path']}]:\n{self.texts[i]}"
                for i in I[0]
            ])

            response = self.llm.predict(
                f"""Given the following code context, answer the question.

Context:
{context}

Question: {question}

Answer:"""
            )

            return response.strip()

        except Exception as e:
            print(f"Query error: {str(e)}")
            return str(e)

    def process_repository(self, local_path: str = "./SecureAgent"):
        """Main function to process repository"""
        try:
            print("Cloning repository...")
            self.clone_repository(local_path)

            print("Processing files...")
            documents = self.process_files(local_path)

            print("Creating chunks...")
            chunks = self.create_chunks(documents)

            print("Creating embeddings and index...")
            self.add_to_index(chunks)

            print("Ready for queries!")

        except Exception as e:
            print(f"Error processing repository: {str(e)}")
            raise

In [20]:
# Get API key
from google.colab import userdata
COHERE_API_KEY = userdata.get('COHERE_API_KEY')

# Initialize and run
rag = SecureAgentRAG(cohere_api_key=COHERE_API_KEY)
rag.process_repository()

# Example queries
questions = [
    "What security measures are implemented in the agent?",
    "How does the system handle untrusted input?",
    "What logging mechanisms are in place?"
]

for question in questions:
    print(f"\nQuestion: {question}")
    answer = rag.query(question)
    print(f"Answer: {answer}")
    print("-" * 80)

Cloning repository...
Processing files...
Creating chunks...
Creating embeddings and index...


100%|██████████| 4/4 [00:05<00:00,  1.37s/it]
  response = self.llm.predict(


Ready for queries!

Question: What security measures are implemented in the agent?
Answer: The secure-agent you referenced is a Node.js implementation of a GitHub API agent that adds additional authentication headers to the requests it makes to the GitHub API. This agent can be used to authenticate requests to the GitHub API, but it does not implement any security measures beyond what the GitHub API itself provides. 

The agent has various dependencies which are also used to enhance it and these have undergone regular updates to ensure they are secure and free from vulnerabilities. 

In terms of security measures implemented, the node engine is also strictly monitored and only versions 8 and above are allowed as per the engine requirements. This is to ensure that the agent does not run on older, unsupported nodes which may have vulnerabilities. 

Other than this, the package does not explicitly state additional security measures it implements and primarily focuses on API authentication

In [None]:
# Interactive query loop
while True:
    question = input("\nEnter your question (or 'quit' to exit): ")
    if question.lower() == 'quit':
        break

    try:
        answer = rag.query(question)
        print(f"\nAnswer: {answer}")
    except Exception as e:
        print(f"Error: {str(e)}")


Enter your question (or 'quit' to exit): What are some ways we could improve the repo?

Answer: Here are some ways we could potentially improve the AI Review Agent repo: 

1. Add more detailed instructions, including specific terminal commands or code snippets where appropriate, especially for less experienced developers, and those new to Git and GitHub. 

2. Provide better error handling and troubleshooting guidelines: Include a systematic way to handle errors and exceptions with clear and concise error messages that provide developers with actionable insights. Error messages should help users understand what went wrong and how they can fix it. 

3. Include a polishing phase to the AI review agent: Allows reviewers to accept default suggestions, extract key information from the PR description and apply any relevant code formatting to their reviews. 

4. Expand the functionality of the AI Review Agent: There are already intimations in the README.md file that the AI agent is capable of