In [27]:
import os
import glob
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

In [1]:
def read_files_in_repo(repo_path, file_extensions=None):
    """
    Recursively read all files in the repo with the given file extensions.
    Returns a list of (filename, file_text).
    """
    if file_extensions is None:
        file_extensions = [".md", ".py", ".js", ".ts", ".txt", ".html", ".css","ipynb"]

    filepaths = []
    for ext in file_extensions:
        filepaths.extend(glob.glob(os.path.join(repo_path, f"**/*{ext}"), recursive=True))

    file_contents = []
    for fp in filepaths:
        try:
            with open(fp, "r", encoding="utf-8") as f:
                text = f.read()
                if text.strip():
                    file_contents.append((fp, text))
        except Exception as e:
            print(f"Could not read file {fp}: {e}")
    return file_contents

def chunk_text(text, chunk_size=500, overlap=50):
    """
    Split text into overlapping chunks of chunk_size characters (naive approach).
    """
    chunks = []
    start = 0
    while start < len(text):
        end = min(start + chunk_size, len(text))
        chunk = text[start:end]
        chunks.append(chunk)
        start += (chunk_size - overlap)
    return chunks

repo_path = "/content/community-ai"  # Change to your local path if needed

file_contents = read_files_in_repo(repo_path)
chunks_data = []
for (filename, text) in file_contents:
    for chunk in chunk_text(text, chunk_size=700, overlap=200):
        chunks_data.append({"filename": filename, "content": chunk})

print("Number of chunks:", len(chunks_data))


Number of chunks: 794


In [3]:
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding_model = SentenceTransformer(embedding_model_name)

# Create embeddings for each chunk
chunk_embeddings = []
for doc in chunks_data:
    embedding = embedding_model.encode(doc["content"])
    chunk_embeddings.append(embedding)

chunk_embeddings = np.array(chunk_embeddings)
print(chunk_embeddings.shape)  # Should be (#chunks, embedding_dim)


(794, 384)


In [10]:
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [18]:

d = chunk_embeddings.shape[1]  # Dimensionality of embeddings
index = faiss.IndexFlatL2(d)   # L2 distance index
index.add(chunk_embeddings)

print(f"Total embeddings indexed: {index.ntotal}")


Total embeddings indexed: 794


In [19]:
def retrieve_chunks(query, k=5):
    # Embed the query
    query_embedding = embedding_model.encode(query)
    query_embedding = np.array([query_embedding], dtype=np.float32)

    # Search
    distances, indices = index.search(query_embedding, k)

    # Grab the corresponding chunks
    retrieved = []
    for dist, idx in zip(distances[0], indices[0]):
        retrieved.append((dist, chunks_data[idx]))
    return retrieved


In [31]:
import os
from groq import Groq

# Instantiate a Groq client using your API key.
client = Groq(api_key="add your token")

def generate_answer(query, retrieved_docs):
    """
    Builds a prompt that includes the retrieved repository context and the user question.
    Then it sends this prompt to the Groq Chat API and returns the assistant’s answer.
    """
    # Combine the retrieved document chunks into a context string.
    context = ""
    for _, doc in retrieved_docs:
        context += f"\n[File: {doc['filename']}]\n{doc['content']}\n"

    # Define the conversation for Groq.
    system_prompt = ("You are a helpful assistant. Use the following repository context from its repo contents "
                     "to answer the user's question.")
    user_prompt = f"Context: {context}\nQuestion: {query}\nAnswer:"

    # Create a chat completion request using Groq.
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user",   "content": user_prompt}
        ],
        model="qwen-qwq-32b",  # Select a suitable model from Groq's offerings.
        # max_completion_tokens=max_tokens, use it as per need, remove it in case of reasoning model for better response
        temperature=0.5,    # Adjust temperature as needed.
        top_p=0.9,        # Adjust top_p as needed.
        stream=False
    )

    # Extract and return the answer (trimming whitespace).
    output = chat_completion.choices[0].message.content
    return output.strip()

def chat_with_github_repo(query, k=5):
    # Retrieve top-k document chunks (assumes retrieve_chunks is defined elsewhere).
    retrieved = retrieve_chunks(query, k=k)
    # Generate an answer using Groq's Chat API.
    answer = generate_answer(query, retrieved)
    return answer

# Example usage:
user_query = "can you explain the purpose of this whole repo? and what does it do."
answer = chat_with_github_repo(user_query, k=5)
reasoning_output= answer.split("</think>")[0]
print(reasoning_output)
answer_from_model=answer.split("</think>")[-1]
print("Answer to Query: ",answer_from_model)


<think>
Okay, let's try to figure out what the user is asking here. They want to know the purpose of the entire repo and what it does. The files mentioned are all under the path /content/community-ai/Data Scraping/DataExtraction.ipynb. The content snippets provided seem to be from a Confluence page or a JIRA-related document, given mentions of Atlassian, JIRA, and Confluence macros. 

Looking at the content, it's talking about a "Projects" space where they track major work on a product, specifically mentioning Mifos Forge and OpenMRS as examples. The space is meant for contributors, financial institutions, and new volunteers to collaborate, list feature needs, and find projects. The JIRA references suggest that actual tasks are tracked there as epics and user stories, while this space serves as a central hub linking to those tasks and related documents.

The user might be confused because the repo's name includes "community-ai" and "Data Scraping," but the files' content is about proje

In [32]:
user_query = "what does the android-client do in context of this repo?"
answer = chat_with_github_repo(user_query, k=5)
answer=answer.split("</think>")[-1]
print(answer)



The **Android Client** in the context of this repository refers to a **chatbot tool** designed to assist with queries related to the **Android-Client project** of the Mifos system. Here's a breakdown of its role and context:

### Key Details:
1. **Purpose**:
   - The Android Client component in this repository is a **Q&A chatbot** (built using Gradio) that allows users to ask questions about the **Kotlin code** of the Mifos Android app.
   - It enables developers or users to query the codebase of the Android app (e.g., "How does a specific feature work?") and receive answers based on the code's documentation, comments, or structure.

2. **Technical Implementation**:
   - The chatbot is defined in the file [`android-client_bot.ipynb`](/content/community-ai/Android-Client/android-client_bot.ipynb), where it uses a `gr.Interface` (Gradio) to create a web-based interface.
   - It processes questions via the `answer_question_with_parent_docs` function, which likely leverages vector embedd