# Setting Up

In [None]:
import os

from dotenv import load_dotenv

# Load API credentials from the local env file into os.environ.
# The diagnostics below help when the notebook is started from an unexpected
# working directory and the relative path does not resolve.
print(f"Current working directory: {os.getcwd()}")
print(f"Does api_keys.env exist? {os.path.exists('./api_keys.env')}")

loaded = load_dotenv(dotenv_path='./api_keys.env')  # Specify the path to your file
print(f"Was dotenv loaded successfully? {loaded}")


def _describe_secret(value):
    """Return a log-safe status for a credential without revealing its value."""
    return f"<set, {len(value)} chars>" if value else "<missing>"


GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Lowercase aliases are kept because this cell originally defined both spellings.
github_token = GITHUB_TOKEN
google_api_key = GOOGLE_API_KEY

# Never echo the secrets themselves: notebook output is saved with the file
# and is easily committed or shared by accident.
print(f"GitHub Token: {_describe_secret(GITHUB_TOKEN)}")
print(f"Google API Key: {_describe_secret(GOOGLE_API_KEY)}")

In [None]:
from langchain_google_genai import ChatGoogleGenerativeAI
# Smoke test: build a Gemini chat client with the key loaded above and print
# the text of a single reply to confirm the credentials work.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=GOOGLE_API_KEY,
)
joke_reply = llm.invoke("Tell me a new joke each time")
print(joke_reply.content)

# Getting Repo Structure

In [None]:
import os
import requests
from urllib.parse import urlparse
import json
from collections import defaultdict

# Directory names that process_tree lists but never descends into
# (dependency caches, build output, virtualenvs, VCS/editor metadata).
EXCLUDED_DIRS = {
    ".git",
    ".vscode",
    "__pycache__",
    "build",
    "dist",
    "env",
    "node_modules",
    "target",
    "vendor",
    "venv",
}

def parse_github_url(repo_url: str):
    """Extract ``(owner, repo)`` from a GitHub repository URL.

    Only the first two path segments are used, so URLs that point deeper
    into the repository (``.../owner/repo/tree/main``) are accepted, with or
    without a trailing slash. A ``.git`` suffix on the repository segment is
    removed.

    Args:
        repo_url: URL such as ``https://github.com/owner/repo`` or
            ``https://github.com/owner/repo.git/``.

    Returns:
        A ``(owner, repo)`` tuple of strings.

    Raises:
        ValueError: If the URL path does not contain both an owner and a
            repository segment.
    """
    segments = urlparse(repo_url).path.strip("/").split("/")
    if len(segments) < 2:
        raise ValueError(f"Cannot parse owner/repo from GitHub URL: {repo_url!r}")

    owner = segments[0]
    # removesuffix drops the literal ".git" only; rstrip(".git") would strip
    # any trailing run of the characters '.', 'g', 'i', 't' and mangle names
    # such as "widget" or "digit".
    repo = segments[1].removesuffix(".git")
    if not owner or not repo:
        raise ValueError(f"Cannot parse owner/repo from GitHub URL: {repo_url!r}")
    return owner, repo

def fetch_repo_tree(repo_url: str):
    """Fetch the full recursive git tree of a repository's default branch.

    Makes three authenticated GitHub REST calls: repository metadata (to
    learn the default branch), the branch ref (to get its commit SHA), and
    the recursive tree for that SHA.

    Args:
        repo_url: GitHub repository URL; parsed with ``parse_github_url``.

    Returns:
        The ``tree`` list from the API response (one dict per entry), or an
        empty list if the response carries no ``tree`` key.

    Raises:
        EnvironmentError: If ``GITHUB_TOKEN`` is not set.
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status from any of the three calls.
    """
    token = os.getenv("GITHUB_TOKEN")
    if not token:
        raise EnvironmentError("Set GITHUB_TOKEN")

    owner, repo = parse_github_url(repo_url)
    headers = {"Authorization": f"token {token}"}
    repo_api = f"https://api.github.com/repos/{owner}/{repo}"

    def get_json(url):
        # A timeout keeps the notebook from hanging on a stalled connection;
        # raise_for_status surfaces 401/403/404 as a clear HTTP error instead
        # of a confusing KeyError on the error payload further down.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        return response.json()

    # 1) get default branch
    repo_info = get_json(repo_api)
    default_branch = repo_info.get("default_branch", "main")

    # 2) get commit sha for that branch
    ref = get_json(f"{repo_api}/git/refs/heads/{default_branch}")
    sha = ref["object"]["sha"]

    # 3) fetch entire tree recursively
    payload = get_json(f"{repo_api}/git/trees/{sha}?recursive=1")
    if payload.get("truncated"):
        print("Warning: GitHub reported the tree listing as truncated; some files may be missing.")
    return payload.get("tree", [])

def process_tree(tree, excluded_dirs=None):
    """Turn a flat git tree listing into a DFS file list and an ASCII tree.

    Args:
        tree: Iterable of dicts with at least ``"type"`` and ``"path"`` keys
            (and optionally ``"size"``). Only entries whose type is
            ``"blob"`` are used; directories are derived from blob paths.
        excluded_dirs: Directory base names that are listed but not
            descended into. Defaults to the module-level ``EXCLUDED_DIRS``.

    Returns:
        A ``(dfs_list, tree_str)`` tuple:

        * ``dfs_list`` - list of ``{"path", "size"}`` dicts in depth-first
          order (files of a directory first, then its sub-directories). An
          excluded directory appears once as ``"<dir>/"`` with size 0.
        * ``tree_str`` - a ``tree``-style rendering (directories first, then
          files) where excluded directories are shown but not expanded.
    """
    if excluded_dirs is None:
        excluded_dirs = EXCLUDED_DIRS

    # Group files by their containing directory ("." is the repository root).
    files_by_dir = defaultdict(list)
    for item in tree:
        if item["type"] != "blob":
            continue
        dirpath, fname = os.path.split(item["path"])
        files_by_dir[dirpath or "."].append((fname, item.get("size", 0)))

    # Register every ancestor link, not just the immediate parent of
    # directories that hold files: otherwise a directory containing only
    # sub-directories is never attached to its own parent and its whole
    # subtree is unreachable from the root.
    subdirs = defaultdict(set)
    for dirpath in files_by_dir:
        child = dirpath
        while child != ".":
            parent = os.path.dirname(child) or "."
            if parent == child or child in subdirs[parent]:
                break  # reached a fixed point or an already-linked ancestor
            subdirs[parent].add(child)
            child = parent

    # ---- 1) build DFS JSON list ----
    dfs_list = []

    def dfs(dirpath):
        # files first
        for fname, size in sorted(files_by_dir.get(dirpath, [])):
            path = f"{dirpath}/{fname}" if dirpath != "." else fname
            dfs_list.append({"path": path, "size": size})
        # then children
        for child in sorted(subdirs.get(dirpath, [])):
            if os.path.basename(child) in excluded_dirs:
                dfs_list.append({"path": child + "/", "size": 0})
            else:
                dfs(child)

    dfs(".")

    # ---- 2) build `tree`-style string ----
    lines = []

    def recurse(dirpath, prefix=""):
        # gather entries: dirs first, then files
        entries = [
            ("dir", os.path.basename(child), child)
            for child in sorted(subdirs.get(dirpath, []))
        ]
        entries.extend(
            ("file", fname, None)
            for fname, _ in sorted(files_by_dir.get(dirpath, []))
        )

        last_index = len(entries) - 1
        for idx, (etype, name, childpath) in enumerate(entries):
            last = idx == last_index
            connector = "└── " if last else "├── "
            suffix = "/" if etype == "dir" else ""
            lines.append(prefix + connector + name + suffix)

            if etype == "dir" and name not in excluded_dirs:
                extension = "    " if last else "│   "
                recurse(childpath, prefix + extension)

    recurse(".")
    tree_str = "\n".join(lines)

    return dfs_list, tree_str

if __name__ == "__main__":
    # Target repository; the URL is deliberately written with a trailing slash.
    repo = "https://github.com/heytamjid/dormitory-network/"
    raw_tree = fetch_repo_tree(repo)
    units_json, units_tree = process_tree(raw_tree)

    # 1) DFS-ordered listing, pretty-printed as JSON. `units` and
    #    `units_tree` are read again by later cells.
    print("=== JSON units (DFS order) ===")
    units = json.dumps(units_json, indent=4)
    print(units)

    # 2) `tree`-style rendering of the same listing
    print("\n=== ASCII tree ===", units_tree, sep="\n")


# Setting Up LLMs

In [None]:
# --- Standard Libraries ---
import os
import json
import re # For more robust parsing if needed

# --- Environment Variable Management ---
# Best practice: Store your API key securely, e.g., in a .env file
# Install python-dotenv: pip install python-dotenv
from dotenv import load_dotenv

# --- Langchain Core Imports (v0.1.x+ style) ---
# Ensure you have installed:
# pip install langchain langchain-google-genai google-cloud-aiplatform python-dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage, SystemMessage

# --- Google Generative AI Integration ---
from langchain_google_genai import ChatGoogleGenerativeAI

# ==============================================================================
# Configuration
# ==============================================================================

# --- Load Environment Variables ---
# Called without a path here, whereas the first cell of this notebook loads
# './api_keys.env' explicitly. If that cell already ran, GOOGLE_API_KEY is
# already present in os.environ.
# NOTE(review): presumably this call looks for a file named `.env` — confirm
# against python-dotenv's documentation if you rely on it.
load_dotenv()

# Check if the API key is loaded. This only warns; execution continues.
if not os.getenv("GOOGLE_API_KEY"):
    print("Warning: GOOGLE_API_KEY not found in environment variables.")
    # You might want to exit or raise an error here in a real application
    # For notebook use, it might still work if google-cloud-aiplatform is configured

# --- LLM Configuration ---
# Choose the Gemini model you want to use. Check Google AI documentation for available models.
# Examples: "gemini-1.5-flash", "gemini-1.0-pro", "gemini-pro"
LLM_MODEL_NAME = "gemini-1.5-flash"
LLM_TEMPERATURE_IDENTIFY = 0.1 # Lower temp for more deterministic file selection
LLM_TEMPERATURE_GENERATE = 0.4 # Higher temp for more creative documentation generation

# --- File Paths ---
# NOTE(review): REPO_FILE_LIST_JSON is not referenced anywhere else in the
# visible notebook; the file list comes from `units` instead.
REPO_FILE_LIST_JSON = 'repo_files.json' # Path to a JSON file with file list and sizes
# Despite its name, REPO_LOCAL_PATH is NOT a local checkout path: it is set to
# `repo`, the GitHub URL assigned in the repo-structure cell, and is passed to
# get_file_content, which parses owner/repo out of that URL. The
# repo-structure cell must therefore run before this one.
REPO_LOCAL_PATH = repo # GitHub repository URL (see note above)

# --- Documentation Categories ---
# These names are used as dictionary keys for both the identified-file lists
# and the per-category generation prompts.
CATEGORIES = [
    "Purpose & Scope",
    "System Architecture Overview",
    "Core Components (Implementation Details)",
    "Data Model"
]

# --- Context Size Limits (to avoid exceeding LLM token limits) ---
# Adjust these based on the model and your needs
# NOTE: the "Getting Files from Github" cell reassigns
# MAX_FILE_SIZE_CHARS_FOR_CONTEXT to 100_000, so the value below only applies
# until that cell runs.
MAX_FILE_SIZE_CHARS_FOR_CONTEXT = 10000 # Max characters to read from a single file
MAX_TOTAL_CONTEXT_CHARS = 30000         # Max total characters for context per category


In [None]:
# Low-temperature Gemini client used for the file-identification step.
# No API key is passed explicitly here, so the client has to find its
# credentials on its own (presumably via GOOGLE_API_KEY in the environment).
try:
    llm_identify = ChatGoogleGenerativeAI(
        model=LLM_MODEL_NAME,
        temperature=LLM_TEMPERATURE_IDENTIFY,
        convert_system_message_to_human=True # Often helpful for Gemini models
    )
except Exception as e:
    print(f"Error initializing Gemini LLM for identification: {e}")
    # Re-raise instead of calling exit(): in a notebook exit() targets the
    # kernel and hides the traceback, whereas a raised error just stops this
    # cell (and "Run All") and shows where the failure came from.
    raise

In [None]:
# Reuse `units` (the pretty-printed JSON file listing built in the
# repo-structure cell) as the file-list text for the identification prompt,
# and echo it so the prompt input can be inspected.
file_list_prompt_str = units
print(file_list_prompt_str)

# Initial Prompt to LLM to Mark Important Files


In [None]:
# Prompt for the file-identification step. It has two placeholders,
# {file_list_tree} and {file_list}, filled in when the chain is invoked.
# The four bold "**<Category>:**" headings in the requested answer format are
# the same strings the response parser below looks for, so keep them in sync
# with `category_header_map` when editing this text.
prompt_template_files = """
You are an expert software architect analyzing a GitHub repository's file structure.
Your goal is to identify the *most important and relevant* files or folders from a codebase for understanding each of the following categories.

Repository Tree of the codebase:
{file_list_tree}

Repository File Structure in DFS Manner (Path and Size) of the codebase in JSON format:
{file_list}

Based on the file paths and your general knowlege on codebase structure and sizes provided (i.e. you know that views.py, consumers.py etc contain the core business logic of a Django project), list the most relevant file paths for each category below.
Prioritize files that likely contain defining information (e.g., main entry points or core modules for components, model definitions for data model).

Categories:
1.  **Purpose & Scope**: (README, high-level documentation, main application files)
2.  **System Architecture Overview**: (Configuration files, main application/server files, core module directories, docker-compose.yml, infrastructure-as-code files, documentation about architecture, routing files )
3.  **Core Components (Implementation Details)**: (Source code directories like 'src/', 'lib/', 'app/', key modules, files with significant size suggesting core logic)
4.  **Data Model**: (Files named 'models.py', 'schemas.py', database interaction layers, ORM definitions, files in 'db/' or 'database/' folders)

Provide your answer *strictly* in the following format, listing the file paths under each category heading. Do not add any explanation before or after the list.

**Purpose & Scope:**
- path/to/relevant/file1.ext
- path/to/relevant/folder/

**System Architecture Overview:**
- path/to/relevant/file2.ext
- another/relevant/path/

**Core Components (Implementation Details):**
- path/to/core/logic.py
- src/module/

**Data Model:**
- path/to/models.py
- database/schema/
"""

# Build the identification pipeline: prompt template -> Gemini -> plain string.
file_identification_prompt = ChatPromptTemplate.from_template(prompt_template_files)
chain_identify = file_identification_prompt | llm_identify | StrOutputParser()

# Values for the two placeholders in the prompt template.
identify_inputs = {
    "file_list": file_list_prompt_str,
    "file_list_tree": units_tree,
}

print("Asking Gemini to identify important files...")
try:
    llm_response_files = chain_identify.invoke(identify_inputs)
    print(
        "\n--- Gemini Response (Important Files Raw) ---",
        llm_response_files,
        "---------------------------------------------\n",
        sep="\n",
    )
except Exception as e:
    print(f"Error invoking LLM for file identification: {e}")
    # Fall back to an empty response so the parsing step still has a value
    # to work with.
    llm_response_files = ""

# --- Parse the LLM Response ---
import re  # imported here so this cell does not depend on another cell's import

# Exact headers requested in the prompt, mapped to the category keys.
# These are tried first; a more lenient match is used as a fallback.
category_header_map = {
    "**Purpose & Scope:**": "Purpose & Scope",
    "**System Architecture Overview:**": "System Architecture Overview",
    "**Core Components (Implementation Details):**": "Core Components (Implementation Details)",
    "**Data Model:**": "Data Model"
}


def _match_category(line, header_map, categories):
    """Return the category a header line names, or None if it is not a header."""
    for header, key in header_map.items():
        if line.startswith(header):
            return key
    # Lenient fallback: drop leading markdown/numbering ("## ", "1. ", "**")
    # and trailing "**" / ":" so variants such as "## Data Model" or
    # "**Data Model**:" are still recognised.
    bare = re.sub(r"^[\s#*\d.)]+", "", line)
    bare = re.sub(r"[\s*:]+$", "", bare).casefold()
    for category in categories:
        if bare == category.casefold():
            return category
    return None


def _extract_bullet_path(line):
    """Return the path named by a bullet line, or None if it is not a bullet."""
    # "-" may be followed directly by the path (as before); "*" and "+" need
    # a following space so "**bold**" text is not mistaken for a bullet.
    match = re.match(r"^(?:-\s*|[*+]\s+)(.+)$", line)
    if not match:
        return None
    # Models often wrap paths in backticks or quotes; left in place they
    # become part of the path and every later fetch fails with a 404.
    path = match.group(1).strip().strip("`'\"").strip()
    if not path.strip("-"):
        return None  # empty, or a horizontal rule such as "---"
    return path


def _parse_identified_files(response_text, categories, header_map):
    """Group the bullet paths in an LLM response under their category."""
    result = {category: [] for category in categories}
    current_key = None
    for raw_line in response_text.strip().split('\n'):
        line = raw_line.strip()
        if not line:  # Skip empty lines
            continue

        matched = _match_category(line, header_map, categories)
        if matched is not None:
            current_key = matched
            continue

        if current_key is None:
            continue  # text before the first header is ignored
        path = _extract_bullet_path(line)
        bucket = result.setdefault(current_key, [])
        if path and path not in bucket:  # skip repeats to avoid double fetches
            bucket.append(path)
    return result


if llm_response_files:
    identified_files_by_category = _parse_identified_files(
        llm_response_files, CATEGORIES, category_header_map
    )
else:
    identified_files_by_category = {category: [] for category in CATEGORIES}
    print("Warning: LLM response for file identification was empty. Cannot proceed with documentation generation.")
    # Optionally, define default files per category here as a fallback

print("--- Parsed Important Files ---")
for category, files in identified_files_by_category.items():
    print(f"Category: {category}")
    if files:
        for f in files:
            print(f"  - {f}")
    else:
        print("  (No specific files identified for this category)")
print("----------------------------\n")


# Getting Files from Github

In [None]:
import os
import re
import requests
import base64

# Cap how many characters we pull per file.
# NOTE: this reassigns the constant that the configuration cell set to 10000.
# At 100_000 a single file can exceed MAX_TOTAL_CONTEXT_CHARS (30000), so the
# per-category budget in the generation loop is the limit that actually binds.
MAX_FILE_SIZE_CHARS_FOR_CONTEXT = 100_000

def get_file_content(repo_base_path: str, relative_file_path: str) -> str | None:
    """
    Fetches a file’s contents from GitHub via the REST API.

    Args:
        repo_base_path (str): GitHub repo URL, e.g.
            "https://github.com/owner/repo/" or "https://github.com/owner/repo"
        relative_file_path (str): Path within the repo, e.g. "README.md" or "src/app.py"

    Returns:
        str | None: The file’s UTF-8 text (possibly truncated), or None if not found/error.
    """
    # grab your token
    token = os.getenv("GITHUB_TOKEN")
    if not token:
        print("Error: GITHUB_TOKEN environment variable is not set.")
        return None

    # normalize the URL and extract owner/repo
    repo_url = repo_base_path.rstrip("/")  # drop trailing slash if any
    m = re.match(r".*github\.com[:/](?P<owner>[^/]+)/(?P<repo>[^/]+)$", repo_url)
    if not m:
        print(f"Warning: couldn’t parse owner/repo from '{repo_base_path}'")
        return None
    owner, repo = m.group("owner"), m.group("repo")

    # build the API endpoint
    api_url = f"https://api.github.com/repos/{owner}/{repo}/contents/{relative_file_path.lstrip('/')}"
    headers = {
        "Authorization": f"token {token}",
        "Accept": "application/vnd.github.v3+json"
    }

    try:
        resp = requests.get(api_url, headers=headers, timeout=10)
        if resp.status_code == 404:
            print(f"Warning: File not found on GitHub: {relative_file_path}")
            return None
        resp.raise_for_status()
        data = resp.json()

        # make sure it really is a file
        if data.get("type") != "file" or "content" not in data:
            print(f"Skipping non-file entry: {relative_file_path}")
            return None

        # decode Base64 payload
        raw_bytes = base64.b64decode(data["content"])
        text = raw_bytes.decode("utf-8", errors="ignore")

        # truncate if too big
        if len(text) > MAX_FILE_SIZE_CHARS_FOR_CONTEXT:
            print(f"Note: Content truncated at {MAX_FILE_SIZE_CHARS_FOR_CONTEXT} chars.")
            return text[:MAX_FILE_SIZE_CHARS_FOR_CONTEXT]

        return text

    except requests.RequestException as e:
        print(f"Warning: GitHub API error fetching {relative_file_path}: {e}")
        return None
    except (ValueError, KeyError) as e:
        print(f"Warning: Unexpected API response for {relative_file_path}: {e}")
        return None


# Generating Documentation

In [None]:
# Step 2: Fetch Content and Generate Documentation using LLM
# ==============================================================================

print("\n--- Step 2: Generating Documentation from File Content ---")

# Maps each category name to its generated text (or to a short explanation
# of why nothing was generated).
generated_documentation = {}

# --- Initialize LLM for Content Generation ---
try:
    llm_generate = ChatGoogleGenerativeAI(
        model=LLM_MODEL_NAME,
        temperature=LLM_TEMPERATURE_GENERATE,
        convert_system_message_to_human=True # Often helpful for Gemini models
    )
except Exception as e:
    print(f"Error initializing Gemini LLM for generation: {e}")
    # Cannot proceed without the LLM. Re-raise instead of calling exit(): in
    # a notebook exit() targets the kernel and hides the traceback, whereas a
    # raised error just stops this cell and shows the cause.
    raise

# --- Define Prompt Templates for Each Category ---

# Shared system message: sets the technical-writer role and restricts the
# model to the supplied context. Each category below reuses it, three of them
# with an extra focus sentence appended.
system_prompt_content_generation = """
You are an expert technical writer specializing in software documentation.
Your task is to generate clear, concise, and accurate documentation for a specific aspect of a GitHub repository.
You will be given relevant code snippets and file paths as context. Use *only* this provided context to formulate your response.
- Explain the requested aspect accurately based *only* on the provided snippets.
- If the context is insufficient to answer fully, state that clearly. Do not invent information.
- If asked for diagrams (like Mermaid), generate the code block correctly formatted (e.g., ```mermaid ... ```). Ensure the diagram reflects the relationships described in the text, based *only* on the provided context.
- Structure your response clearly, using Markdown formatting (headings, lists, code blocks) where appropriate.
"""

# One chat prompt per category. The keys are the same strings as the entries
# of CATEGORIES (the generation loop looks templates up by category name),
# and every human message has a single {context} placeholder.
prompt_templates_generate = {
    "Purpose & Scope": ChatPromptTemplate.from_messages([
        ("system", system_prompt_content_generation),
        ("human", """
        Based *only* on the following context (code snippets and file paths):
        Context:
        {context}

        ---
        Task: Provide a high-level summary of this repository's main **Purpose and Scope**.
        - What problem does it seem to solve?
        - Who might the intended users be?
        - What are its key features or functionalities suggested by the provided context?
        """)
    ]),

    "System Architecture Overview": ChatPromptTemplate.from_messages([
        ("system", system_prompt_content_generation + "\nFocus on high-level components and their interactions as suggested by the context. Use Mermaid syntax for diagrams if the context supports it."),
        ("human", """
        Based *only* on the following context (code snippets and file paths):
        Context:
        {context}

        ---
        Task: Describe the overall **System Architecture**.
        - Identify the main components suggested by the context (e.g., API, database, workers, UI, specific modules).
        - Briefly explain their likely responsibilities based on the snippets.
        - Describe how they might interact, according to the context.
        - If the context provides enough information about component relationships, generate a Mermaid diagram (e.g., using `graph TD`, `sequenceDiagram`, or `componentDiagram` within ```mermaid ... ``` tags) to visualize this. If not, state that a diagram cannot be generated from the provided context.
        """)
    ]),

    "Core Components (Implementation Details)": ChatPromptTemplate.from_messages([
        ("system", system_prompt_content_generation + "\nFocus on explaining the 'what', 'how', and 'where' of key functionalities based *only* on the provided snippets."),
        ("human", """
        Based *only* on the following context (code snippets and file paths):
        Context:
        {context}

        ---
        Task: Identify and describe the **Core Components or Modules** suggested by the context.
        For each key component or functionality hinted at in the snippets:
        1.  **What it does:** Its apparent primary function or responsibility.
        2.  **How it works:** Key logic, algorithms, or patterns visible in the snippets (briefly).
        3.  **Where it's implemented:** The specific file(s) or function(s) from the context responsible for this logic.
        Structure your answer clearly, perhaps using headings for each identified component. If the context is insufficient for a component, state that.
        """)
    ]),

    "Data Model": ChatPromptTemplate.from_messages([
        ("system", system_prompt_content_generation + "\nFocus on data structures, storage, and relationships visible in the context. Use Mermaid syntax for diagrams if appropriate and supported by the context."),
        ("human", """
        Based *only* on the following context (code snippets and file paths), paying close attention to data definitions and interactions:
        Context:
        {context}

        ---
        Task: Describe the main **Data Model** apparent in the application.
        Look for and describe (if present in the context):
        - Database schemas (tables, columns, relationships suggested by model definitions or SQL).
        - Key data structures or objects (e.g., classes, interfaces, dictionaries) used for data representation.
        - Data validation rules mentioned.
        - Interaction with data storage (e.g., ORM usage, database connection patterns).
        - If the context clearly defines primary data entities and their relationships, represent them using Mermaid syntax (e.g., ```mermaid erDiagram ... ```). If not, state that a diagram cannot be generated from the provided context.
        """)
    ])
}


# --- Generate Documentation for Each Category ---
# For every category: fetch the identified files, pack as much of their
# content as fits into MAX_TOTAL_CONTEXT_CHARS, and ask the LLM to write that
# category's section. Results (or a failure note) go into
# generated_documentation, keyed by category name.
output_parser = StrOutputParser()

for category, files in identified_files_by_category.items():
    print(f"\n--- Generating documentation for: {category} ---")
    if not files:
        print("No relevant files were identified for this category by the previous step. Skipping generation.")
        generated_documentation[category] = "No files identified as relevant by the initial analysis."
        continue

    # --- Build Context from File Contents ---
    context_parts = []
    current_total_chars = 0
    files_included_in_context = []

    print(f"Attempting to fetch content for: {', '.join(files)}")
    for file_path in files:
        if current_total_chars >= MAX_TOTAL_CONTEXT_CHARS:
            print(f"Note: Reached max total context size ({MAX_TOTAL_CONTEXT_CHARS} chars). Skipping remaining files for {category}.")
            break

        # REPO_LOCAL_PATH holds the GitHub repository URL; the helper prints
        # its own message when a file is missing or cannot be fetched.
        content = get_file_content(REPO_LOCAL_PATH, file_path)
        if not content:
            continue

        snippet = (
            f"\n\n--- Start of content from: {file_path} ---\n"
            + content
            + f"\n--- End of content from: {file_path} ---\n"
        )
        if current_total_chars + len(snippet) <= MAX_TOTAL_CONTEXT_CHARS:
            context_parts.append(snippet)
            current_total_chars += len(snippet)
            files_included_in_context.append(file_path)
            continue

        # The full snippet does not fit: add a truncated version if any room
        # is left. The marker lines are measured exactly (their length
        # depends on the path) so the total never exceeds the budget.
        header = f"\n\n--- Start of truncated content from: {file_path} ---\n"
        footer = f"\n--- End of truncated content from: {file_path} ---\n"
        room_for_content = (
            MAX_TOTAL_CONTEXT_CHARS - current_total_chars - len(header) - len(footer)
        )
        if room_for_content > 0:
            snippet = header + content[:room_for_content] + footer
            context_parts.append(snippet)
            current_total_chars += len(snippet)
            files_included_in_context.append(f"{file_path} (truncated)")
            print(f"Note: Added truncated content from {file_path} to fit context limit.")
        # Stop adding more files once the limit is hit, even with truncation
        print(f"Note: Reached max total context size ({MAX_TOTAL_CONTEXT_CHARS} chars). Skipping remaining files for {category}.")
        break

    context_str = "".join(context_parts)
    if not context_str:
        print(f"Could not retrieve any content for the identified files for {category}. Skipping generation.")
        generated_documentation[category] = "Failed to retrieve content for the files identified as relevant."
        continue

    print(f"Built context for {category} using files: {', '.join(files_included_in_context)} ({current_total_chars} chars)")

    # --- Get Prompt and Create Chain ---
    prompt_template = prompt_templates_generate.get(category)
    if not prompt_template:
        print(f"Warning: No prompt template defined for category '{category}'. Skipping.")
        generated_documentation[category] = "Internal Error: No prompt template for category."
        continue

    chain_generate = prompt_template | llm_generate | output_parser

    # --- Invoke LLM for Documentation Generation ---
    try:
        print(f"Invoking LLM to generate documentation for {category}...")
        category_documentation = chain_generate.invoke({"context": context_str})
        generated_documentation[category] = category_documentation
        print(f"--- Successfully generated documentation for: {category} ---")
    except Exception as e:
        # Best effort: record the failure for this category and keep going so
        # the remaining categories are still generated.
        print(f"Error generating documentation for {category}: {e}")
        generated_documentation[category] = f"Error during generation: {e}"

# ==============================================================================
# Final Output
# ==============================================================================

# Banner, then one section per category with a dashed rule after each.
print(
    "\n\n" + "=" * 60,
    "      FINAL GENERATED REPOSITORY DOCUMENTATION",
    "=" * 60 + "\n",
    sep="\n",
)

for category, doc in generated_documentation.items():
    print(
        f"\n## {category}\n",
        doc or "(No documentation generated for this category)",
        "\n" + "-" * 40 + "\n",
        sep="\n",
    )

# --- Optional: Save to files ---
# output_dir = "generated_docs"
# os.makedirs(output_dir, exist_ok=True)
# for category, doc in generated_documentation.items():
#     filename = category.lower().replace(' ', '_').replace('&', 'and').replace('(', '').replace(')', '') + ".md"
#     filepath = os.path.join(output_dir, filename)
#     try:
#         with open(filepath, 'w', encoding='utf-8') as f:
#             f.write(f"# {category}\n\n{doc}")
#         print(f"Saved documentation for {category} to {filepath}")
#     except Exception as e:
#         print(f"Error saving documentation for {category} to {filepath}: {e}")

print("\n--- Script Finished ---")


In [None]:
# Re-print the generated documentation without re-running the generation
# cell (same output as the "Final Output" section there).
print(
    "\n\n" + "=" * 60,
    "      FINAL GENERATED REPOSITORY DOCUMENTATION",
    "=" * 60 + "\n",
    sep="\n",
)

for category, doc in generated_documentation.items():
    print(
        f"\n## {category}\n",
        doc or "(No documentation generated for this category)",
        "\n" + "-" * 40 + "\n",
        sep="\n",
    )