In [3]:
import os
import tempfile
import git
from pathlib import Path, PosixPath
from collections import defaultdict

IGNORED_DIRS = {"node_modules", ".git", ".vscode", "_i18n", "dist"}
IGNORED_FILES = {".gitignore", "package-lock.json", ".env"}

def clone_repo(repo_url):
    temp_dir = tempfile.mkdtemp()
    git.Repo.clone_from(repo_url, temp_dir)
    return temp_dir

def classify_files(repo_path):
    repo_path = Path(repo_path)
    files_by_category = defaultdict(lambda: defaultdict(list))
    all_files = []

    for root, dirs, files in os.walk(repo_path):
        # Filter out ignored directories
        dirs[:] = [d for d in dirs if d not in IGNORED_DIRS]
        
        for file in files:
            # Filter out ignored files
            if file in IGNORED_FILES:
                continue
            
            path = Path(root) / file
            rel_path = path.relative_to(repo_path)
            all_files.append(rel_path)

            # Group by top-level folder (CAP convention)
            parts = rel_path.parts
            category = parts[0] if len(parts) > 1 else "root"
            files_by_category[category][path.suffix.lower()].append(rel_path)

    return dict(files_by_category)

In [4]:
temp_dir = clone_repo("https://github.tools.sap/Delivery-Scale-PT/opportunity-assistant-CAP")
files_by_type = classify_files(temp_dir)
files_by_type

{'root': defaultdict(list,
             {'': [PosixPath('core'), PosixPath('.eslintrc')],
              '.md': [PosixPath('README.md')],
              '.json': [PosixPath('xs-security.json'),
               PosixPath('package.json')],
              '.yaml': [PosixPath('mta.yaml')]}),
 'app': defaultdict(list,
             {'.cds': [PosixPath('app/services.cds'),
               PosixPath('app/opportunityassistantcockpit/annotations.cds')],
              '.yaml': [PosixPath('app/opportunityassistantcockpit/ui5-deploy.yaml'),
               PosixPath('app/opportunityassistantcockpit/ui5.yaml')],
              '.md': [PosixPath('app/opportunityassistantcockpit/README.md')],
              '.json': [PosixPath('app/opportunityassistantcockpit/package.json'),
               PosixPath('app/opportunityassistantcockpit/xs-app.json'),
               PosixPath('app/opportunityassistantcockpit/webapp/manifest.json'),
               PosixPath('app/opportunityassistantcockpit/webapp/model/sideContent.

In [None]:
files_by_type = { 'srv': defaultdict(list,
             {'.cds': [PosixPath('srv/api.cds')],
              '.js': [PosixPath('srv/server.js'),
               PosixPath('srv/AICoreUtilChatbot.js'),
               PosixPath('srv/CFUtil.js'),
               PosixPath('srv/bucketUtil.js'),
               PosixPath('srv/AICoreUtil.js'),
               PosixPath('srv/api.js'),
               PosixPath('srv/util/AxiosUtil.js'),
               PosixPath('srv/util/DataServiceUtil.js'),
               PosixPath('srv/util/sharepoint.js')]})}
files_by_type

In [None]:
from dataclasses import dataclass, field

@dataclass
class DocChunk:
    name: str             # logical name or file base name
    path: str             # relative path in the repo
    section: str          # "root", "srv", "db", etc.
    type: str             # "cds", "js", "xml", etc.
    content: str          # raw file content
    metadata: dict = field(default_factory=dict) # additional metadata

In [None]:
from pathlib import Path

def chunk_files(repo_path: Path, files_by_section: dict) -> list[DocChunk]:
    """
    Chunk files into a list of DocChunks objects with metadata.
    
    Args:
        repo_path (Path): The path to the repository.
        files_by_section (dict): A dictionary containing files grouped by section and then type.
        
    Returns:
        list: A list of DocChunks objects representing the chunked files.
    """
    chunks = []

    for section, files in files_by_section.items():
        for file_type, file_paths in files.items():
            for file_path in file_paths:
                full_path = repo_path / file_path
                try:
                    with open(full_path, 'r', encoding='utf-8') as f:
                        content = f.read()
  
                    chunk = DocChunk(
                        name=file_path.stem,
                        path=str(file_path),
                        section=section,
                        type=file_type,
                        content=content,
                        metadata={}
                    )
                    chunks.append(chunk)
                
                except Exception as e:
                    print(f"❌ Failed to read {file_path}: {e}")

    return chunks

In [None]:
chunks = chunk_files(Path(temp_dir), files_by_type)
chunks

In [None]:
def get_summary_prompt(chunk):
    name = chunk.name
    ext = chunk.type.lstrip(".")
    section = chunk.section
    path = chunk.path

    return f"""You are a documentation assistant working as part of a multi-layered AI system designed to automate the generation of technical documentation for SAP CAP (Cloud Application Programming Model) projects.

            **Context and Purpose**
            Your output will serve as an intermediate representation to help a second LLM construct comprehensive documentation for the entire CAP system. This includes API references, architecture diagrams, service guides, user manuals, and other artifacts. The final documentation must be understandable by both developers and non-technical stakeholders (e.g., project managers, clients, business analysts).

            **Your Task**
            Carefully analyze the code chunk below and produce a clear, structured description that will help the second LLM reason about the CAP system as a whole. Focus on:

            - Accurately referencing all identifiers (functions, entities, views, annotations, etc.)
            - Prioritizing CAP-specific concepts and logic (e.g., services, entities, event handlers, annotations, configurations)
            - Avoiding speculation: do **not** infer behavior or relationships that are not explicitly present in the file

            **Output Requirements**
            - Use plain, unformatted text
            - Use clear technical language
            - Describe; do not summarize
            - If the file appears incomplete, test-related, or unclear, state that honestly

            ---

            **File Details**
            - Name: {name}
            - Type: {ext}
            - Section: {section}
            - Path: {path}

            ---

            **File Content**
            {chunk.content[:4000]}

            ---
            """

In [None]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

from gen_ai_hub.proxy.native.openai import chat

def call_llm(prompt: str, model: str = "gpt-4o", temperature: float = 0) -> str:
    """
    Calls LLM with a given prompt through AI Core and returns the response.
    
    Args:
        prompt (str): The user prompt.
        model (str): LLM model name (e.g., "gpt-4o").
    
    Returns:
        str: The LLM-generated response.
    """
    
    try:
        response = chat.completions.create(
                        model=model,
                        temperature=temperature,
                        messages=[
                            {"role": "system", "content": "You are a documentation assistant."},
                            {"role": "user", "content": prompt}
                        ]
                    )
    
        return response.choices[0].message.content
    
    except Exception as e:
        print(f"❌ LLM call failed: {e}")
        return "[ERROR: failed to get LLM response]"

In [None]:
for chunk in chunks:
    prompt = get_summary_prompt(chunk)
    
    # Call the LLM with the prompt
    response = openai_llm(prompt)
    
    # Print the LLM response
    print(f"LLM Response for {chunk.name}:\n{response}\n")