In [7]:
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

from gen_ai_hub.proxy.native.openai import chat

In [None]:
import os
import tempfile
import git

IRRELEVANT_FILES = [
    "webapp/",
    "app/",
    "node_modules/",
    ".gitignore",
    "package-lock.json",
    "i18n/",
    ".vscode",
    ".env",
    "eslint",
    ".eslintrc",
    ".git/",
    "__pycache__/",
]
    
def collect_all_files(repo_url, irrelevant_files=IRRELEVANT_FILES):
    
    project_path = tempfile.mkdtemp()
    git.Repo.clone_from(repo_url, project_path)
    print(f"Cloned repository to {project_path}")
    
    relevant_files = []
    
    for root, dirs, files in os.walk(project_path):
        for file in files:
                abs_path = os.path.join(root, file)
                rel_path = os.path.relpath(abs_path, project_path)
            
                relevant_files.append({
                    "name": file,
                    "full_path": abs_path,
                    "rel_path": rel_path,
                    "format": file.split(".")[-1] if "." in file else None,
                    "is_relevant": not any(exclude in rel_path for exclude in irrelevant_files),    
                })

                print(f"Collected file: {file} from {rel_path} with relevance {not any(exclude in rel_path for exclude in irrelevant_files)}")

    return relevant_files



In [21]:
repo_files = collect_all_files("https://github.com/josegouvea24/CAP-Documentator.git")
repo_files

Cloned repository to /var/folders/9d/1rdr_39n49n113vgdr6vpndc0000gn/T/tmps033424v
Collected file: README.md from README.md with relevance True
Collected file: .gitignore from .gitignore with relevance False
Collected file: package-lock.json from package-lock.json with relevance False
Collected file: package.json from package.json with relevance True
Collected file: .env from .env with relevance False
Collected file: eslint.config.mjs from eslint.config.mjs with relevance False
Collected file: ui5-deploy.yaml from app/fiori-ui/ui5-deploy.yaml with relevance False
Collected file: .eslintrc from app/fiori-ui/.eslintrc with relevance False
Collected file: ui5.yaml from app/fiori-ui/ui5.yaml with relevance False
Collected file: README.md from app/fiori-ui/README.md with relevance False
Collected file: xs-security.json from app/fiori-ui/xs-security.json with relevance False
Collected file: .gitignore from app/fiori-ui/.gitignore with relevance False
Collected file: package-lock.json from app

[{'name': 'README.md',
  'full_path': '/var/folders/9d/1rdr_39n49n113vgdr6vpndc0000gn/T/tmps033424v/README.md',
  'rel_path': 'README.md',
  'format': 'md',
  'is_relevant': True},
 {'name': '.gitignore',
  'full_path': '/var/folders/9d/1rdr_39n49n113vgdr6vpndc0000gn/T/tmps033424v/.gitignore',
  'rel_path': '.gitignore',
  'format': 'gitignore',
  'is_relevant': False},
 {'name': 'package-lock.json',
  'full_path': '/var/folders/9d/1rdr_39n49n113vgdr6vpndc0000gn/T/tmps033424v/package-lock.json',
  'rel_path': 'package-lock.json',
  'format': 'json',
  'is_relevant': False},
 {'name': 'package.json',
  'full_path': '/var/folders/9d/1rdr_39n49n113vgdr6vpndc0000gn/T/tmps033424v/package.json',
  'rel_path': 'package.json',
  'format': 'json',
  'is_relevant': True},
 {'name': '.env',
  'full_path': '/var/folders/9d/1rdr_39n49n113vgdr6vpndc0000gn/T/tmps033424v/.env',
  'rel_path': '.env',
  'format': 'env',
  'is_relevant': False},
 {'name': 'eslint.config.mjs',
  'full_path': '/var/folders

In [22]:
def generate_cds_documentation(relevant_file_content, llm_model = "gpt-4o"):
    
    system_prompt = system_prompt = """
        You are a technical documentation assistant specialized in SAP CAP (Cloud Application Programming Model). 
        You will be provided with the full content of a CAP project, including all relevant .cds files, service implementations (.js), and metadata files.

        Your task is to generate comprehensive, structured technical documentation in **Markdown** format. The documentation must include the following clearly separated sections:

        1. **Project File Structure**  
        - Show a complete hierarchical folder and file structure of the CAP project.

        2. **CAP Application Files Overview**  
        - Present a two-column table:  
            | File Name | Description |
        - Describe the role and relevance of each application file.

        3. **Data Model Representation**  
        - Render an entity data model Markdown diagram showing:  
            - All entities, their attributes (name, type)
            - Keys
            - Associations and compositions with their cardinality and navigation

        4. **Tables, Views and Types**  
        - Present a table with 4 columns:  
            | Name | Type (Table/View/Type) | Fields (name, type, default, annotations, etc.) | Annotations | Description |

        5. **CDS Service Entity Definitions**  
        - For each CDS definition, include:
            - Name
            - CRUD operations supported (Create/Read/Update/Delete)
            - Fields with types and annotations
            - Description
            - Annotations (access control, semantics, etc.)

        6. **Function and Action Imports**  
        - List each function or action with:
            - Name and description
            - Supported operations
            - Associated entities (if any)

        7. **Event Handlers**  
        - Provide a table or list with:
            - Handler type (on/before/after)
            - Event type (create/update/delete/etc.)
            - Associated entity/function/action
            - Description of handler
            - Key logic or implementation notes
            - Helper functions used
            
        8. **Server Helper Functions**
        - List all helper functions found in "srv/" folder files with:
            - Name
            - Description
            - Parameters
            - Return type
            - Implementation description
        
        
        **Formatting & Completeness Instructions**:
        - Format the full output using Markdown headers, bullet points, and tables.
        - Include all sections, even if empty — in that case, label them with `[UNKNOWN]`.
        - Do not omit any requested section.
        - Be explicit. If a detail cannot be found or deduced, mark it as `[UNKNOWN]`.
        - Include no raw code unless required as an example under a relevant section.
        - Aim for maximum completeness, clarity, and usefulness for developers, especially in implementation descriptions.

        The documentation is intended to be rendered in tools like Microsoft Word or Markdown viewers.

        """

    user_prompt = f"""
                    Here are the CAP project files and their contents:

                    {relevant_file_content}
                    """

    
    response = chat.completions.create(
                    model=llm_model,
                    temperature=0,
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_prompt}
                    ]
                )
    
    return response.choices[0].message.content

In [23]:
import os

def load_file_content(file_list, only_relevant=True):
    content = ""

    for file in file_list: 
        header = (
                    f"\n\n===== FILE: {file['name']} =====\n"
                    f"Path   : {file['rel_path']}\n"
                    f"Format : {file['format']}\n"
                )
        
        if only_relevant and not file["is_relevant"]:
            file_content = "THIS FILE'S CONTENT IS IRRELEVANT FOR CAP PROJECT DOCUMENTATION"
        else:
            try:
                with open(file["full_path"], "r", encoding="utf-8") as f:
                    file_content = f.read()
            except Exception as e:
                file_content = f"⚠️ Skipping file {file.get('rel_path', 'unknown')} due to error: {e}"

        content += f"{header}\n{file_content}\n"
    
    return content

In [24]:
generate_cds_documentation(
    relevant_file_content = load_file_content(
        repo_files, only_relevant=False
    ),
    llm_model = "gpt-4o"
)

BadRequestError: Error code: 400 - {'error': {'code': 'context_length_exceeded', 'message': "This model's maximum context length is 128000 tokens. However, your messages resulted in 326535 tokens. Please reduce the length of the messages.", 'param': 'messages', 'type': 'invalid_request_error'}}