In [25]:
import os
import shutil
import asyncio
import aiohttp
import aiofiles
import time
from pathlib import Path
from typing import Set, List, Optional, Dict, Any, Tuple
import json

PROCESSED_FILES_RECORD = "processed_files.txt"

sources = {
    "OpenRouter": {
        "base_url": "https://openrouter.ai/api/v1",
        "model": "qwen/qwen-2.5-coder-32b-instruct:free",
        "api_key": "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"
    },
    "Groq": {
        "base_url": "https://api.groq.com/openai/v1/",
        "model": "qwen-2.5-coder-32b",
        "api_key": "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"
    },
    "Groq2": {
        "base_url": "https://api.groq.com/openai/v1/",
        "model": "qwen-2.5-coder-32b",
        "api_key": "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"
    },
    "X.ai": {
        "base_url": "https://api.x.ai/v1",
        "model": "grok-2-latest",
        "api_key": "xai-Lggu94vl22xIj5ThXFVu9nxbXjqYOqbTtLlxnibCypTkA5F8N4SvT0SDAYKIK8EkQCuVHv3tbfOPGiwt"
    },
    "Ollama": {
        "base_url": "http://localhost:11434",
        "model": "qwen2.5-coder",
        "api_key": None
    }
}

# Set the selected source
source = "Groq"

# Base configuration
BASE_CONFIG = {
    "source_directories": [r"C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\admin"],
    "skip_directories": [
        r"C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\apis",
        r"C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\i18n",
    ],
    "selected_pass": "combined",
    "requests_per_minute": 5,
    "concurrent_requests": 3,
    "max_context_tokens": 6000,
    "skip_long_files": True,  # New option to skip files exceeding max_context_tokens
    "temperature": 0.6,
    "top_p": 0.95,
    "file_extensions": [".py", ".ts", ".svelte"],
    "retry_sleep_seconds": 30,
    "prompt_templates": {
        "optimize": (
            "Optimize the {file_type} code while preserving core functionality:\n"
            "- Remove all i18n references (e.g., t(), $t, i18n.*) unless tied to core logic\n"
            "- Eliminate unused variables, imports, and dead code after verifying no external dependencies\n"
            "- Refactor repeated logic into reusable functions or components with descriptive names (e.g., calculateTotal, not calc)\n"
            "- Simplify code by:\n"
            "  - Using early returns to reduce nesting\n"
            "  - Replacing complex conditionals with clearer alternatives where possible\n"
            "- Improve performance by minimizing loops and redundant operations\n"
            "- Use {file_type}-specific best practices (e.g., async/await in .ts, reactive statements in .svelte)\n"
            "- Naming: variables/functions in camelCase, classes/components in PascalCase\n"
            "- Keep lines between 80-100 characters; break logically if longer\n"
            "- No empty lines between imports; one empty line before code\n"
            "- Preserve core logic, including onMount(() => {{) in .svelte files\n"
            "- Define core functionality as: primary operations the script performs, excluding optional features\n"
            "- Be decisive: apply the most effective optimization without alternatives\n"
            "Input code:\n{content}\n\n"
            "Return only the modified code without explanations or additional text."
        ),
        "comment_cleanup": (
            "Process the {file_type} code while preserving all functionality:\n"
            "- Remove all existing comments (e.g., //, #, /* */)\n"
            "- Add precise documentation as follows:\n"
            "  - For .py files: add a triple-quoted docstring ('''') to every function and top-level script block, including:\n"
            "    - Purpose: one sentence describing what it does\n"
            "    - Parameters: list each parameter with type and purpose\n"
            "    - Returns: describe the return value and type\n"
            "  - For .ts/.svelte files: add JSDoc comments (/** */) above every function and component, including:\n"
            "    - Purpose: one sentence describing what it does\n"
            "    - @param {{type}} name - purpose of each parameter\n"
            "    - @returns {{type}} - description of return value\n"
            "  - For HTML/Svelte markup: add <!-- Section: purpose --> comments to separate and describe major structural blocks (e.g., header, main content)\n"
            "- Keep comments concise: max 2 lines unless complex logic requires more\n"
            "- For complex logic, add brief inline comments (e.g., // Calculate total score)\n"
            "- No empty lines between imports; one empty line before code begins\n"
            "- Preserve core logic, including onMount(() => {{) in .svelte files\n"
            "Input code:\n{content}\n\n"
            "Return only the modified code without explanations or additional text."
        ),
        "combined": (
            "Combine the following two tasks for the {file_type} code while preserving core functionality:\n"
            "1. Optimize the code by removing all i18n references, unused variables, imports, dead code, and by refactoring repeated logic. "
            "Apply general optimizations such as eliminating unnecessary if statements when only one option exists, "
            "simplifying conditionals, and reducing redundant operations. "
            "Use descriptive naming, leverage {file_type}-specific best practices, and maintain clear control flow. "
            "Remove code relating to: ldap, updating version, oauth, haptic, mobile, channels, google drive, one drive.\n"
            "2. Clean up comments by removing all existing comments and adding clear, concise, and informative documentation. "
            "Add comments to every function: for .py files, use docstrings with purpose, parameters, and returns; "
            "for .ts/.svelte files, use JSDoc comments (purpose, @param, @returns). "
            "For HTML/markup, add structured comments to describe the layout and purpose of every section and function.\n\n"
            "Input code:\n{content}\n\n"
            "Return only the modified code without any explanations or additional text."
        )
    }
}

# Set up CONFIGS based on source
if source == "Groq":
    CONFIGS = [
        {**BASE_CONFIG, "base_url": sources["Groq"]["base_url"], "api_key": sources["Groq"]["api_key"], "model": sources["Groq"]["model"]},
        {**BASE_CONFIG, "base_url": sources["Groq2"]["base_url"], "api_key": sources["Groq2"]["api_key"], "model": sources["Groq2"]["model"]}
    ]
else:
    CONFIGS = [{**BASE_CONFIG, "base_url": sources[source]["base_url"], "api_key": sources[source]["api_key"], "model": sources[source]["model"]}]


def load_processed_files() -> Set[str]:
    """Load the list of already processed files."""
    if not os.path.exists(PROCESSED_FILES_RECORD):
        return set()
    with open(PROCESSED_FILES_RECORD, "r", encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}


def save_processed_files(file_paths: List[str]) -> None:
    """Add processed files to the record."""
    with open(PROCESSED_FILES_RECORD, "a", encoding="utf-8") as f:
        for fp in file_paths:
            f.write(f"{fp}\n")


class RateLimiter:
    def __init__(self, requests_per_minute: int):
        self.interval = 60.0 / requests_per_minute
        self.last_request_time = 0
        self.lock = asyncio.Lock()

    async def acquire(self):
        async with self.lock:
            current_time = asyncio.get_event_loop().time()
            elapsed = current_time - self.last_request_time
            if elapsed < self.interval:
                await asyncio.sleep(self.interval - elapsed)
            self.last_request_time = asyncio.get_event_loop().time()


async def refactor_code(session: aiohttp.ClientSession, file_path: str, prompt: str, rate_limiter: RateLimiter, sem: asyncio.Semaphore, config: Dict) -> str:
    """Call the API with a given prompt and return the modified code."""
    headers = {
        "Authorization": f"Bearer {config['api_key']}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": config["model"],
        "messages": [{"role": "user", "content": prompt}],
        "temperature": config["temperature"],
        "top_p": config["top_p"],
    }
    for attempt in range(3):
        try:
            await rate_limiter.acquire()
            async with sem:
                print(f"Sending prompt to LLM for {file_path} using {config['api_key'][-4:]}")
                async with session.post(
                    f"{config['base_url']}/chat/completions",
                    json=payload,
                    headers=headers,
                    timeout=aiohttp.ClientTimeout(total=180)
                ) as response:
                    if response.status == 429:
                        error_text = await response.text()
                        error_json = json.loads(error_text)
                        if error_json.get("error", {}).get("type") == "tokens":
                            retry_after = error_json["error"].get("retry_after", config["retry_sleep_seconds"])
                            print(f"Rate limit reached for {file_path}. Retrying after {retry_after} seconds.")
                            await asyncio.sleep(retry_after)
                            continue
                        else:
                            print(f"Rate limit error {response.status} for {file_path}: {error_text}")
                            return ""

                    if response.status == 413:
                        error_text = await response.text()
                        print(f"Request too large for {file_path}: {error_text}")
                        return ""

                    if response.status != 200:
                        error_text = await response.text()
                        print(f"API error {response.status} for {file_path}: {error_text}")
                        continue

                    result = await response.json()
                    if "choices" not in result or not result["choices"]:
                        print(f"Missing 'choices' in API response for {file_path}: {result}")
                        continue

                    code = result["choices"][0]["message"]["content"].strip()
                    file_ext = Path(file_path).suffix[1:]
                    markers = [f"```{file_ext}", "```"]
                    if code.startswith(markers[0]) and code.endswith(markers[1]):
                        code = code[len(markers[0]):-len(markers[1])].strip()
                    return code or ""
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {file_path}: {e}")
            if attempt < 2:
                await asyncio.sleep(config["retry_sleep_seconds"])
    print(f"All attempts failed for {file_path}")
    return ""


async def process_single_file(session: aiohttp.ClientSession, file_path: str, rate_limiter: RateLimiter, sem: asyncio.Semaphore, config: Dict) -> bool:
    """Process a single file using the selected pass prompt."""
    file_ext = Path(file_path).suffix[1:]
    try:
        async with aiofiles.open(file_path, "r", encoding="utf-8") as f:
            original_content = await f.read()
    except Exception as e:
        print(f"Failed to read {file_path}: {e}")
        return False

    estimated_tokens = len(original_content) // 4
    if config["skip_long_files"] and estimated_tokens > config["max_context_tokens"]:
        print(f"Skipping {file_path} ({estimated_tokens} tokens) as it exceeds max_context_tokens ({config['max_context_tokens']})")
        return False

    pass_name = config["selected_pass"]
    if pass_name not in config["prompt_templates"]:
        print(f"Selected pass '{pass_name}' not found in prompt templates for {file_path}")
        return False

    prompt = config["prompt_templates"][pass_name].format(file_type=file_ext, content=original_content)
    modified_content = await refactor_code(session, file_path, prompt, rate_limiter, sem, config)
    if not modified_content:
        print(f"Failed to process {file_path}")
        return False

    if modified_content != original_content:
        try:
            async with aiofiles.open(file_path, "w", encoding="utf-8") as f:
                await f.write(modified_content)
            print(f"Processed: {file_path}")
            return True
        except Exception as e:
            print(f"Failed to write {file_path}: {e}")
            return False
    else:
        print(f"No changes for: {file_path}")
        return False


def should_skip_file(file_path: Path, skip_dirs: List[Path]) -> bool:
    """Check if a file should be skipped based on skip directories."""
    file_path_str = str(file_path.resolve())
    for skip_dir in skip_dirs:
        skip_dir_str = str(skip_dir)
        if file_path_str.startswith(skip_dir_str) and (
            len(file_path_str) == len(skip_dir_str) or file_path_str[len(skip_dir_str)] == os.path.sep
        ):
            return True
    return False


def scan_directories() -> List[str]:
    """Scan source directories and return new file paths to process."""
    processed_files = load_processed_files()
    file_paths = []
    skip_dirs = [Path(d).resolve() for d in BASE_CONFIG["skip_directories"]]

    for directory in BASE_CONFIG["source_directories"]:
        dir_path = Path(directory).resolve()
        if not dir_path.is_dir():
            print(f"Directory not found: {directory}")
            continue
        for ext in BASE_CONFIG["file_extensions"]:
            for file_path in dir_path.glob(f"**/*{ext}"):
                file_path = file_path.resolve()
                if should_skip_file(file_path, skip_dirs) or str(file_path) in processed_files:
                    continue
                file_paths.append(str(file_path))
    print(f"Found {len(file_paths)} new files to process")
    return file_paths


def backup_files(file_paths: List[str]) -> str:
    """Backup files before processing."""
    if not file_paths:
        print("No files to backup")
        return ""
    backup_dir = f"backup_{int(time.time())}"
    os.makedirs(backup_dir, exist_ok=True)
    for fp in file_paths:
        shutil.copy(fp, os.path.join(backup_dir, os.path.basename(fp)))
    print(f"Backed up {len(file_paths)} files to {backup_dir}")
    return backup_dir


async def process_file_batch(session: aiohttp.ClientSession, file_paths: List[str], config: Dict, rate_limiter: RateLimiter, sem: asyncio.Semaphore) -> List[bool]:
    """Process a batch of files with a specific config."""
    tasks = [process_single_file(session, fp, rate_limiter, sem, config) for fp in file_paths]
    return await asyncio.gather(*tasks)


async def main(max_files_to_process: Optional[int] = None):
    if not os.path.exists(PROCESSED_FILES_RECORD):
        open(PROCESSED_FILES_RECORD, "w", encoding="utf-8").close()

    file_paths = scan_directories()
    if max_files_to_process is not None:
        file_paths = file_paths[:max_files_to_process]

    if not file_paths:
        print("No new files to process. Exiting.")
        return

    backup_dir = backup_files(file_paths)

    # Split files between endpoints if using Groq
    if source == "Groq" and len(CONFIGS) == 2:
        mid_point = len(file_paths) // 2
        file_batches = [file_paths[:mid_point], file_paths[mid_point:]]
    else:
        file_batches = [file_paths]

    rate_limiters = [RateLimiter(config["requests_per_minute"]) for config in CONFIGS]
    semaphores = [asyncio.Semaphore(config["concurrent_requests"]) for config in CONFIGS]

    connector = aiohttp.TCPConnector(limit=sum(config["concurrent_requests"] for config in CONFIGS))
    async with aiohttp.ClientSession(connector=connector) as session:
        all_results = []
        for config, files, rate_limiter, sem in zip(CONFIGS, file_batches, rate_limiters, semaphores):
            if files:
                print(f"Processing {len(files)} files with API key ending in {config['api_key'][-4:]}")
                results = await process_file_batch(session, files, config, rate_limiter, sem)
                all_results.extend(results)

        processed_files = [fp for fp, success in zip(file_paths, all_results) if success]
        save_processed_files(processed_files)
        print(f"Added {len(processed_files)} files to processed files record.")

    if backup_dir and os.path.exists(backup_dir):
        shutil.rmtree(backup_dir)
        print(f"Removed backup directory {backup_dir}")

    print(f"Processing complete. {len(processed_files)} files processed.")


In [26]:
await main(max_files_to_process=16)

Found 42 new files to process
Backed up 16 files to backup_1742383631
Processing 8 files with API key ending in Iwk5
Sending prompt to LLM for C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\admin\Evaluations.svelte using Iwk5
Processed: C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\admin\Evaluations.svelte
Sending prompt to LLM for C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\admin\Functions.svelte using Iwk5
Rate limit reached for C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\admin\Functions.svelte. Retrying after 30 seconds.
Sending prompt to LLM for C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\admin\Evaluations\FeedbackMenu.svelte using Iwk5
Processed: C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\admin\Evaluations\FeedbackMenu.svelte
Sending prompt to LLM for C:\Users\harold.noble\Desktop\RIC\app\frontend\src\lib\components\admin\Settings.svelte using Iwk5


CancelledError: 